# Libraries

In [None]:
!pip install Sastrawi
!pip install clean-text
!pip install deep-translator



In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from cleantext import clean
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import sklearn

In [None]:
# !pip freeze > requirements.txt

In [None]:
# Input: CSV fetched from the raw GitHub URL below
text_url = 'https://raw.githubusercontent.com/TaillessTanuki/Berita_Banggai/main/input_text.csv'

# Read the CSV file, decoding it as UTF-8
input_df = pd.read_csv(text_url, encoding='utf-8')

# Convert all columns to string
# NOTE(review): astype(str) presumably also turns missing values (NaN) into
# the literal text 'nan', so later blank-row checks must allow for that.
input_df = input_df.astype(str)

In [None]:
input_df.head()

# Preprocessing

In [None]:
# Delete Duplicates
# Count the duplicated titles first (kept in a variable for inspection), then
# keep only the first row of every title.
duplicate_title_count = input_df.duplicated(subset='title').sum()
input_df = input_df.drop_duplicates(subset='title')

# Delete Blank text (NaN) Rows
# Every column was cast to str when the data was loaded, so a missing text is
# the literal string 'nan' rather than ''; comparing against '' alone never
# drops it. Treat empty, whitespace-only and 'nan' texts as blank.
stripped_text = input_df['text'].astype(str).str.strip()
input_df = input_df[~stripped_text.str.lower().isin(['', 'nan'])]

# Case Folding
def clean_lower(lwr):
    """Case folding: return ``lwr`` converted to lowercase."""
    return lwr.lower()

# Store the lowercased 'text' column as 'lwr'.
input_df['lwr'] = input_df['text'].map(clean_lower)
input_df['lwr']


# Remove Numbers
def remove_numbers(text):
    """Return ``text`` with every run of the digits 0-9 deleted."""
    return re.sub('[0-9]+', '', text)

# Strip digits from the lowercased text and keep the result as 'clean_number'.
input_df['clean_number'] = input_df['lwr'].map(remove_numbers)
input_df['clean_number']


# Remove Punctuation
# Characters that are deleted outright (no space is left in their place).
# Raw string: the pattern contains regex escapes (\[ \] \|) that are invalid
# escape sequences inside a normal string literal.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit or a lowercase ASCII letter.
clean_symbol = re.compile(r'[^0-9a-z]')


def clean_punct(text):
    """Remove punctuation and symbols from ``text``.

    The characters ``/ ( ) { } [ ] | @ , ;`` are deleted; after that, every
    remaining character outside ``0-9`` and ``a-z`` (including whitespace and
    uppercase letters) is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = clean_spcl.sub('', text)
    return clean_symbol.sub(' ', text)

# Remove punctuation from the digit-free text and keep it as 'clean_punct'.
input_df['clean_punct'] = input_df['clean_number'].map(clean_punct)
input_df['clean_punct']


# Delete Whitespaces
def normalize_whitespace(text):
    """Collapse repeated whitespace characters in ``text``.

    The input is converted with ``str()``; runs of the same character
    (space, newline, carriage return, tab) are reduced to one occurrence, and
    leading/trailing spaces (spaces only) are stripped.
    """
    # NOTE(review): this replaces the literal three characters "//t" with a
    # tab; possibly a backslash form was intended -- confirm.
    result = re.sub(r"//t", r"\t", str(text))
    # Same order as before: spaces, newlines, carriage returns, tabs.
    for repeated in (r"( )\1+", r"(\n)\1+", r"(\r)\1+", r"(\t)\1+"):
        result = re.sub(repeated, r"\1", result)
    return result.strip(" ")

# Collapse repeated whitespace and keep the result as 'clean_double_ws'.
input_df['clean_double_ws'] = input_df['clean_punct'].map(normalize_whitespace)
input_df['clean_double_ws']


In [None]:
len(input_df)

In [None]:
input_df.head()

In [None]:
# Adding Stopwords
# Two extra stopword lists, read as semicolon-separated, cp1252-encoded CSV.
url1 = 'https://raw.githubusercontent.com/TaillessTanuki/Berita_Banggai/main/ID-stopwords_banggai_addedwords.csv'
df_stopword1 = pd.read_csv(url1, delimiter=";", encoding='cp1252')

url2 = 'https://raw.githubusercontent.com/TaillessTanuki/Berita_Banggai/main/ID-Stopwords(MasDevid).csv'
df_stopword2 = pd.read_csv(url2, delimiter=";", encoding='cp1252')

# Drop empty cells and force str so every entry is a usable word (a NaN float
# in the list cannot be handled as a stopword).
custom_stopword1 = df_stopword1['stopword'].dropna().astype(str).tolist()
custom_stopword2 = df_stopword2['Stopwords'].dropna().astype(str).tolist()

factory = StopWordRemoverFactory()
# NOTE: this name shadows the `stopwords` corpus imported from nltk at the top
# of the notebook; it is kept because the 2nd stopword removal reads it.
stopwords = factory.get_stop_words() + custom_stopword1 + custom_stopword2

# Remover backed by the combined list (Sastrawi defaults + custom words).
# `filtering` below needs this object; it was previously never created, so
# calling `filtering` raised NameError.
stopword = StopWordRemover(ArrayDictionary(stopwords))


# 1st Stopwords Removal
def filtering(text):
    """Return ``text`` with the words of the combined stopword list removed.

    Args:
        text: The whitespace-normalised string to filter.

    Returns:
        The string produced by the Sastrawi ``StopWordRemover``.
    """
    return stopword.remove(text)


input_df['filtered'] = input_df['clean_double_ws'].apply(filtering)
input_df['filtered']


# Stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create one Sastrawi stemmer and reuse it for every row.
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()


def stem(text):
    """Return ``text`` after passing it through the shared Sastrawi stemmer."""
    stemmed_text = stemmer.stem(text)
    return stemmed_text


# Stem the stopword-filtered text and keep the result as 'stemmed'.
input_df['stemmed'] = input_df['filtered'].map(stem)
input_df['stemmed']


# Tokenizing
def tokenize(text):
    """Split ``text`` into a list of tokens with nltk's ``word_tokenize``."""
    return nltk.tokenize.word_tokenize(text)


# Tokenize the stemmed text and keep the token lists as 'tokenized'.
input_df['tokenized'] = input_df['stemmed'].map(tokenize)
input_df['tokenized']


# 2nd Stopwords Removal
# Build the lookup set once: membership tests on a set are O(1), whereas the
# combined stopword list had to be scanned for every single token.
stopword_set = set(stopwords)


def remove_stopwords(tokens):
    """Return the tokens that are not in the combined stopword list.

    Args:
        tokens: An iterable of word strings (one tokenized document).

    Returns:
        A new list with the stopwords left out; token order is preserved.
    """
    return [word for word in tokens if word not in stopword_set]


# Apply the remove_stopwords function to each row in the 'tokenized' column
input_df['filtered2'] = input_df['tokenized'].apply(remove_stopwords)

In [None]:
input_df.head()

In [None]:
#input_df.drop(columns=['predictions'], inplace=True)

In [None]:
# Rebuild one space-separated string per row from the 'tokenized' lists.
# NOTE(review): this reads 'tokenized', not 'filtered2', so the 2nd stopword
# removal is not reflected in 'text2' -- confirm that this is intended.
input_df['text2'] = input_df['tokenized'].map(' '.join)
