### Import all library


In [1]:
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary
from sklearn.feature_extraction.text import TfidfVectorizer


#### KONSTANTA


In [4]:
# default stop word --> stopword bawaan dari sastrawi
DEFAULT_STOPWORD = StopWordRemoverFactory().get_stop_words()
# add more stopword here
MORE_STOPWORD = ['pada', 'yaitu' ]


def gen_stopWord():
  wordlist = []
  with open("./kamus/id-stopwords.txt", 'r') as f:
    stop = f.read().split("\n")

  for x in stop:
    if (x not in DEFAULT_STOPWORD):
      wordlist.append(x)
  newStopWord = DEFAULT_STOPWORD + MORE_STOPWORD + wordlist
  return newStopWord


# KONSTANTA ---> untuk memberi tahu bahwa variable dibawah jangan diubah
ALAY = pd.read_csv('./kamus/alay.csv')
ABUSSIVE = pd.read_csv('./kamus/abusive.csv')
NORMAL_WORDS = pd.read_csv('./kamus/id-normalwords.csv')

STOP_WORD = ArrayDictionary(gen_stopWord())
STEM_FACT = StemmerFactory().create_stemmer()
STOP_FACT = StopWordRemoverFactory().create_stop_word_remover()


#### FUNCTION


In [5]:
def remEmoji(text):
  # Smile
  text = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positiveemoji ', text)
  # Laugh
  text = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' positiveemoji ', text)
  # Love
  text = re.sub(r'(<3|:\*)', ' positiveemoji ', text)
  # Wink
  text = re.sub(r'(;-?\)|;-?D|\(-?;|@-\))', ' positiveemoji ', text)
  # Sad
  text = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negetiveemoji ', text)
  # Cry
  text = re.sub(r'(:,\(|:\'\(|:"\()', ' negetiveemoji ', text)
  return text

# preNormalize --- mengubah kata alay menjadi kata biasa. ex: "akoh" -->"aku"
def preNormalizer(text):
  new_text = ''
  for word in text.split():
    if (ALAY['alay'] == word).any():
      text = ALAY[ALAY['alay'] == word]['arti'].values[0]
      new_text = ' '.join([new_text, text])
    else:
      new_text = ' '.join([new_text, word])
  new_text = new_text.lower().strip()
  return new_text

# casefolding --- lowercase, remove whitwspace (spasi di awal/akhir kalimat), remove punctuation (karakter huruf) , emoji, dll
def CASEFOLDER(text):
  rem_emoji = remEmoji(text)
  rem_URL = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', rem_emoji)
  rem_number = re.sub('\d+', '', rem_URL)
  rem_punctuation = re.sub(r'[^\w\s]', '', rem_number)
  rem_mid_space = re.sub(' +', ' ', rem_punctuation)
  rem_whitespace = rem_mid_space.strip()
  # reduce character - from "kaaaaaamu" --> "kaamu"
  reduce_char = re.sub(r'(.)\1+', r'\1\1', rem_whitespace)
  lower = reduce_char.lower()
  return lower


# nomalisasi kata --- mengubah kata ke bentuk normal (kata umum yang dikenali secara luas) ex: "wis"--> "sudah"
def NORMALIZER(text):
  new_text = ''
  for word in text.split():
    if (NORMAL_WORDS['singkat'] == word).any():
      text = NORMAL_WORDS[NORMAL_WORDS['singkat'] == word]['hasil'].values[0]
      new_text = ' '.join([new_text, text])
    else:
      new_text = ' '.join([new_text, word])
  new_text = new_text.lower().strip()
  return new_text

# remove stopword (filtering) --- menghapus kata-kita minim makna. ex--> "adalah", "yaitu"
def STOPWORDREMOVER(text):
  return STOP_FACT.remove(text)

# stemming (sastrawi) --- mengubah text ke bentuk dasarnya. ex "membuka" --> "buka"
def STEMMER(text):
  return STEM_FACT.stem(text)


#### PRE-PROCESS PIPELINE

##### Tahapan pre-process secara umum meliputi:
1. Casefolding
2. Word Normalization
3. StopWord Removal (Filtering)
4. Stemming

note: tahapan dapat dikurangi atau ditambah sesuai kebutuhan

In [6]:
# PRE-PROCESSING --> disarankan urutanya seperti dibawah.
def PREPROCESSING(text):
  text = preNormalizer(text)          # step tambahan
  text = CASEFOLDER(text)
  text = NORMALIZER(text)
  text = STOPWORDREMOVER(text)
  text = STEMMER(text)
  return text

#### FITUR ENGINERING

In [7]:
dataframe = pd.read_csv("./dataset/percumalaporpolisi.csv")
data = dataframe[['Username','Text']].copy()
data['CleanText'] = data['Text'].apply(PREPROCESSING)
data

Unnamed: 0,Username,Text,CleanText
0,HerinDika,#percumalaporpolisi,percumalaporpolisi
1,ruthlessfire13,Astaghfirullah. #PercumaAdaPolisi #PercumaLapo...,astaghfirullah percumaadapolisi percumalaporpo...
2,cinnamonroll719,@Nabdsftr_ @yourrainbow97 @DivHumas_Polri @Lis...,nabdsftr yourrainbow divhumas polri listyosigi...
3,bowiedj,"Si pelaku nantang laporin aja polisi, … dan 4 ...",si laku tantang lapor polisi lapor tindak tuh ...
4,sallon68,#daftarpencarianorang \n#uutpks #herrywirawan ...,daftarpencarianorang uutpks herrywirawan julia...
...,...,...,...
1996,dzibrhm,#percumalaporpolisi https://t.co/69S4y1DteF,percumalaporpolisi
1997,Berita_Orbit,Film waralaba ‘Insidious’ telah lama tidak akt...,film waralaba insidious aktif dekade film lima...
1998,_____kingAJ,Lgi ngebayangin ada teroris transaksi bubuk Me...,bayang teroris transaksi bubuk mesiu gramsetel...
1999,SNraeniii,Waspada! Modus mafia tanah sudah pakai sistem ...,waspada modus mafia tanah pakai sistem canggih...


In [8]:
X = data['CleanText']

vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1)).fit(X)
x_TF_IDF = vec_TF_IDF.transform(X)

# menampilkan vocabulary dari TF-IDF
# print('{} Vocabulary TF-IDF {}'.format('='*20, '='*20))
# print(vec_TF_IDF.vocabulary_)

# menampilkan jumlah fitur dari TF-IDF
print('{} Jumlah fiture TF-IDF {}'.format('='*20, '='*20))
print (len(vec_TF_IDF.get_feature_names()))

#Melihat fitur-fitur apa saja yang ada di dalam corpus kita
print(vec_TF_IDF.get_feature_names())

5156
['aa', 'aahhsudahlah', 'aahyarum', 'aamiin', 'abadi', 'abah', 'abai', 'abar', 'abcdyougoblog', 'abdurachman', 'abg', 'abigailimuriaa', 'abolishthepolice', 'about', 'abri', 'abu', 'abuse', 'acab', 'academia', 'acar', 'acara', 'according', 'actionnya', 'activism', 'acu', 'acuh', 'ad', 'ada', 'adab', 'adakan', 'adam', 'adavisum', 'adhitamabader', 'adi', 'adik', 'adil', 'adilisambo', 'adilisamboamppc', 'adindaasmara', 'adityaa', 'administrasi', 'administration', 'adminnya', 'adu', 'aduankonten', 'aducupangs', 'aduh', 'advokat', 'adwibasuki', 'aepenilaian', 'afdhalsyah', 'affriliankp', 'afinta', 'against', 'agama', 'agamalgsg', 'agen', 'agenda', 'agenslot', 'agever', 'agis', 'agr', 'agus', 'agusmagelangan', 'agustus', 'ah', 'ahh', 'ahlanwasahlanibhrs', 'ahli', 'ahmad', 'ahsanridha', 'ahy', 'aimanwitjaksono', 'ainurohman', 'air', 'airpewangi', 'aja', 'ajaa', 'ajab', 'ajajdi', 'ajak', 'ajar', 'ajessies', 'aji', 'ajiesra', 'ajjaa', 'akal', 'akan', 'akar', 'akarakarnya', 'akbp', 'akh', 'ak



In [None]:
#Lihat data tabular yang menggunakan metode TF-IDF
#Data ini siap untuk dimasukkan dalam proses machine learning

x1 = vec_TF_IDF.transform(X).toarray()
data_tabular_tf_idf= pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aahhsudahlah,aahyarum,aamiin,abadi,abah,abai,abar,abcdyougoblog,abdurachman,...,zerodemocrazy,zina,zipper,zivamunafik,zoelfick,zonaba,zonajajan,zonauang,zrn,zulkiflilubis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
