### Import all library


In [2]:
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary
from sklearn.feature_extraction.text import TfidfVectorizer


#### KONSTANTA


In [3]:
# default stop word --> stopword bawaan dari sastrawi
DEFAULT_STOPWORD = StopWordRemoverFactory().get_stop_words()
# add more stopword here
MORE_STOPWORD = ['pada', 'yaitu', 'yg' ]


def gen_stopWord():
  wordlist = []
  with open("./kamus/id-stopwords.txt", 'r') as f:
    stop = f.read().split("\n")

  for x in stop:
    if (x not in DEFAULT_STOPWORD):
      wordlist.append(x)
  newStopWord = DEFAULT_STOPWORD + MORE_STOPWORD + wordlist
  return newStopWord


# KONSTANTA ---> untuk memberi tahu bahwa variable dibawah jangan diubah
ALAY = pd.read_csv('./kamus/alay.csv')
ABUSSIVE = pd.read_csv('./kamus/abusive.csv')
NORMAL_WORDS = pd.read_csv('./kamus/id-normalwords.csv')

STOP_WORD = ArrayDictionary(gen_stopWord())
STEM_FACT = StemmerFactory().create_stemmer()
STOP_FACT = StopWordRemoverFactory().create_stop_word_remover()


#### FUNCTION


In [4]:
def remEmoji(text):
  # Smile
  text = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positiveemoji ', text)
  # Laugh
  text = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' positiveemoji ', text)
  # Love
  text = re.sub(r'(<3|:\*)', ' positiveemoji ', text)
  # Wink
  text = re.sub(r'(;-?\)|;-?D|\(-?;|@-\))', ' positiveemoji ', text)
  # Sad
  text = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negetiveemoji ', text)
  # Cry
  text = re.sub(r'(:,\(|:\'\(|:"\()', ' negetiveemoji ', text)
  return text

# preNormalize --- mengubah kata alay menjadi kata biasa. ex: "akoh" -->"aku"
def preNormalizer(text):
  new_text = ''
  for word in text.split():
    if (ALAY['alay'] == word).any():
      text = ALAY[ALAY['alay'] == word]['arti'].values[0]
      new_text = ' '.join([new_text, text])
    else:
      new_text = ' '.join([new_text, word])
  new_text = new_text.lower().strip()
  return new_text

# casefolding --- lowercase, remove whitwspace (spasi di awal/akhir kalimat), remove punctuation (karakter huruf) , emoji, dll
def CASEFOLDER(text):
  rem_emoji = remEmoji(text)
  rem_userid=re.sub('@[\w]+','',rem_emoji).strip()
  rem_hashtag = re.sub('#[\w]+','',rem_userid).strip()
  rem_URL = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', rem_hashtag)
  rem_number = re.sub('\d+', '', rem_URL)
  rem_punctuation = re.sub(r'[^\w\s]', '', rem_number)
  rem_mid_space = re.sub(' +', ' ', rem_punctuation)
  rem_whitespace = rem_mid_space.strip()
  # reduce character - from "kaaaaaamu" --> "kaamu"
  reduce_char = re.sub(r'(.)\1+', r'\1\1', rem_whitespace)
  lower = reduce_char.lower()
  return lower


# nomalisasi kata --- mengubah kata ke bentuk normal (kata umum yang dikenali secara luas) ex: "wis"--> "sudah"
def NORMALIZER(text):
  new_text = ''
  for word in text.split():
    if (NORMAL_WORDS['singkat'] == word).any():
      text = NORMAL_WORDS[NORMAL_WORDS['singkat'] == word]['hasil'].values[0]
      new_text = ' '.join([new_text, text])
    else:
      new_text = ' '.join([new_text, word])
  new_text = new_text.lower().strip()
  return new_text

# remove stopword (filtering) --- menghapus kata-kita minim makna. ex--> "adalah", "yaitu"
def STOPWORDREMOVER(text):
  return STOP_FACT.remove(text)

# stemming (sastrawi) --- mengubah text ke bentuk dasarnya. ex "membuka" --> "buka"
def STEMMER(text):
  return STEM_FACT.stem(text)


#### PRE-PROCESS PIPELINE

##### Tahapan pre-process secara umum meliputi:
1. Casefolding
2. Word Normalization
3. StopWord Removal (Filtering)
4. Stemming

note: tahapan dapat dikurangi atau ditambah sesuai kebutuhan

In [5]:
# PRE-PROCESSING --> disarankan urutanya seperti dibawah.
def PREPROCESSING(text):
  text = preNormalizer(text)          # step tambahan
  text = CASEFOLDER(text)
  text = NORMALIZER(text)
  text = STOPWORDREMOVER(text)
  text = STEMMER(text)
  return text

#### FITUR ENGINERING

In [6]:
dataframe = pd.read_csv("./dataset/percumaLaporPolisi.csv")
data = dataframe[['Username','Text']].copy()
data['CleanText'] = data['Text'].apply(PREPROCESSING)
data

Unnamed: 0,Username,Text,CleanText
0,Lesmono8i,@anggitob29 @HumasPolri @ListyoSigitP @mohmahf...,lapor tindaklanjut
1,findnuralisah,"Bssn Brin Kominfo , gunanya apaan itu? Buang b...",bssn brin komunikasi informatika guna buang bu...
2,yujiniece,#percumalaporpolisi anjing bet ternyata,anjing banget
3,findnuralisah,"Kerja belom, udah dimintain duit, kocak, tar k...",bom duit kocak viral bilang oknum viral bilang...
4,findnuralisah,"In great power, comes great responsibility, bu...",in great power great responsibility but not polri
...,...,...,...
1996,arsipsumut,"Autopsi Ulang Direstui Keluarga Brigadir J, Jo...",autopsi ulang restu keluarga brigadir jokowi j...
1997,frontconsole,#percumalaporpolisi https://t.co/7x137qcR7P,
1998,Ivan_Pahlev1,Betooll #percumalaporpolisi,betooll
1999,mia_natasia,Kalo gak punya DUIT BANYAK dan gak ada JABATAN...,duit jabat sih


In [7]:
X = data['CleanText']

vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1)).fit(X)
x_TF_IDF = vec_TF_IDF.transform(X)

# menampilkan vocabulary dari TF-IDF
# print('{} Vocabulary TF-IDF {}'.format('='*20, '='*20))
# print(vec_TF_IDF.vocabulary_)

# menampilkan jumlah fitur dari TF-IDF
print('{} Jumlah fiture TF-IDF {}'.format('='*20, '='*20))
print (len(vec_TF_IDF.get_feature_names()))

#Melihat fitur-fitur apa saja yang ada di dalam corpus kita
print(vec_TF_IDF.get_feature_names())

3341
['aahhsudahlah', 'aamiin', 'abadi', 'abah', 'abai', 'abar', 'abdurachman', 'abg', 'about', 'abri', 'abu', 'abuse', 'acab', 'academia', 'acar', 'acara', 'according', 'actionnya', 'activism', 'acu', 'acuh', 'ada', 'adab', 'adad', 'adakan', 'adam', 'adavisum', 'adik', 'adil', 'administration', 'adminnya', 'adu', 'aduh', 'advokat', 'aepenilaian', 'afinta', 'against', 'agama', 'agamalgsg', 'agen', 'agenda', 'agr', 'ah', 'ahh', 'ahli', 'ahmad', 'ahsanul', 'air', 'aja', 'ajaa', 'ajab', 'ajajdi', 'ajak', 'ajar', 'aji', 'ajjaa', 'akal', 'akan', 'akar', 'akarakarnya', 'akb', 'akh', 'akherat', 'akhirx', 'akhlak', 'akhlaq', 'akhsebentar', 'akibat', 'akp', 'aksi', 'aktif', 'aktivitas', 'aktor', 'aku', 'akuin', 'akun', 'ala', 'alam', 'alamat', 'alami', 'alamsyah', 'alas', 'alat', 'alesannya', 'alfa', 'alfamart', 'alffy', 'alfin', 'alhamdulillah', 'alhasil', 'alias', 'alibi', 'alih', 'all', 'allah', 'alm', 'almarhum', 'almjosua', 'alokasi', 'along', 'alshad', 'alter', 'alternatif', 'alumni', 'am



In [8]:
#Lihat data tabular yang menggunakan metode TF-IDF
#Data ini siap untuk dimasukkan dalam proses machine learning

x1 = vec_TF_IDF.transform(X).toarray()
data_tabular_tf_idf= pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aahhsudahlah,aamiin,abadi,abah,abai,abar,abdurachman,abg,about,abri,...,yoyakarta,ypi,yra,yrbiasa,yuho,yuk,yup,yusuf,zaman,zina
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
