# Menyiapkan Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train_preprocess.tsv.txt', delimiter='\t')

In [3]:
df.columns = ['text','label']

In [4]:
df.head()

Unnamed: 0,text,label
0,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
1,lokasi strategis di jalan sumatera bandung . t...,positive
2,betapa bahagia nya diri ini saat unboxing pake...,positive
3,duh . jadi mahasiswa jangan sombong dong . kas...,negative
4,"makanan beragam , harga makanan di food stall ...",positive


In [5]:
df.shape

(10999, 2)

In [6]:
df.isna().sum()

text     0
label    0
dtype: int64

In [7]:
df.duplicated().sum()

67

In [8]:
df = df.drop_duplicates()

# Data Preprocessing

In [9]:
df_stopword = pd.read_csv('stopword.csv', encoding = 'latin1')
df_alay = pd.read_csv('kamus_alay1.csv',  encoding = 'latin1')

In [10]:
df_stopword.head()

Unnamed: 0,stopword
0,ada
1,adalah
2,adanya
3,adapun
4,agak


In [11]:
df_alay.head()

Unnamed: 0,Kata_lama,Kata_baru
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [12]:
import re

In [13]:
#cleansing
def preprocess_text(text):
    # lower text
    text = text.lower()

    # Hapus emotikon dan karakter khusus
    text = re.sub(r'[^\w\d\s]', '', text)

    # mengganti spasi yang berlebihan
    text = re.sub(r'\s+', ' ', text)

    # Menghapus kata dan huruf yang bergabung
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Mengganti kata yang berulang
    text = re.sub(r'\b(\w+)\1\b', r'\1', text)
    return text


#removing stopwords
stopwords = df_stopword['stopword'].tolist()
def remove_stopwords(text):
    list_stopwords = text.split()
    return ' '.join([text for text in list_stopwords if text not in stopwords])

#normalization
kamus_alay = dict(zip(df_alay['Kata_lama'], df_alay['Kata_baru']))
def normalize(text):
    for word in kamus_alay:
        return ' '.join([kamus_alay[word] if word in kamus_alay else word for word in text.split(' ')])


In [14]:
def cleansing_text(text):
    text = preprocess_text(text)
    text = remove_stopwords(text)
    text = normalize(text)
    
    return text

In [15]:
df['text_clean'] = df['text'].apply(cleansing_text)

In [19]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [20]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stem_text(tokens):
    return stemmer.stem(tokens)

df['text_clean'] = df['text_clean'].apply(stem_text)
df.to_csv('data_clean.csv',index = False)

In [16]:
import pandas as pd

df = pd.read_csv('data_clean.csv')
df

Unnamed: 0,text,label,text_clean
0,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus beri hujjah partai wilah sua...
1,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung nyaman ...
2,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia unboxing paket barang bagus tet...
3,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh mahasiswa sombong kasih kartu kuning ajar...
4,"makanan beragam , harga makanan di food stall ...",positive,makan agam harga makan food stall kasir suasan...
...,...,...,...
10927,f - demokrat dorong upaya kemandirian energi n...,neutral,f demokrat dorong upaya mandiri energi nasional
10928,tidak bosan,positive,bosan
10929,enak rasa masakan nya apalagi kepiting yang me...,positive,enak masakan kepiting senang pilih kepiting se...
10930,"pagi pagi di tol pasteur sudah macet parah , b...",negative,pagi pagi tol pasteur macet parah jengkel


In [70]:
neg = df.loc[df['label'] == 'negative'].text.tolist()
neu = df.loc[df['label'] == 'neutral'].text.tolist()
pos = df.loc[df['label'] == 'positive'].text.tolist()

neg_label = df.loc[df['label'] == 'negative'].label.tolist()
neu_label = df.loc[df['label'] == 'neutral'].label.tolist()
pos_label = df.loc[df['label'] == 'positive'].label.tolist()

total_data = pos + neu + neg
labels = pos_label + neu_label + neg_label

print('Pos: %s, Neu: %s, Neg: %s' % (len(pos), len(neu), len(neg)))
print('Total data: %s' % len(total_data))

Pos: 6377, Neu: 1138, Neg: 3405
Total data: 10920


# Feature Extraction

In [71]:
cleaned_data = total_data

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
count_vect.fit(cleaned_data)

X = count_vect.transform(cleaned_data)
print ("Feature Extraction selesai")

Feature Extraction selesai


In [73]:
X.shape

(10920, 17238)

In [74]:
import pickle

pickle.dump(count_vect, open("feature.p", "wb"))

# Split Data

In [75]:
from sklearn.model_selection import train_test_split

classes = labels

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size = 0.2)

# Training

In [77]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
model.fit(X_train, y_train)

print ("Training selesai")

Training selesai


In [78]:
pickle.dump(model, open("model.p", "wb"))

In [79]:
from sklearn.metrics import classification_report

test = model.predict(X_test)

print ("Testing selesai")
print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

    negative       0.77      0.79      0.78       683
     neutral       0.76      0.65      0.70       246
    positive       0.88      0.89      0.88      1255

    accuracy                           0.83      2184
   macro avg       0.80      0.78      0.79      2184
weighted avg       0.83      0.83      0.83      2184



# Predict

In [92]:
original_text =  '''
aku pintar
'''

text = count_vect.transform([cleansing_text(original_text)])

result = model.predict(text)[0]
print("Sentiment:")
print()
print(result)

Sentiment:

positive


In [91]:
original_text =  '''
kamu ga bodoh
'''

text = count_vect.transform([cleansing_text(original_text)])

result = model.predict(text)[0]
print("Sentiment:")
print()
print(result)

Sentiment:

negative
