### 1. Import Dependencies

In [93]:
import string
import re
import nltk
import pandas as pd
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Preprocessing

In [139]:
import pandas as pd

df = pd.read_csv('data/twitter.csv')

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       300 non-null    object
 1   sentiment  300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.8+ KB


In [140]:
import random

# Shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

# Create a new dataframe with a random sample of 100 rows from each class
data = pd.concat([df[df['sentiment']==0].sample(n=100), 
                  df[df['sentiment']==1].sample(n=100), 
                  df[df['sentiment']==2].sample(n=100)])

# Reset the index of the new dataframe
data = data.reset_index(drop=True)

# Display the value counts of the sentiment column in the new dataframe
display(data['sentiment'].value_counts())
data.head() # Show dataframe

sentiment
0    100
1    100
2    100
Name: count, dtype: int64

Unnamed: 0,text,sentiment
0,twitter sepi anjir. ini pada ptm semua kali ya,0
1,kok orang-orang seneng PTM 100%,0
2,skolahku 100% ptm gk ya,0
3,kalian hariini ada yang uda ptm 100%? ada wakt...,0
4,ngerant selama ptm keliatannya asik deh gue ga...,0


In [141]:
data['lower_text']=data['text'].str.lower()  # Convert to lowercase
data['remove_url']=data['lower_text'].apply(lambda x: re.sub(r"http\S+", "", x))    # remove url
data['remove_num']=data['remove_url'].apply(lambda x: re.sub(r'\d+', '', x))    # Remove number
data['punctuation']=data['remove_num'].apply(lambda x: re.sub(r'[^\w\s]', '', x))    # Remove punctuation
data['tokenized_text']=data['punctuation'].apply(nltk.word_tokenize) # Tokenize the text

# Get the Indonesian stopwords
indonesian_stopwords = set(nltk.corpus.stopwords.words('indonesian'))

# Remove the stopwords from the tokenized texts
data['stopwords']=data['tokenized_text'].apply(lambda x: [w for w in x if not w in indonesian_stopwords])

# Initialize the Porter stemmer
stemmer = PorterStemmer()

# Stem the tokenized texts in the 'stopwords' column of the dataframe
data['stemmed']=data['stopwords'].apply(lambda x: [stemmer.stem(w) for w in x])

data['normalized']=data['stemmed'].apply(lambda x: ' '.join(x)) # Join the stemmed words into a single string

In [142]:
data.head()

Unnamed: 0,text,sentiment,lower_text,remove_url,remove_num,punctuation,tokenized_text,stopwords,stemmed,normalized
0,twitter sepi anjir. ini pada ptm semua kali ya,0,twitter sepi anjir. ini pada ptm semua kali ya,twitter sepi anjir. ini pada ptm semua kali ya,twitter sepi anjir. ini pada ptm semua kali ya,twitter sepi anjir ini pada ptm semua kali ya,"[twitter, sepi, anjir, ini, pada, ptm, semua, ...","[twitter, sepi, anjir, ptm, kali, ya]","[twitter, sepi, anjir, ptm, kali, ya]",twitter sepi anjir ptm kali ya
1,kok orang-orang seneng PTM 100%,0,kok orang-orang seneng ptm 100%,kok orang-orang seneng ptm 100%,kok orang-orang seneng ptm %,kok orangorang seneng ptm,"[kok, orangorang, seneng, ptm]","[orangorang, seneng, ptm]","[orangorang, seneng, ptm]",orangorang seneng ptm
2,skolahku 100% ptm gk ya,0,skolahku 100% ptm gk ya,skolahku 100% ptm gk ya,skolahku % ptm gk ya,skolahku ptm gk ya,"[skolahku, ptm, gk, ya]","[skolahku, ptm, gk, ya]","[skolahku, ptm, gk, ya]",skolahku ptm gk ya
3,kalian hariini ada yang uda ptm 100%? ada wakt...,0,kalian hariini ada yang uda ptm 100%? ada wakt...,kalian hariini ada yang uda ptm 100%? ada wakt...,kalian hariini ada yang uda ptm %? ada waktu i...,kalian hariini ada yang uda ptm ada waktu ist...,"[kalian, hariini, ada, yang, uda, ptm, ada, wa...","[hariini, uda, ptm, istirahat, gaa, ptm, sampe...","[hariini, uda, ptm, istirahat, gaa, ptm, samp,...",hariini uda ptm istirahat gaa ptm samp jam ber...
4,ngerant selama ptm keliatannya asik deh gue ga...,0,ngerant selama ptm keliatannya asik deh gue ga...,ngerant selama ptm keliatannya asik deh gue ga...,ngerant selama ptm keliatannya asik deh gue ga...,ngerant selama ptm keliatannya asik deh gue ga...,"[ngerant, selama, ptm, keliatannya, asik, deh,...","[ngerant, ptm, keliatannya, asik, deh, gue, ga...","[ngerant, ptm, keliatannya, asik, deh, gue, ga...",ngerant ptm keliatannya asik deh gue gabisaa n...


In [143]:
data['freq_token']=data['tokenized_text'].apply(nltk.FreqDist).apply(lambda x: dict(x)) # Frequency word token

data['freq_token']


0      {'twitter': 1, 'sepi': 1, 'anjir': 1, 'ini': 1...
1      {'kok': 1, 'orangorang': 1, 'seneng': 1, 'ptm'...
2            {'skolahku': 1, 'ptm': 1, 'gk': 1, 'ya': 1}
3      {'kalian': 1, 'hariini': 1, 'ada': 2, 'yang': ...
4      {'ngerant': 2, 'selama': 1, 'ptm': 1, 'keliata...
                             ...                        
295      {'gabung': 1, 'banget': 1, 'ptm': 1, 'gini': 1}
296    {'ptm': 1, 'mental': 1, 'gw': 1, 'gk': 1, 'sia...
297    {'abis': 1, 'ngomongin': 1, 'ptm': 2, 'sama': ...
298                       {'ptm': 1, 'ga': 1, 'asik': 1}
299    {'mampus': 1, 'ptm': 1, 'hari': 1, 'siang': 2,...
Name: freq_token, Length: 300, dtype: object

### 3. TF-IDF Vectorizer

In [144]:
from sklearn.model_selection import train_test_split

# Split data random 80%
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

print("Train: ",X_train.shape,y_train.shape,"Test: ",(X_test.shape,y_test.shape))

Train:  (240,) (240,) Test:  ((60,), (60,))


In [147]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create Tfidf vectorizer
vectorizer = TfidfVectorizer()

# Convert the 'review' column into its TF-IDF vectorized form
X = vectorizer.fit_transform(data['normalized'])

# Get the feature names (word tokens)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names: ",feature_names[:10]) # Print only first 5 words

Feature Names:  ['aaa' 'aaaa' 'aaakkk' 'aamiin' 'abi' 'abieezzz' 'absen' 'adakan'
 'adaptasi' 'adek']


### 4. Train and Evaluate Data

In [148]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [149]:
# Convert training and test datasets into vectorized form
X_train_vectorized =  vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Multinomial Naive Bayes model
model = MultinomialNB(alpha=1)

# Train the model
model.fit(X_train_vectorized, y_train)

# Predict sentiment for test dataset
y_pred = model.predict(X_test_vectorized)

report=classification_report(y_test, y_pred,output_dict=True)
print(pd.DataFrame(report).transpose())

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred, normalize=True)
print(f'\nAccuracy of the model: {accuracy*100:.2f}%')

              precision    recall  f1-score    support
0              0.428571  0.600000  0.500000  20.000000
1              0.722222  0.650000  0.684211  20.000000
2              0.642857  0.450000  0.529412  20.000000
accuracy       0.566667  0.566667  0.566667   0.566667
macro avg      0.597884  0.566667  0.571207  60.000000
weighted avg   0.597884  0.566667  0.571207  60.000000

Accuracy of the model: 56.67%
