### 1. Import Dependencies

In [93]:
import string
import re
import nltk
import pandas as pd
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Preprocessing

In [109]:
import pandas as pd

df = pd.read_csv('data/twitter.csv')

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       300 non-null    object
 1   sentiment  300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.8+ KB


In [112]:
import random

# Shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

# Create a new dataframe with a random sample of 100 rows from each class
data = pd.concat([df[df['sentiment']==0].sample(n=100), 
                  df[df['sentiment']==1].sample(n=100), 
                  df[df['sentiment']==2].sample(n=100)])

# Reset the index of the new dataframe
data = data.reset_index(drop=True)

# Display the value counts of the sentiment column in the new dataframe
display(data['sentiment'].value_counts())
data.head() # Show dataframe

sentiment
0    100
1    100
2    100
Name: count, dtype: int64

Unnamed: 0,text,sentiment
0,gua harusnya hari ini ptm,0
1,belom ada ptm ygy https://t.co/lMYicodnts,0
2,twt sepi bgt pada ptm yaa,0
3,oi kalian ptm nya gimana?,0
4,sp yg hr ini ptm cungg,0


In [113]:
data['lower_text'] = data['text'].str.lower()  # Convert to lowercase
data['remove_url']=data['lower_text'].apply(lambda x: re.sub(r"http\S+", "", x))    # remove url
data['remove_num']=data['remove_url'].apply(lambda x: re.sub(r'\d+', '', x))    # Remove number
data['punctuation']=data['remove_num'].apply(lambda x: re.sub(r'[^\w\s]', '', x))    # Remove punctuation
data['tokenized_text']=data['punctuation'].apply(nltk.word_tokenize) # Tokenize the text

# Get the Indonesian stopwords
indonesian_stopwords = set(nltk.corpus.stopwords.words('indonesian'))

# Remove the stopwords from the tokenized texts
data['stopwords']=data['tokenized_text'].apply(lambda x: [w for w in x if not w in indonesian_stopwords])

# Initialize the Porter stemmer
stemmer = PorterStemmer()

# Stem the tokenized texts in the 'stopwords' column of the dataframe
data['stemmed']=data['stopwords'].apply(lambda x: [stemmer.stem(w) for w in x])

In [114]:
data.head()

Unnamed: 0,text,sentiment,lower_text,remove_url,remove_num,punctuation,tokenized_text,stopwords,stemmed
0,gua harusnya hari ini ptm,0,gua harusnya hari ini ptm,gua harusnya hari ini ptm,gua harusnya hari ini ptm,gua harusnya hari ini ptm,"[gua, harusnya, hari, ini, ptm]","[gua, ptm]","[gua, ptm]"
1,belom ada ptm ygy https://t.co/lMYicodnts,0,belom ada ptm ygy https://t.co/lmyicodnts,belom ada ptm ygy,belom ada ptm ygy,belom ada ptm ygy,"[belom, ada, ptm, ygy]","[belom, ptm, ygy]","[belom, ptm, ygi]"
2,twt sepi bgt pada ptm yaa,0,twt sepi bgt pada ptm yaa,twt sepi bgt pada ptm yaa,twt sepi bgt pada ptm yaa,twt sepi bgt pada ptm yaa,"[twt, sepi, bgt, pada, ptm, yaa]","[twt, sepi, bgt, ptm, yaa]","[twt, sepi, bgt, ptm, yaa]"
3,oi kalian ptm nya gimana?,0,oi kalian ptm nya gimana?,oi kalian ptm nya gimana?,oi kalian ptm nya gimana?,oi kalian ptm nya gimana,"[oi, kalian, ptm, nya, gimana]","[oi, ptm, nya, gimana]","[oi, ptm, nya, gimana]"
4,sp yg hr ini ptm cungg,0,sp yg hr ini ptm cungg,sp yg hr ini ptm cungg,sp yg hr ini ptm cungg,sp yg hr ini ptm cungg,"[sp, yg, hr, ini, ptm, cungg]","[sp, yg, hr, ptm, cungg]","[sp, yg, hr, ptm, cungg]"


In [132]:
data['freq_token']=data['tokenized_text'].apply(nltk.FreqDist).apply(lambda x: dict(x)) # Frequency word token

data['freq_token']


0      {'gua': 1, 'harusnya': 1, 'hari': 1, 'ini': 1,...
1             {'belom': 1, 'ada': 1, 'ptm': 1, 'ygy': 1}
2      {'twt': 1, 'sepi': 1, 'bgt': 1, 'pada': 1, 'pt...
3      {'oi': 1, 'kalian': 1, 'ptm': 1, 'nya': 1, 'gi...
4      {'sp': 1, 'yg': 1, 'hr': 1, 'ini': 1, 'ptm': 1...
                             ...                        
295    {'kita': 1, 'adalah': 1, 'remaja': 1, 'yang': ...
296       {'ptm': 1, 'males': 1, 'bngt': 1, 'coookk': 1}
297    {'kayaknya': 1, 'udah': 1, 'nyaman': 1, 'sekol...
298    {'ptm': 2, 'shift': 1, 'digabung': 1, 'agaknya...
299    {'gue': 1, 'ptm': 1, 'jadi': 1, 'pendiem': 1, ...
Name: freq_token, Length: 300, dtype: object

### 3. TF-IDF Vectorizer

In [11]:
from sklearn.model_selection import train_test_split

# Split data random 80%
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

print("Train: ",X_train.shape,y_train.shape,"Test: ",(X_test.shape,y_test.shape))

Train:  (240,) (240,) Test:  ((60,), (60,))


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create Tfidf vectorizer
vectorizer = TfidfVectorizer()
# vectorizer = TfidfVectorizer(stop_words='english')

# Convert the 'review' column into its TF-IDF vectorized form
X = vectorizer.fit_transform(df['tokenized_text'])

# Get the feature names (word tokens)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names: ",feature_names[:10]) # Print only first 5 words

Feature Names:  ['0dmkgngj95' '10' '100' '12an' '17' '19' '1q2kmlkxol' '1vpywrykyo' '2022'
 '2jam']


### 4. Train and Evaluate Data

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [14]:
# Convert training and test datasets into vectorized form
X_train_vectorized =  vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Multinomial Naive Bayes model
model = MultinomialNB(alpha=1)

# Train the model
model.fit(X_train_vectorized, y_train)

# Predict sentiment for test dataset
y_pred = model.predict(X_test_vectorized)

report=classification_report(y_test, y_pred,output_dict=True)
print(pd.DataFrame(report).transpose())

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred, normalize=True)
print(f'\nAccuracy of the model: {accuracy*100:.2f}%')

              precision    recall  f1-score    support
0              0.280000  0.437500  0.341463  16.000000
1              0.684211  0.590909  0.634146  22.000000
2              0.562500  0.409091  0.473684  22.000000
accuracy       0.483333  0.483333  0.483333   0.483333
macro avg      0.508904  0.479167  0.483098  60.000000
weighted avg   0.531794  0.483333  0.497261  60.000000

Accuracy of the model: 48.33%
