### 1. Import Dependencies

In [9]:
import string
import re
import nltk
import pandas as pd
import numpy as np

nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Preprocessing

import csv

In [10]:
import pandas as pd

df = pd.read_csv('data/twitter.csv')
df.head()

Unnamed: 0,text,sentiment
0,"hari ini aku masih libur, masih ada waktu semi...",1
1,yg ptm hari ini semangaattt,1
2,yg lagi ptm sini ciss dulu,1
3,yang ptm semangat ya,1
4,yang ptm selamat menikmati hari bahagia kalian ya,1


In [11]:
df['lower_text'] = df['text'].str.lower()  # Convert to lowercase
df['remove_number'] = df['lower_text'].str.replace(r'\d+', '') # Remove number
df['punctuation'] = df['remove_number'].str.replace('[^\w\s]', '')  # Remove punctuation
df['tokenized_text'] = df['punctuation'].apply(nltk.word_tokenize) # Tokenize the text
df['freq_token'] = df['tokenized_text'].apply(nltk.FreqDist).apply(lambda x: dict(x)) # Frequency word token
df['stemmed'] = df['punctuation'].apply(lambda x: ' '.join(nltk.corpus.stopwords.words('indonesia') if x.lower() in nltk.corpus.stopwords.words('english') else [x]))

print(df.head)

<bound method NDFrame.head of                                                   text  sentiment  \
0    hari ini aku masih libur, masih ada waktu semi...          1   
1                          yg ptm hari ini semangaattt          1   
2                           yg lagi ptm sini ciss dulu          1   
3                                 yang ptm semangat ya          1   
4    yang ptm selamat menikmati hari bahagia kalian ya          1   
..                                                 ...        ...   
295  IHHH YANG PTM JAMKOS MULU, giliran sesi gue ga...          2   
296  Hari pertama ptm udah ada tugas kelompok aja, ...          2   
297  Hari pertama ptm full ditambah lagi dapet itu ...          2   
298                    Hari pertama PTM, ngantuk berat          2   
299  Hadahh baru hari pertama ptm udah dengerin ora...          2   

                                            lower_text  \
0    hari ini aku masih libur, masih ada waktu semi...   
1                        

### 3. TF-IDF Vectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Create Tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Convert the 'review' column into its TF-IDF vectorized form
X = vectorizer.fit_transform(df['punctuation'])

### 4. Train and Evaluate Data

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
# Split data random 80%
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

# Convert training and test datasets into vectorized form
X_train_vectorized =  vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Multinomial Naive Bayes model
model = MultinomialNB(alpha=1)

# Train the model
model.fit(X_train_vectorized, y_train)

# Predict sentiment for test dataset
y_pred = model.predict(X_test_vectorized)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred, normalize=True)
print(f'Accuracy of the model: {accuracy*100:.2f}%')