# Import Libraries

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [2]:
data = pd.read_csv('cleaned_tweets_sentiments.csv')

In [3]:
data.head()

Unnamed: 0,Tweet,Sentiment
0,social distancing done right,POSITIVE
1,deepaavali 2020 day 2 thaaimaman house family ...,POSITIVE
2,kluster najib apa sik saman terus sidak semua ...,NEGATIVE
3,ju yy that s the unique part of our compoundin...,NEUTRAL
4,cherating #socialdistancing,NEUTRAL


In [4]:
data.shape

(6999, 2)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999 entries, 0 to 6998
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet      6999 non-null   object
 1   Sentiment  6999 non-null   object
dtypes: object(2)
memory usage: 109.5+ KB


In [7]:
data['Sentiment'].unique()

array(['POSITIVE', 'NEGATIVE', 'NEUTRAL'], dtype=object)

# Data Preprocessing

### Encode Target

In [9]:
#Encode
le = preprocessing.LabelEncoder()
data['Sentiment'] = le.fit_transform(data['Sentiment'])

In [11]:
data.head()
#0:Negative, 1:Neutral, 2:Positive

Unnamed: 0,Tweet,Sentiment
0,social distancing done right,2
1,deepaavali 2020 day 2 thaaimaman house family ...,2
2,kluster najib apa sik saman terus sidak semua ...,0
3,ju yy that s the unique part of our compoundin...,1
4,cherating #socialdistancing,1


### Lowercasing

In [15]:
data['Tweet']=data['Tweet'].str.lower()
data['Tweet'].tail()

6994    sebuah kluster baru dikesan di sebuah restoran...
6995                comei je yg social distancing nya pon
6996    ya allah kena pergi sekolah kemas harini penat...
6997    we open now guys jom ke kita bukak setiap hari...
6998    thank you for pointing this out i m thankful f...
Name: Tweet, dtype: object

### Punctuation Removal

In [21]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations

def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

data['Tweet']= data['Tweet'].apply(lambda x: cleaning_punctuations(x))
data['Tweet'].head()

0                        social distancing done right 
1    deepaavali 2020 day 2 thaaimaman house family ...
2    kluster najib apa sik saman terus sidak semua ...
3    ju yy that s the unique part of our compoundin...
4                          cherating socialdistancing 
Name: Tweet, dtype: object

### Stopword removal

In [22]:
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
english_stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
malay_stop_words = []

with open('stopwords-ms.txt', "r") as file:
    for line in file:
        stop_word = line.strip()
        malay_stop_words.append(stop_word)

print(malay_stop_words)

['abdul', 'abdullah', 'acara', 'ada', 'adalah', 'ahmad', 'air', 'akan', 'akhbar', 'akhir', 'aktiviti', 'alam', 'amat', 'amerika', 'anak', 'anggota', 'antara', 'antarabangsa', 'apa', 'apabila', 'april', 'as', 'asas', 'asean', 'asia', 'asing', 'atas', 'atau', 'australia', 'awal', 'awam', 'bagaimanapun', 'bagi', 'bahagian', 'bahan', 'baharu', 'bahawa', 'baik', 'bandar', 'bank', 'banyak', 'barangan', 'baru', 'baru-baru', 'bawah', 'beberapa', 'bekas', 'beliau', 'belum', 'berada', 'berakhir', 'berbanding', 'berdasarkan', 'berharap', 'berikutan', 'berjaya', 'berjumlah', 'berkaitan', 'berkata', 'berkenaan', 'berlaku', 'bermula', 'bernama', 'bernilai', 'bersama', 'berubah', 'besar', 'bhd', 'bidang', 'bilion', 'bn', 'boleh', 'bukan', 'bulan', 'bursa', 'cadangan', 'china', 'dagangan', 'dalam', 'dan', 'dana', 'dapat', 'dari', 'daripada', 'dasar', 'datang', 'datuk', 'demikian', 'dengan', 'depan', 'derivatives', 'dewan', 'di', 'diadakan', 'dibuka', 'dicatatkan', 'dijangka', 'diniagakan', 'dis', 'dis

In [24]:
frequent_hashtags = ['mco','sosial','penjarakan','socialdistancing', 'covid19', 'kitajagakita', 'stayathome', 'staysafe', 'covid', 'penjarakansosial','social','distancing']
stop_words = english_stop_words + malay_stop_words + frequent_hashtags
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
STOPWORDS = set(stop_words)

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

data['Tweet'] = data['Tweet'].apply(lambda text: cleaning_stopwords(text))
data.head()

Unnamed: 0,Tweet,Sentiment
0,done right,2
1,deepaavali 2020 day 2 thaaimaman house family ...,2
2,kluster sik saman sidak sia sikda gk https,0
3,ju yy unique part compounding lab fits sop,1
4,cherating,1


### Clean Numbers

In [26]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

data['Tweet'] = data['Tweet'].apply(lambda x: cleaning_numbers(x))
data['Tweet'].head()

0                                           done right
1    deepaavali  day  thaaimaman house family dinne...
2           kluster sik saman sidak sia sikda gk https
3           ju yy unique part compounding lab fits sop
4                                            cherating
Name: Tweet, dtype: object

In [27]:
cleansed_negative = data[data['Sentiment'] == 0]
cleansed_neutral = data[data['Sentiment'] == 1]
cleansed_positive = data[data['Sentiment'] == 2]

### Tokenization

In [28]:
tokenizer = RegexpTokenizer(r'\w+')

data['Tweet'] = data['Tweet'].apply(tokenizer.tokenize)
data['Tweet'].head()

0                                        [done, right]
1    [deepaavali, day, thaaimaman, house, family, d...
2    [kluster, sik, saman, sidak, sia, sikda, gk, h...
3    [ju, yy, unique, part, compounding, lab, fits,...
4                                          [cherating]
Name: Tweet, dtype: object

### Separating Dataset

In [32]:
X = data['Tweet']
y = data['Sentiment']

In [33]:
print(X)

0                                           [done, right]
1       [deepaavali, day, thaaimaman, house, family, d...
2       [kluster, sik, saman, sidak, sia, sikda, gk, h...
3       [ju, yy, unique, part, compounding, lab, fits,...
4                                             [cherating]
                              ...                        
6994    [kluster, dikesan, restoran, brickfields, sari...
6995                            [comei, je, yg, nya, pon]
6996    [ya, allah, kena, pergi, sekolah, kemas, harin...
6997    [open, guys, jom, bukak, jam, pm, nasi, kerabu...
6998    [thank, pointing, thankful, whatever, returned...
Name: Tweet, Length: 6999, dtype: object


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state =42)

### Text Vectorization

##### TF-IDF

In [35]:
documents_X_train_tf = [" ".join(doc) for doc in X_train]
documents_X_test_tf = [" ".join(doc) for doc in X_test]

In [36]:
tf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
tfidf_matrix = tf_vectorizer.fit_transform(documents_X_train_tf)
print('No. of feature_words: ', len(tf_vectorizer.get_feature_names()))

No. of feature_words:  66616




In [37]:
X_test_tf  = tf_vectorizer.transform(documents_X_test_tf)

##### Word Embedding

In [38]:
documents_X_train_w2v = [" ".join(doc) for doc in X_train]
documents_X_test_w2v = [" ".join(doc) for doc in X_test]

In [39]:
model = Word2Vec(documents_X_train_w2v, min_count=1, vector_size=100, window=5, sg=1)

def calculate_document_vectors(documents, model):
    document_features = []
    for doc in documents:
        tokens = doc.split()
        valid_tokens = [word for word in tokens if word in model.wv]
        if valid_tokens:
            doc_vector = np.mean([model.wv[word] for word in valid_tokens], axis=0)
        else:
            doc_vector = np.zeros(model.vector_size)
        document_features.append(doc_vector)
    return document_features

In [49]:
X_train_w2v = calculate_document_vectors(documents_X_train_w2v, model)
X_test_w2v = calculate_document_vectors(documents_X_test_w2v, model)

##### CountVectorizer

In [41]:
documents_X_train_cv = [" ".join(doc) for doc in X_train]
documents_X_test_cv = [" ".join(doc) for doc in X_test]

In [42]:
cv_vectorizer = CountVectorizer()

In [43]:
X_train_cv = cv_vectorizer.fit_transform(documents_X_train_cv)
X_test_cv = cv_vectorizer.transform(documents_X_test_cv)

# Model Training and Evaluation

In [44]:
def model_Evaluate(model):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    cf_matrix = confusion_matrix(y_test, y_pred)
    categories = ['Negative', 'Neutral', 'Positive']
    group_names = ['TN', 'FP', 'FN', 'T']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(cf_matrix, annot=labels, cmap='Blues', fmt='',
                xticklabels=categories, yticklabels=categories)
    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

### TF-IDF

In [45]:
rf = RandomForestClassifier()
rf.fit(tfidf_matrix, y_train)
predictions_2 = rf.predict(X_test_tf)

confusion_mat = confusion_matrix(y_test, predictions_2)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_2)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[421 101  36]
 [ 46 291  39]
 [143 100 223]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.75      0.72       558
           1       0.59      0.77      0.67       376
           2       0.75      0.48      0.58       466

    accuracy                           0.67      1400
   macro avg       0.68      0.67      0.66      1400
weighted avg       0.68      0.67      0.66      1400



In [46]:
ls = LinearSVC()
ls.fit(tfidf_matrix, y_train)
predictions_3 = ls.predict(X_test_tf)

confusion_mat = confusion_matrix(y_test, predictions_3)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_3)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[442  67  49]
 [ 66 230  80]
 [140  59 267]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       558
           1       0.65      0.61      0.63       376
           2       0.67      0.57      0.62       466

    accuracy                           0.67      1400
   macro avg       0.67      0.66      0.66      1400
weighted avg       0.67      0.67      0.67      1400



In [47]:
lr = LogisticRegression()
lr.fit(tfidf_matrix, y_train)
predictions_5 = lr.predict(X_test_tf)

confusion_mat = confusion_matrix(y_test, predictions_5)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_5)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[443  69  46]
 [ 69 226  81]
 [159  62 245]]

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.79      0.72       558
           1       0.63      0.60      0.62       376
           2       0.66      0.53      0.58       466

    accuracy                           0.65      1400
   macro avg       0.65      0.64      0.64      1400
weighted avg       0.65      0.65      0.65      1400



### Word Embedding

In [51]:
rf = RandomForestClassifier()
rf.fit(X_train_w2v, y_train)
predictions_2 = rf.predict(X_test_w2v)

confusion_mat = confusion_matrix(y_test, predictions_2)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_2)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[548   1   9]
 [365   4   7]
 [449   2  15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57       558
           1       0.57      0.01      0.02       376
           2       0.48      0.03      0.06       466

    accuracy                           0.41      1400
   macro avg       0.49      0.34      0.22      1400
weighted avg       0.47      0.41      0.25      1400



In [52]:
ls = LinearSVC()
ls.fit(X_train_w2v, y_train)
predictions_3 = ls.predict(X_test_w2v)

confusion_mat = confusion_matrix(y_test, predictions_3)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_3)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[550   1   7]
 [365   3   8]
 [450   1  15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.99      0.57       558
           1       0.60      0.01      0.02       376
           2       0.50      0.03      0.06       466

    accuracy                           0.41      1400
   macro avg       0.50      0.34      0.22      1400
weighted avg       0.49      0.41      0.25      1400



In [53]:
lr = LogisticRegression()
lr.fit(X_train_w2v, y_train)
predictions_5 = lr.predict(X_test_w2v)

confusion_mat = confusion_matrix(y_test, predictions_5)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_5)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[549   1   8]
 [365   1  10]
 [450   0  16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57       558
           1       0.50      0.00      0.01       376
           2       0.47      0.03      0.06       466

    accuracy                           0.40      1400
   macro avg       0.46      0.34      0.21      1400
weighted avg       0.45      0.40      0.25      1400



### CountVectorizer

In [54]:
rf_count = RandomForestClassifier()
rf_count.fit(X_train_cv, y_train)
predictions_2 = rf_count.predict(X_test_cv)

confusion_mat = confusion_matrix(y_test, predictions_2)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_2)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[421  95  42]
 [ 39 283  54]
 [140  74 252]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.75      0.73       558
           1       0.63      0.75      0.68       376
           2       0.72      0.54      0.62       466

    accuracy                           0.68      1400
   macro avg       0.68      0.68      0.68      1400
weighted avg       0.69      0.68      0.68      1400



In [55]:
ls = LinearSVC()
ls.fit(X_train_cv, y_train)
predictions_3 = ls.predict(X_test_cv)

confusion_mat = confusion_matrix(y_test, predictions_3)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_3)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[388  83  87]
 [ 56 256  64]
 [108  73 285]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       558
           1       0.62      0.68      0.65       376
           2       0.65      0.61      0.63       466

    accuracy                           0.66      1400
   macro avg       0.66      0.66      0.66      1400
weighted avg       0.66      0.66      0.66      1400



In [56]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_cv, y_train)
predictions_5 = lr.predict(X_test_cv)

confusion_mat = confusion_matrix(y_test, predictions_5)
print("Confusion Matrix:")
print(confusion_mat)

class_report = classification_report(y_test, predictions_5)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[405  88  65]
 [ 53 266  57]
 [110  80 276]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72       558
           1       0.61      0.71      0.66       376
           2       0.69      0.59      0.64       466

    accuracy                           0.68      1400
   macro avg       0.67      0.68      0.67      1400
weighted avg       0.68      0.68      0.68      1400



In [None]:
# import pickle
# with open('model.pkl', 'wb') as model_file:
#     pickle.dump(rf_count, model_file)

In [None]:
# with open('count_vectorizer.pkl', 'wb') as vectorizer_file:
#     pickle.dump(cv_vectorizer, vectorizer_file)