In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score,roc_auc_score, roc_curve, precision_recall_curve

In [4]:
# Load the preprocessed corpus from the spreadsheet stored next to the notebook.
data = pd.read_excel(io='./dataset/preprocessed.xlsx')
# Preview the first five rows as the cell's rich output.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,content,label,length
0,0,চট্টগ্রাম প্রিমিয়ার বিশ্ববিদ্যালয়ের দামপাড়া ভব...,1,157
1,1,কক্সবাজারের রামুতে বৌদ্ধ বিহার বসতিতে হামলার স...,1,386
2,2,গার্লফ্রেন্ড দেয়া জিপিএ পেলে যেনো মায়ের হাতের ...,0,76
3,3,কারণে বিখ্যাত বন্দর নগরী চট্টগ্রাম পাশ্চাত্যের...,0,112
4,4,বাম গণতান্ত্রিক জোটের নির্বাচন কমিশন ইসি কার্য...,1,146


In [5]:
# Keep only the rows where both the text and its label are present.
data = data.dropna(subset=['content', 'label'])

In [6]:
def calc_unigram_tfidf(reviews):
    """Fit a whitespace-tokenised TF-IDF model on ``reviews``.

    Parameters
    ----------
    reviews : pandas.Series or other array-like of text
        The documents to vectorise. Every entry is cast to a unicode
        string before fitting.

    Returns
    -------
    tfidf : TfidfVectorizer
        The fitted vectorizer.
    X
        The document-term matrix returned by ``fit_transform`` for
        ``reviews``.
    """
    # np.asarray accepts a Series as well as a plain list/array of strings.
    documents = np.asarray(reviews).astype('U')
    tfidf = TfidfVectorizer(
        use_idf=True,
        # str.split behaves like ``lambda x: x.split()`` but, unlike a lambda,
        # can be pickled, so the fitted vectorizer can be saved and reloaded.
        tokenizer=str.split,
        # A custom tokenizer makes the default token_pattern unused; None
        # states that explicitly and avoids scikit-learn's warning about it.
        token_pattern=None,
    )
    X = tfidf.fit_transform(documents)
    return tfidf,X

def label_encoding(label):
    """Encode class labels as integer codes using ``LabelEncoder``.

    Parameters
    ----------
    label : array-like
        The class label of every sample.

    Returns
    -------
    numpy.ndarray
        The encoded label of every sample, in the same order as ``label``.
    """
    encoder = LabelEncoder()
    # Fit and transform in a single call; the encoder's class names were
    # never used by this function, so they are no longer read.
    return np.asarray(encoder.fit_transform(label))

def dataset_split(feature_space,sentiment,*,train_size=0.7,test_size=0.3,random_state=0,stratify=None):
    """Split features and labels into train and test subsets.

    Parameters
    ----------
    feature_space : array-like or matrix
        One row of features per sample.
    sentiment : array-like
        The label of every sample, aligned with ``feature_space``.
    train_size, test_size : float or int, keyword-only
        Forwarded to ``train_test_split``; the defaults give the original
        70/30 split.
    random_state : int or None, keyword-only
        Seed forwarded to ``train_test_split``; the default of 0 keeps the
        original split reproducible.
    stratify : array-like or None, keyword-only
        Forwarded to ``train_test_split``; pass the labels to request a
        stratified split. The default of None matches the original call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train,X_test,y_train,y_test = train_test_split(
        feature_space,
        sentiment,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train,X_test,y_train,y_test

In [7]:
def models_results(X_train, X_test, y_train,y_test):
    """Train four classifiers and print a classification report for each.

    Each model is built, fitted on ``(X_train, y_train)`` and used to predict
    ``X_test``. The report comparing ``y_test`` with those predictions is
    printed between a banner line and a dashed footer. Nothing is returned.
    """
    footer = "---------------------------------------"
    # (banner, estimator class, constructor arguments), in reporting order.
    experiments = (
        ("-----------Multinomial Naive----------------", MultinomialNB, {}),
        ("-----------Support Vector Machine----------------", SVC, {}),
        (
            "-----------Random Forest----------------",
            RandomForestClassifier,
            {"n_estimators": 100, "criterion": "entropy", "random_state": 0},
        ),
        (
            "-----------Logistic Regression----------------",
            LogisticRegression,
            {"random_state": 123},
        ),
    )
    for banner, estimator_cls, params in experiments:
        print(banner)
        # Build each model only when its turn comes, as in a hand-written sequence.
        classifier = estimator_cls(**params)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)
        print(classification_report(y_test, predictions))
        print(footer)

In [8]:
# Build the unigram TF-IDF features for every document in the corpus.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so the held-out documents also shape the vocabulary and weights.
cv, feature_vector = calc_unigram_tfidf(data['content'])

# Turn the label column into integer class ids and show them.
labels = label_encoding(data['label'])
print(labels)

# Partition the features and labels into train and test subsets.
(
    X_train_cleaned,
    X_test_cleaned,
    y_train_cleaned,
    y_test_cleaned,
) = dataset_split(feature_space=feature_vector, sentiment=labels)

[1 1 0 ... 0 1 0]


In [9]:
# Sanity check: number of labels, then the shapes of the train and test matrices.
print(len(labels), X_train_cleaned.shape, X_test_cleaned.shape, sep='\n')

2000
(1400, 52233)
(600, 52233)


In [10]:
# Fit every classifier on the training split and print its report for the test split.
models_results(
    X_train=X_train_cleaned,
    X_test=X_test_cleaned,
    y_train=y_train_cleaned,
    y_test=y_test_cleaned,
)

-----------Multinomial Naive----------------
              precision    recall  f1-score   support

           0       0.94      0.84      0.89       306
           1       0.85      0.94      0.89       294

    accuracy                           0.89       600
   macro avg       0.90      0.89      0.89       600
weighted avg       0.90      0.89      0.89       600

---------------------------------------
-----------Support Vector Machine----------------
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       306
           1       0.92      0.92      0.92       294

    accuracy                           0.92       600
   macro avg       0.92      0.92      0.92       600
weighted avg       0.92      0.92      0.92       600

---------------------------------------
-----------Random Forest----------------
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       306
           1       0.