# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Dataset

In [2]:
data = pd.read_json("Dataset.json")
data['sentiment'] = data['sentiment'].replace([-1,0,1],[0,1,2])
data.head()

Unnamed: 0,index,text,Preprocesed,LangId,POS,sentiment
0,1,Don't know much about java.... Kadhi use nhi k...,Dont know much about java Kadhi use nhi kela j...,Dont\EN know\EN much\EN about\EN java\BO Kadhi...,Dont\ADV know\VB much\UN about\ADV java\NOU Ka...,1
1,2,Theory paper madhe ek answer dila ki atleast 4...,Theory paper madhe ek answer dila ki atleast 4...,Theory\EN paper\EN madhe\MR ek\MR answer\EN di...,Theory\NOU paper\NOU madhe\UN ek\NOU answer\NO...,2
2,3,khup assignment n exam load rahato Engineering...,khup assignment n exam load rahato Engineering...,khup\MR assignment\EN n\MR exam\EN load\EN rah...,khup\ADJ assignment\NOU n\UN exam\NOU load\NOU...,1
3,4,rutwik sathi online gheu shakto or tyala nntr ...,rutwik sathi online gheu shakto or tyala nntr ...,rutwik\BO sathi\MR online\EN gheu\MR shakto\MR...,rutwik\NOU sathi\UN online\ADV gheu\VB shakto\...,2
4,5,offline chalel mi roj yeto college madhe so,offline chalel mi roj yeto college madhe so,offline\EN chalel\MR mi\MR roj\MR yeto\MR coll...,offline\ADV chalel\UN mi\UN roj\ADV yeto\VB co...,2


In [3]:
data["sentiment"].value_counts()

2    392
1    320
0    297
Name: sentiment, dtype: int64

In [4]:
data.shape

(1009, 6)

In [5]:
data.isnull().sum()

index          0
text           0
Preprocesed    0
LangId         0
POS            0
sentiment      0
dtype: int64

In [6]:
data.dtypes

index           int64
text           object
Preprocesed    object
LangId         object
POS            object
sentiment       int64
dtype: object

# Metrics

In [7]:
# Positive and other
def positive(Y):
    A = np.copy(Y)
    A[A == 0] = 1
    return A

# Negative and other
def negative(Y):
    A = np.copy(Y)
    A[A == 2] = 1
    return A

# Neutral and other
def neutral(Y):
    A = np.copy(Y)
    A[A == 2] = 0
    return A

# Convert to binary labels for individual score calculation
def metrics(Y_test,Y_pred):

    # Positive accuracy and f1 score
    pos_t = positive(Y_test)
    pos_p = positive(Y_pred)
    pos_a = accuracy_score(pos_t,pos_p)
    pos_f = f1_score(pos_t,pos_p)

    # Negative accuracy and f1 score
    neg_t = negative(Y_test)
    neg_p = negative(Y_pred)
    neg_a = accuracy_score(neg_t,neg_p)
    neg_f = f1_score(neg_t,neg_p)

    # Neutral accuracy and f1 score
    neu_t = neutral(Y_test)
    neu_p = neutral(Y_pred)
    neu_a = accuracy_score(neu_t,neu_p)
    neu_f = f1_score(neu_t,neu_p)

    # Average accuracy and f1 score
    a_t = (pos_a+neg_a+neu_a)/3
    f_t = (pos_f+neg_f+neu_f)/3

    print("metrics","total","pos","neg","neu")
    print("accuracy_score",a_t,pos_a,neg_a,neu_a)
    print("f1_score",f_t,pos_f,neg_f,neu_f)

# Split

In [8]:
X = data['text']
Y = data['sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=data['sentiment'])

# SVM

In [9]:
vectorizer = TfidfVectorizer(min_df = 5,max_df = 0.8,sublinear_tf = True,use_idf = True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
SVMm = svm.SVC(kernel='linear')
SVMm.fit(train_vectors, Y_train)
Y_pred = SVMm.predict(test_vectors)

In [10]:
cfSVM = confusion_matrix(Y_test, Y_pred)
print(cfSVM)
metrics(Y_test,Y_pred)

[[40 33 16]
 [20 39 37]
 [15 26 77]]
metrics total pos neg neu
accuracy_score 0.6765676567656765 0.6897689768976898 0.7227722772277227 0.6171617161716172
f1_score 0.6498155914663634 0.7374301675977654 0.8099547511312217 0.40206185567010305


# Naive Bayes

In [11]:
vectorizer = TfidfVectorizer(min_df = 5,max_df = 0.8,sublinear_tf = True,use_idf = True)
NBx = vectorizer.fit_transform(X_train).toarray()
NBx_test = vectorizer.transform(X_test).toarray()
NB = MultinomialNB()
NB.fit(NBx, Y_train)
Y_pred = NB.predict(NBx_test)

In [12]:
cfNB = confusion_matrix(Y_test, Y_pred)
print(cfNB)
metrics(Y_test,Y_pred)

[[33 31 25]
 [20 36 40]
 [20 23 75]]
metrics total pos neg neu
accuracy_score 0.6501650165016502 0.6435643564356436 0.6831683168316832 0.6237623762376238
f1_score 0.6201785767970418 0.6896551724137931 0.7837837837837838 0.38709677419354843
