In [25]:
# Importing libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Create sample set of documents
docs = np.array(['Mirabai has won a silver medal in weight lifting in Tokyo olympics 2021',
                 'Sindhu has won a bronze medal in badminton in Tokyo olympics',
                 'Indian hockey team is in top four team in Tokyo olympics 2021 after 40 years'])

In [27]:
vect = CountVectorizer()

In [28]:
# Fit the bag-of-words model
bag_of_words = vect.fit_transform(docs)

In [29]:
# Get unique words / tokens found in all the documents. The unique words / tokens represents the features
print(vect.get_feature_names())

['2021', '40', 'after', 'badminton', 'bronze', 'four', 'has', 'hockey', 'in', 'indian', 'is', 'lifting', 'medal', 'mirabai', 'olympics', 'silver', 'sindhu', 'team', 'tokyo', 'top', 'weight', 'won', 'years']


In [30]:
# Associate the indices with each unique word
print(vect.vocabulary_)

{'mirabai': 13, 'has': 6, 'won': 21, 'silver': 15, 'medal': 12, 'in': 8, 'weight': 20, 'lifting': 11, 'tokyo': 18, 'olympics': 14, '2021': 0, 'sindhu': 16, 'bronze': 4, 'badminton': 3, 'indian': 9, 'hockey': 7, 'team': 17, 'is': 10, 'top': 19, 'four': 5, 'after': 2, '40': 1, 'years': 22}


In [31]:
# Print the numerical feature vector
print(bag_of_words.toarray())

[[1 0 0 0 0 0 1 0 2 0 0 1 1 1 1 1 0 0 1 0 1 1 0]
 [0 0 0 1 1 0 1 0 2 0 0 0 1 0 1 0 1 0 1 0 0 1 0]
 [1 1 1 0 0 1 0 1 2 1 1 0 0 0 1 0 0 2 1 1 0 0 1]]


In [32]:
# Sentimental analysis on "bag of words meets the bag of popcorns" dataset
data = pd.read_csv("C:/Users/user/Downloads/labeledTrainData.tsv", delimiter="\t")
data

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [33]:
# splitting the data into training and testing sets.
def simple_split(data,y,length,split_mark=0.7):
    if split_mark > 0 and split_mark < 1:
        n = int(split_mark * length)
    else:
        n - int(split_mark)
    x_train = data[:n].copy()
    x_test= data[n:].copy()
    y_train= y[:n].copy()
    y_test= y[n:].copy()
    return x_train,x_test,y_train,y_test

In [34]:
# Initializing CountVectorizer
vectorizer = CountVectorizer()

In [35]:
x_train,x_test,y_train,y_test = simple_split(data.review,data.sentiment,len(data))
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(17500,) (7500,) (17500,) (7500,)


In [36]:
print("Samples per class: {}" . format(np.bincount(y_train)))
print("Samples per class: {}" . format(np.bincount(y_test)))

Samples per class: [8761 8739]
Samples per class: [3739 3761]


In [37]:
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [38]:
f_names = vectorizer.get_feature_names()
print("Number of Features : {}". format(len(f_names)))
print("Features 1000 to 1050 : \n{}". format(f_names[1000:1050]))
print("Unique words:\n{}".format(vectorizer.get_feature_names()))

Number of Features : 65005
Features 1000 to 1050 : 
['abhisheh', 'abhishek', 'abhor', 'abhorrence', 'abhorrent', 'abhors', 'abi', 'abide', 'abides', 'abiding', 'abigail', 'abigil', 'abilities', 'ability', 'abishai', 'abject', 'abjectly', 'abkani', 'able', 'ably', 'abm', 'abner', 'abnormal', 'abnormality', 'abnormally', 'abo', 'aboard', 'abode', 'abodes', 'abolish', 'abolished', 'abolition', 'abolitionism', 'abolitionists', 'abominable', 'abominably', 'abomination', 'abominations', 'abominator', 'abominibal', 'aboooot', 'aborigin', 'aboriginal', 'aboriginals', 'aborigine', 'aborigines', 'aboriginies', 'aborigins', 'aborigone', 'abort']
Unique words:


In [39]:
vectorizer.vocabulary_

{'with': 63728,
 'all': 2250,
 'this': 57865,
 'stuff': 55431,
 'going': 24332,
 'down': 17279,
 'at': 4082,
 'the': 57672,
 'moment': 37800,
 'mj': 37605,
 've': 61636,
 'started': 54657,
 'listening': 33896,
 'to': 58358,
 'his': 27044,
 'music': 38654,
 'watching': 62795,
 'odd': 40456,
 'documentary': 16869,
 'here': 26685,
 'and': 2756,
 'there': 57747,
 'watched': 62789,
 'wiz': 63780,
 'moonwalker': 37998,
 'again': 1818,
 'maybe': 36083,
 'just': 31182,
 'want': 62647,
 'get': 23765,
 'certain': 9833,
 'insight': 29452,
 'into': 29840,
 'guy': 25447,
 'who': 63352,
 'thought': 57914,
 'was': 62740,
 'really': 46724,
 'cool': 12754,
 'in': 28719,
 'eighties': 18394,
 'make': 35139,
 'up': 61140,
 'my': 38747,
 'mind': 37212,
 'whether': 63236,
 'he': 26290,
 'is': 30128,
 'guilty': 25297,
 'or': 40900,
 'innocent': 29371,
 'part': 42097,
 'biography': 6386,
 'feature': 21000,
 'film': 21398,
 'which': 63239,
 'remember': 47561,
 'see': 50884,
 'cinema': 10761,
 'when': 63222,
 '

In [40]:
i = 45000
j=10
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(x_train[j:j+7,i:i+10].todense(),columns = words)

Unnamed: 0,producer,producer9and,producers,produces,producing,product,production,productions,productive,productively
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [41]:
scores=cross_val_score(LogisticRegression(max_iter=18000),x_train,y_train,cv=5)
print("Mean cross validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross validation accuracy: 0.88


In [42]:
logreg = LogisticRegression(max_iter=18000)
logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test) 
print("Training set score:{:.3f}".format(logreg.score(x_train,y_train)))
print("Testing set score:{:.3f}".format(logreg.score(x_test,y_test)))

Training set score:0.999
Testing set score:0.879


In [43]:
pred_logreg = logreg.predict(x_test)
confusion_logreg = confusion_matrix(y_test,pred_logreg)
print("Confusion matrix:\n{}".format(confusion_logreg))

Confusion matrix:
[[3279  460]
 [ 451 3310]]


In [44]:
accuracy_logreg = accuracy_score(y_test, pred_logreg).round(2)
print("Accuracy of Logistic Regression:",accuracy_logreg)

Accuracy of Logistic Regression: 0.88


In [45]:
nb=MultinomialNB()
nb.fit(x_train,y_train)
print("Training set score:{:.3f}".format(nb.score(x_train,y_train)))
print("Testing set score:{:.3f}".format(nb.score(x_test,y_test)))

Training set score:0.908
Testing set score:0.845


In [46]:
pred_nb = nb.predict(x_test)
confusion_nb = confusion_matrix(y_test,pred_nb)
print("Confusion matrix:\n{}".format(confusion_nb))

Confusion matrix:
[[3275  464]
 [ 702 3059]]


In [47]:
accuracy_nb = accuracy_score(y_test, pred_nb).round(2)
print("Accuracy of Multinomial Naive Bayes:",accuracy_nb)

Accuracy of Multinomial Naive Bayes: 0.84


In [48]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
print("Training set score:{:.3f}".format(rf.score(x_train,y_train)))
print("Testing set score:{:.3f}".format(rf.score(x_test,y_test)))

Training set score:1.000
Testing set score:0.845


In [49]:
pred_rf = rf.predict(x_test)
confusion_rf = confusion_matrix(y_test,pred_rf)
print("Confusion matrix:\n{}".format(confusion_rf))

Confusion matrix:
[[3146  593]
 [ 573 3188]]


In [50]:
accuracy_rf = accuracy_score(y_test, pred_rf).round(2)
print("Accuracy of Random Forest Classifier:",accuracy_rf)

Accuracy of Random Forest Classifier: 0.84
