In [1]:
import pandas as pd #dataframes and data manipulation
import numpy as np #maths funcationalities
import matplotlib.pyplot as plt #visualization
import sklearn #ml lib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


In [2]:
phrases = ["The quick brown fox jump over the lazy dog","You will face many defeats in life, but never let yourself be defeated", "The greatest glory in living lies not in never falling, but in rising every time we fall"]
phrases

['The quick brown fox jump over the lazy dog',
 'You will face many defeats in life, but never let yourself be defeated',
 'The greatest glory in living lies not in never falling, but in rising every time we fall']

In [4]:
vect = CountVectorizer() #convert text (list) into token (tokenize it)
vect.fit_transform(phrases)


<3x32 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [16]:
#get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

print(vect.get_feature_names()) # labels of unique words 
len(vect.get_feature_names()) #count of unique words

['be', 'brown', 'but', 'defeated', 'defeats', 'dog', 'every', 'face', 'fall', 'falling', 'fox', 'glory', 'greatest', 'in', 'jump', 'lazy', 'let', 'lies', 'life', 'living', 'many', 'never', 'not', 'over', 'quick', 'rising', 'the', 'time', 'we', 'will', 'you', 'yourself']


32

In [15]:
print(vect.vocabulary_) # words position in list
len(vect.vocabulary_)

{'the': 26, 'quick': 24, 'brown': 1, 'fox': 10, 'jump': 14, 'over': 23, 'lazy': 15, 'dog': 5, 'you': 30, 'will': 29, 'face': 7, 'many': 20, 'defeats': 4, 'in': 13, 'life': 18, 'but': 2, 'never': 21, 'let': 16, 'yourself': 31, 'be': 0, 'defeated': 3, 'greatest': 12, 'glory': 11, 'living': 19, 'lies': 17, 'not': 22, 'falling': 9, 'rising': 25, 'every': 6, 'time': 27, 'we': 28, 'fall': 8}


32

In [17]:
bow = vect.transform(phrases) #Extract token counts out of raw text documen
bow 

<3x32 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [19]:
print(bow)
print(type(bow)) # 1st element of tuple is the position os our phrase in phrases
                 # 2nd element in tuple is the word's vocabulary and 3rd count is no. of occurences.

  (0, 1)	1
  (0, 5)	1
  (0, 10)	1
  (0, 14)	1
  (0, 15)	1
  (0, 23)	1
  (0, 24)	1
  (0, 26)	2
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 13)	1
  (1, 16)	1
  (1, 18)	1
  (1, 20)	1
  (1, 21)	1
  (1, 29)	1
  (1, 30)	1
  (1, 31)	1
  (2, 2)	1
  (2, 6)	1
  (2, 8)	1
  (2, 9)	1
  (2, 11)	1
  (2, 12)	1
  (2, 13)	3
  (2, 17)	1
  (2, 19)	1
  (2, 21)	1
  (2, 22)	1
  (2, 25)	1
  (2, 26)	1
  (2, 27)	1
  (2, 28)	1
<class 'scipy.sparse.csr.csr_matrix'>


In [20]:
print(bow.toarray()) #one hot encoding of each phrase in phrases
                    # results in all 28 tokens with values ranging from 0,1,2.....

[[0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0]
 [1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 1]
 [0 0 1 0 0 0 1 0 1 1 0 1 1 3 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 0]]


In [22]:
data = pd.read_csv(r"C:\Users\risha\bag of words\bow labeledTrainData.tsv", delimiter ="\t")
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [25]:
data['review'][0] # multiple reviews scraped together

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [28]:
np.bincount(data.sentiment) #counting using numpy

array([12500, 12500], dtype=int64)

In [29]:
data['sentiment'].value_counts() #counting using pandas value_counts method

0    12500
1    12500
Name: sentiment, dtype: int64

In [31]:
#its a classification problem, we have to predict sentimetn
#split our dataset into train and test 


In [136]:
#not using traditional train_test_split function, instead writing own function
def split(X,y,length,split_mark):
    if split_mark > 0. and split_mark < 1.0:
        n = int(length*split_mark)
    else:
        return("Enter value between 0.0 and 1.0")
    
    X_train = X[:n].copy()
    X_test = X[n:].copy()
    y_train = y[:n].copy()
    y_test = y[n:].copy()
    return  X_train, X_test, y_train, y_test
    

In [137]:
#initialiszing count vectorizer
vect = CountVectorizer()

In [138]:
# Shuffling dataset using sklearn shuffle 
from sklearn.utils import shuffle

data = shuffle(data)
print(data.head())

            id  sentiment                                             review
8828    9075_1          0  The Slackers as titled in this movie are three...
21475  7793_10          1  My brother brought this movie home from the re...
6581    8340_1          0  One of the worst films I have ever seen. Got s...
22585   3432_4          0  A killer, cannibal rapist is killed by a craze...
10741   9514_1          0  Please, spare me of these movies that teach us...


In [139]:
#spliiting data set using split function
X_train, X_test, y_train, y_test = split(data.review, data.sentiment, len(data), 0.7)



In [140]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(17500,) (7500,) (17500,) (7500,)


In [141]:
print(np.bincount(y_train)) #counting using numpy which results in no. of 0 and 1.
print(np.bincount(y_test))

[8763 8737]
[3737 3763]


In [142]:
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)


In [143]:
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))

(17500, 64987)
17500
(7500, 64987)
7500


In [144]:
feature_names = vect.get_feature_names()

print("No. of features/words : {}".format(len(feature_names)))
print("First 10 features : {}".format(feature_names[0:10]))
print("Last 10 features : {}".format(feature_names[-11:-1]))


No. of features/words : 64987
First 10 features : ['00', '000', '0000000000001', '00001', '00015', '001', '003830', '006', '007', '0080']
Last 10 features : ['åmål', 'écran', 'élan', 'émigré', 'émigrés', 'était', 'état', 'évery', 'ísnt', 'østbye']


In [145]:
feature_vocab = vect.vocabulary_

print("length of vocab : {}".format(feature_vocab))
print("length of vocab : {}".format(len(feature_vocab)))

length of vocab : 64987


In [146]:
# cross validation on train data for accuracy
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean Cross-Validation Accuracy: {:.2f}".format(np.mean(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Cross-Validation Accuracy: 0.87


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [147]:
# model creation (logistic regression)
model = LogisticRegression()
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [148]:
print("The training score is: {}".format(model.score(X_train, y_train)))
print("The test score is : {}".format(model.score(X_test, y_test)))

The training score is: 0.9892
The test score is : 0.88


In [149]:
# confusion matrix
pred = model.predict(X_test)
cm = confusion_matrix(y_test, pred)
print("Confusion Matrix: \n{}".format(cm)) 

Confusion Matrix: 
[[3276  461]
 [ 439 3324]]


## below is for kaggle

In [180]:
Data = pd.read_csv(r"C:\Users\risha\bag of words\bow labeledTrainData.tsv", delimiter ="\t")
Data2 = pd.read_csv(r"C:\Users\risha\bag of words\bow testData.tsv", delimiter ="\t")

X_train_new = Data["review"]
X_test_new = Data2["review"]
y_train_new = Data["sentiment"]

In [181]:
X_train_new = vect.fit_transform(X_train_new)
X_test_new = vect.transform(X_test_new)

In [182]:
print(X_train_new.shape)
print(len(y_train_new))
print(X_test_new.shape)


(25000, 74849)
25000
(25000, 74849)


In [183]:
feature_names = vect.get_feature_names()

print("No. of features/words : {}".format(len(feature_names)))
print("First 10 features : {}".format(feature_names[0:10]))
print("Last 10 features : {}".format(feature_names[-11:-1]))

No. of features/words : 74849
First 10 features : ['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007']
Last 10 features : ['émigrés', 'était', 'état', 'étc', 'évery', 'êxtase', 'ís', 'ísnt', 'østbye', 'über']


In [184]:
feature_vocab = vect.vocabulary_

print("length of vocab : {}".format(feature_vocab))
print("length of vocab : {}".format(len(feature_vocab)))

length of vocab : 74849


In [185]:
# model creation (logistic regression)
model = LogisticRegression()
model.fit(X_train_new, y_train_new)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [186]:
print("The training score is: {}".format(model.score(X_train_new, y_train_new)))

The training score is: 0.9816


In [187]:
pred_new = model.predict(X_test_new)
pred_new

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [196]:
submission = pd.DataFrame({
        "id" : Data2["id"],
        "sentiment": pred_new
    })
submission.head(20)


Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,1
4,12128_7,1
5,2913_8,1
6,4396_1,0
7,395_2,0
8,10616_1,0
9,9074_9,0


In [198]:
submission.to_csv('Bag of Words.csv', index=False)