In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import re
from nltk.corpus import stopwords
import joblib

In [2]:
dataset = pd.read_csv('../Datasets/Train.csv')

# 1.) Exploratory Data Analysis

In [3]:
dataset.shape

(40000, 2)

In [4]:
dataset['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [5]:
dataset['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [6]:
valueCounts = pd.DataFrame(dataset['label'].value_counts())

In [7]:
valueCounts.rename(columns={'label':'Count'}, inplace=True)

In [8]:
valueCounts

Unnamed: 0,Count
0,20019
1,19981


In [9]:
px.pie(data_frame=valueCounts, values='Count', labels=valueCounts.index, title='Count of Positive and Negative comments')

# 2.) Text Preprocessing

## 2.1) Stemming the text

In [10]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [11]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [12]:
dataset.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [13]:
sw = stopwords.words('English')
sw[0:5]

['i', 'me', 'my', 'myself', 'we']

In [14]:
def cleanData(value):
  value = re.sub("[^a-zA-Z]", " ", value)
  value = re.sub(" +", " ", value)
  return value

In [15]:
def stemSentences(sentence):
  sentence = cleanData(sentence)
  wordsList = list()
  sentence = sentence.lower()
  wordsList = sentence.split(" ")
  stemmedWords = [ps.stem(x) for x in wordsList if not x in sw]
  return " ".join(stemmedWords)

In [16]:
def lemamtizeSentence(sentence):
  sentence = cleanData(sentence)
  wordsList = list()
  sentence = sentence.lower()
  wordsList = sentence.split(" ")
  stemmedWords = [lm.lemmatize(x) for x in wordsList if not x in sw]
  return " ".join(stemmedWords)

In [17]:
dataset['Stemmed Text'] = dataset['text'].apply(stemSentences)
dataset['Lemmatized Text'] = dataset['text'].apply(lemamtizeSentence)

In [18]:
dataset.head()

Unnamed: 0,text,label,Stemmed Text,Lemmatized Text
0,I grew up (b. 1965) watching and loving the Th...,0,grew b watch love thunderbird mate school watc...,grew b watching loving thunderbird mate school...
1,"When I put this movie in my DVD player, and sa...",0,put movi dvd player sat coke chip expect hope ...,put movie dvd player sat coke chip expectation...
2,Why do people who do not know what a particula...,0,peopl know particular time past like feel need...,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,even though great interest biblic movi bore de...,even though great interest biblical movie bore...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad armi fan noth ever chang got t...,im die hard dad army fan nothing ever change g...


## 2.2) Vectorizing the text

In [19]:
X = dataset['Lemmatized Text']
y_train = dataset['label']

In [20]:
X.shape

(40000,)

In [21]:
tfidf = TfidfVectorizer(max_features=50000)
X_train = tfidf.fit_transform(X).toarray()
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
X_train.shape

(40000, 50000)

# 3.) Model Creation

### 3.1) Baseline modelling

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

In [46]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

In [None]:
accuracy_score(y_train, mnb.predict(X_train))

0.90255

# 4.) Model Validation

In [None]:
validation_dataset = pd.read_csv('../Datasets/Valid.csv')

In [None]:
validation_dataset.head()

Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0
2,The Guidelines state that a comment must conta...,0
3,This movie is a muddled mish-mash of clichés f...,0
4,Before Stan Laurel became the smaller half of ...,0


In [None]:
validation_dataset.isnull().sum()

text     0
label    0
dtype: int64

In [None]:
validation_dataset['Lemmatized text'] = validation_dataset['text'].apply(lemamtizeSentence)

In [None]:
validation_dataset.head()

Unnamed: 0,text,label,Lemmatized text
0,It's been about 14 years since Sharon Stone aw...,0,year since sharon stone awarded viewer leg cro...
1,someone needed to make a car payment... this i...,0,someone needed make car payment truly awful ma...
2,The Guidelines state that a comment must conta...,0,guideline state comment must contain minimum f...
3,This movie is a muddled mish-mash of clichés f...,0,movie muddled mish mash clich recent cinema pr...
4,Before Stan Laurel became the smaller half of ...,0,stan laurel became smaller half time greatest ...


In [None]:
vectorized_valid = tfidf.transform(validation_dataset['Lemmatized text']).toarray()

In [None]:
vectorized_valid.shape

(5000, 50000)

In [None]:
X_valid = vectorized_valid
y_valid = validation_dataset['label']

In [None]:
train_pred = mnb.predict(X_train)
valid_pred = mnb.predict(X_valid)

train_accuracy = accuracy_score(y_train, train_pred)
valid_accuracy = accuracy_score(y_valid, valid_pred)


print('Training data accuracy score: ', train_accuracy)
print('Validation data accuracy score: ', valid_accuracy)

print('Training Data', classification_report(y_train, train_pred))
print()
print('Validation Data', classification_report(y_valid, valid_pred))

Training data accuracy score:  0.90255
Validation data accuracy score:  0.8654
Training Data               precision    recall  f1-score   support

           0       0.89      0.92      0.90     20019
           1       0.91      0.89      0.90     19981

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000


Validation Data               precision    recall  f1-score   support

           0       0.86      0.87      0.87      2486
           1       0.87      0.86      0.86      2514

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



# 5.) Testing model performance

In [None]:
test_dataset = pd.read_csv('../Datasets/Test.csv')

In [None]:
test_dataset.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [None]:
test_dataset['Lemmatized Text'] = test_dataset['text'].apply(lemamtizeSentence)

In [None]:
test_dataset.head()

Unnamed: 0,text,label,Lemmatized Text
0,I always wrote this series off as being a comp...,0,always wrote series complete stink fest jim be...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0,st watched dir steve purcell typical mary kat...
2,This movie was so poorly written and directed ...,0,movie poorly written directed fell asleep minu...
3,The most interesting thing about Miryang (Secr...,1,interesting thing miryang secret sunshine acto...
4,"when i first read about ""berlin am meer"" i did...",0,first read berlin meer expect much thought rig...


In [None]:
test_dataset.shape

(5000, 3)

In [None]:
X_test = tfidf.transform(test_dataset['Lemmatized Text']).toarray()
y_test = test_dataset['label']

In [None]:
test_pred = mnb.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)

print('Test data accuracy score: ', test_accuracy)
print()
print('Test Data', classification_report(y_test, test_pred))

Test data accuracy score:  0.8684

Test Data               precision    recall  f1-score   support

           0       0.86      0.88      0.87      2495
           1       0.88      0.86      0.87      2505

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



# 6.) Saving Model

In [None]:
import joblib

In [None]:
joblib.dump(mnb, filename = '../Model/lemmatize_tfidf_naiveBayes.pkl')
#joblib.dump(mnb, filename = '../Model/lemmatize_Bow_naiveBayes.pkl')
joblib.dump(tfidf, '../Model/tfidf_vectorizer.pkl')

['../Model/tfidf_vectorizer.pkl']

# 7.) Word2Vec Implementation

In [22]:
import gensim

In [29]:
dataset.head()

Unnamed: 0,text,label,Stemmed Text,Lemmatized Text
0,I grew up (b. 1965) watching and loving the Th...,0,grew b watch love thunderbird mate school watc...,grew b watching loving thunderbird mate school...
1,"When I put this movie in my DVD player, and sa...",0,put movi dvd player sat coke chip expect hope ...,put movie dvd player sat coke chip expectation...
2,Why do people who do not know what a particula...,0,peopl know particular time past like feel need...,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,even though great interest biblic movi bore de...,even though great interest biblical movie bore...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad armi fan noth ever chang got t...,im die hard dad army fan nothing ever change g...


In [38]:
processedWordsList = list()

In [39]:
def processSentence(sentence):
  processedWords = gensim.utils.simple_preprocess(sentence)
  processedWordsList.append(processedWords)
  return " ".join(processedWords)

In [40]:
dataset['W2V Processed'] = dataset['text'].apply(processSentence)

In [41]:
dataset.head()

Unnamed: 0,text,label,Stemmed Text,Lemmatized Text,W2V Processed
0,I grew up (b. 1965) watching and loving the Th...,0,grew b watch love thunderbird mate school watc...,grew b watching loving thunderbird mate school...,grew up watching and loving the thunderbirds a...
1,"When I put this movie in my DVD player, and sa...",0,put movi dvd player sat coke chip expect hope ...,put movie dvd player sat coke chip expectation...,when put this movie in my dvd player and sat d...
2,Why do people who do not know what a particula...,0,peopl know particular time past like feel need...,people know particular time past like feel nee...,why do people who do not know what particular ...
3,Even though I have great interest in Biblical ...,0,even though great interest biblic movi bore de...,even though great interest biblical movie bore...,even though have great interest in biblical mo...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad armi fan noth ever chang got t...,im die hard dad army fan nothing ever change g...,im die hard dads army fan and nothing will eve...


In [42]:
models = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [45]:
models.build_vocab(processedWordsList, progress_per=1000)

In [46]:
models.epochs

5

In [47]:
models.corpus_count

40000

In [48]:
models.train(processedWordsList, total_examples= models.corpus_count, epochs=models.epochs )

(33975497, 44737235)

In [52]:
joblib.dump(value = models, filename= '../Model/Word2Vec.pkl')

['../Model/Word2Vec.pkl']

In [59]:
models.wv.most_similar('good')

[('decent', 0.7491356134414673),
 ('bad', 0.7384039759635925),
 ('great', 0.7032320499420166),
 ('nice', 0.6488243937492371),
 ('cool', 0.626008152961731),
 ('ok', 0.6119771599769592),
 ('fine', 0.5877658724784851),
 ('disappointing', 0.5551345348358154),
 ('solid', 0.5548805594444275),
 ('impressive', 0.5513992309570312)]