In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [2]:
df = pd.read_csv('imdb_dataset.csv')
df.shape

(50000, 2)

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Target data distribution
df['sentiment'].value_counts(normalize=True)

negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [5]:
df['sentiment'] = df['sentiment'].map({'negative':0,'positive':1})

In [17]:
# To test our hypothesis let us convert into an imbalance problem with fewer positive samples
# We will take a total of 25k points with 23k points to class 0 and 2k points to class 1

negative_samples = df[df['sentiment'] == 0].sample(n=23000,random_state=60)
positive_samples = df[df['sentiment'] == 1].sample(n=2000,random_state=60)

final_df = pd.concat([negative_samples,positive_samples]).sample(frac=1) # A sample operation with full data is 
                                                                         # performed to shuffle the data points
final_df['sentiment'].value_counts(normalize=True)

0    0.92
1    0.08
Name: sentiment, dtype: float64

In [18]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("[^a-z0-9]"," ",text)
    text = re.sub("(\s)+"," ",text)
    return text

In [19]:
final_df['clean_text'] = final_df['review'].apply(preprocess_text)

In [20]:
print(final_df.shape)
final_df = final_df[['clean_text','sentiment']]
print(final_df.shape)

(25000, 3)
(25000, 2)


In [21]:
xtrain,xtest,ytrain,ytest = train_test_split(final_df['clean_text'],final_df['sentiment'],test_size=0.25,random_state=60,stratify=final_df['sentiment'])

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(18750,) (18750,)
(6250,) (6250,)


In [22]:
# Distribution check in train and test

print(ytrain.value_counts(normalize=True))
print(ytest.value_counts(normalize=True))

0    0.92
1    0.08
Name: sentiment, dtype: float64
0    0.92
1    0.08
Name: sentiment, dtype: float64


In [23]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    
    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)
    
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    
    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)
    
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    return wcbtfidf,tfidf

In [24]:
model = LogisticRegression()
wcbtfidf_object,tfidf_object = check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running base version
Precision is 0.672566371681416
Recall is 0.152
ROC curve is 0.5727826086956521
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5750
           1       0.67      0.15      0.25       500

    accuracy                           0.93      6250
   macro avg       0.80      0.57      0.60      6250
weighted avg       0.91      0.93      0.90      6250

Running my version
Precision is 0.7112676056338029
Recall is 0.202
ROC curve is 0.5974347826086956
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5750
           1       0.71      0.20      0.31       500

    accuracy                           0.93      6250
   macro avg       0.82      0.60      0.64      6250
weighted avg       0.92      0.93      0.91      6250



## ANALYSIS

In [25]:
# Length Comparison

tfidf_vocab = tfidf_object.vocabulary_
wcbtfidf_vocab = wcbtfidf_object.combine_vocab

print(len(wcbtfidf_vocab),len(tfidf_vocab))

300 300


In [26]:
# Words that are present in tfidf vocab but not in wcbtfidf

print(list(set(tfidf_vocab) - set(wcbtfidf_vocab)))

['beginning', 'having', 'turn', 'finally', 'act', 'cheap', 'don', 'children', 'won', 'writing', 'god', 'kill', 'hope', 'doing', 'white', 'moments', 'ridiculous', 'gave', 'tries', 'hour', 'thinking', 'ok', 'lack', 'wanted', 'went', 'flick', 'lead', 'lost', 'direction', 'totally', 'stars', 'small', 'title', 'blood', 'decent', 'starts', 'just', 'called', 'attempt', 'remember', 'truly', 'run', 'felt', 'mr', 'picture', 'boy', 'stop', 'recommend', 'face', 'mother', 'gore', 'looked', 'quality', 'yes', 'sound', 'killed', 'hell', 'example', 'style', 'save', 'wouldn', 'does', 'came', 'overall', 'case', 'friend', 'written', 'stuff', 'car', 'evil', 'care', 'obviously', 'early', 'annoying', 'did', 'girls']


In [27]:
# Words that are present in wcbtfidf but not in tfidf

print(list(set(wcbtfidf_vocab) - set(tfidf_vocab)))

['even', 'us', 'might', 'full', 'heart', 'every', 'seems', 'loved', 'last', 'top', 'could', 'without', 'less', 'get', 'three', 'done', 'well', 'next', 'wonderful', 'performances', 'human', 'else', 'put', 'may', 'since', 'much', 'although', 'another', 'anyone', 'almost', 'anything', 'also', 'would', 'go', 'however', 'rather', 'one', 'definitely', 'yet', 'though', 'nothing', 'around', 'everyone', 'something', 'take', 'perfect', 'part', 'enough', 'everything', 'made', 'ever', 'whole', 'must', 're', 'seem', 'never', 'many', 'find', 'always', 'two', 'together', 'give', 'excellent', 'found', 'name', 'still', 'amazing', 'someone', 'show', 'first', 'back', 'least', 'either', 'along', 'see', 'today']
