In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [17]:
file = "imdb_dataset.csv"
def read_and_prepare_data(filename):
    df = pd.read_csv('imdb_dataset.csv')
    print(f'Shape of the dataset is {df.shape}')
    print(f'Target distribution is \n{df.sentiment.value_counts(normalize=True)}')
    df['sentiment'] = df['sentiment'].map({'negative':0,'positive':1})
    # To test our hypothesis let us convert into an imbalance problem with fewer positive samples
    # We will take a total of 25k points with 23k points to class 0 and 2k points to class 1

    negative_samples = df[df['sentiment'] == 0].sample(n=23000,random_state=60)
    positive_samples = df[df['sentiment'] == 1].sample(n=2000,random_state=60)

    final_df = pd.concat([negative_samples,positive_samples]).sample(frac=1,random_state=60) # A sample operation with full data is 
                                                                             # performed to shuffle the data points
    print(f'Final data shape is {final_df.shape}')
    print(f'Final target distribution is \n{final_df.sentiment.value_counts(normalize=True)}')
    return final_df

In [18]:
final_df = read_and_prepare_data(file)

Shape of the dataset is (50000, 2)
Target distribution is 
positive    0.5
negative    0.5
Name: sentiment, dtype: float64
Final data shape is (25000, 2)
Final target distribution is 
0    0.92
1    0.08
Name: sentiment, dtype: float64


In [19]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("[^a-z0-9]"," ",text)
    text = re.sub("(\s)+"," ",text)
    return text

In [20]:
final_df['clean_text'] = final_df['review'].apply(preprocess_text)

In [21]:
final_df = final_df[['clean_text','sentiment']]
xtrain,xtest,ytrain,ytest = train_test_split(final_df['clean_text'],final_df['sentiment'],test_size=0.25,random_state=60,stratify=final_df['sentiment'])

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(18750,) (18750,)
(6250,) (6250,)


In [22]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    
    print('Running TFIDF')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)
    
    
    model.fit(train_df,ytrain)
    preds_tfidf = model.predict(test_df)
    print(classification_report(ytest,preds_tfidf))
    
    print('Running WCBTFIDF')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)
    
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)
    
    model.fit(train_df,ytrain)
    preds_wcbtfidf = model.predict(test_df)
    print(classification_report(ytest,preds_wcbtfidf))

    return wcbtfidf,tfidf

In [23]:
model = LogisticRegression()
wcbtfidf_object,tfidf_object = check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running TFIDF
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5750
           1       0.74      0.15      0.25       500

    accuracy                           0.93      6250
   macro avg       0.84      0.57      0.61      6250
weighted avg       0.92      0.93      0.91      6250

Running WCBTFIDF
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5750
           1       0.75      0.19      0.31       500

    accuracy                           0.93      6250
   macro avg       0.84      0.59      0.63      6250
weighted avg       0.92      0.93      0.91      6250



## ANALYSIS

In [24]:
# Length Comparison

tfidf_vocab = tfidf_object.vocabulary_
wcbtfidf_vocab = wcbtfidf_object.combine_vocab

print(len(wcbtfidf_vocab),len(tfidf_vocab))

300 300


In [10]:
# Words that are present in tfidf vocab but not in wcbtfidf

print(list(set(tfidf_vocab) - set(wcbtfidf_vocab)))

['blood', 'direction', 'starts', 'annoying', 'stuff', 'god', 'car', 'came', 'episode', 'lost', 'wouldn', 'picture', 'totally', 'sound', 'cheap', 'care', 'save', 'quality', 'stars', 'hell', 'certainly', 'thinking', 'beginning', 'face', 'boy', 'yes', 'flick', 'kill', 'nice', 'truly', 'stop', 'killed', 'hope', 'written', 'attempt', 'moments', 'children', 'don', 'having', 'lead', 'person', 'felt', 'called', 'overall', 'wanted', 'white', 'writing', 'finally', 'evil', 'entire', 'obviously', 'does', 'girls', 'happens', 'turn', 'run', 'just', 'did', 'act', 'lack', 'looked', 'small', 'ridiculous', 'doing', 'gave', 'gore', 'title', 'game', 'example', 'hour', 'ok', 'case', 'playing', 'tries', 'recommend', 'decent', 'style']


In [11]:
# Words that are present in wcbtfidf but not in tfidf

print(list(set(wcbtfidf_vocab) - set(tfidf_vocab)))

['first', 'though', 'get', 'since', 'show', 'take', 'enough', 'although', 'another', 'seem', 'however', 'excellent', 'made', 'done', 'find', 'even', 'loved', 'definitely', 'back', 'go', 'still', 'anything', 'top', 'someone', 'rather', 'perfect', 'one', 'might', 'else', 'see', 'could', 'human', 'must', 'today', 'also', 'well', 'many', 'performances', 'everything', 'two', 'nothing', 'without', 'ever', 'seems', 'part', 'seemed', 'wonderful', 'every', 'less', 'yet', 'amazing', 'us', 'put', 'together', 'anyone', 'either', 'always', 'along', 're', 'become', 'would', 'gives', 'something', 'never', 'almost', 'may', 'give', 'found', 'around', 'next', 'much', 'name', 'everyone', 'last', 'least', 'three', 'whole']
