In [None]:
'''

An analysis of why wcbtfidf works better than plain tfidf on an imbalanced-class example.
We look at the difference between the vocabularies the two vectorizers select and why the one used by wcbtfidf is better than the one used by tfidf.

'''

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.tokenize import TreebankWordTokenizer

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [2]:
# Read the tab-separated training file and preview its first rows.
df = pd.read_csv('train.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
def change_target(value):
    """Collapse a raw ``Sentiment`` value into a binary target.

    Values 2, 3 and 4 become 1; values 0 and 1 become 0. Any other value
    yields ``None``.
    """
    positive_labels = (2, 3, 4)
    negative_labels = (0, 1)

    if value in positive_labels:
        return 1
    return 0 if value in negative_labels else None
    

# Derive the binary target from the raw Sentiment column, then show the
# resulting class proportions.
df['target'] = df['Sentiment'].map(change_target)
df['target'].value_counts(normalize=True)

1    0.779924
0    0.220076
Name: target, dtype: float64

In [4]:
def preprocess_text(text):
    """Normalise a phrase for vectorisation.

    The text is lower-cased, every character outside ``a-z0-9`` is replaced
    by a space, and runs of whitespace are collapsed to a single space.
    Leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        Raw phrase.

    Returns
    -------
    str
        The cleaned phrase.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9]", " ", text)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence (a warning today, slated to become an error).
    text = re.sub(r"\s+", " ", text)
    return text

In [5]:
# Clean every phrase and keep the result in its own column.
df['clean_text'] = df['Phrase'].map(preprocess_text)

In [6]:
# Print the shape before and after keeping only the two columns used below.
print(df.shape)
df = df.loc[:, ['clean_text', 'target']]
print(df.shape)

(156060, 6)
(156060, 2)


In [7]:
# Stratified 75/25 split on the target, with a fixed seed for repeatability.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['clean_text'],
    df['target'],
    test_size=0.25,
    random_state=60,
    stratify=df['target'],
)

print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

(117045,) (117045,)
(39015,) (39015,)


In [8]:
def _ordered_feature_names(vectorizer):
    """Return the fitted vectorizer's terms sorted by their column index."""
    vocab = vectorizer.vocabulary_
    return sorted(vocab, key=vocab.get)


def _fit_and_report(model, train_df, test_df, ytrain, ytest):
    """Fit ``model`` on the training features and print test-set metrics."""
    model.fit(train_df, ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    # Note: this is computed from hard class predictions, not scores.
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))


def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    """Compare plain TF-IDF features against Wcbtfidf features.

    The same ``model`` object is fitted twice: first on TfidfVectorizer
    features, then on Wcbtfidf features. Precision, recall, ROC AUC and a
    classification report are printed for each run, so after this call
    ``model`` holds the fit from the Wcbtfidf run.

    Parameters
    ----------
    xtrain, xtest : text inputs accepted by the vectorizers
    ytrain, ytest : labels for the train and test inputs
    max_feat : int
        ``max_features`` passed to both vectorizers.
    model : estimator exposing ``fit`` and ``predict``

    Returns
    -------
    tuple
        ``(wcbtfidf, tfidf)`` - the two fitted vectorizer objects.
    """
    print('Running base version')
    # NOTE(review): the baseline removes scikit-learn's English stop words,
    # while Wcbtfidf is built without such an argument - confirm its own
    # stop-word handling so the comparison is like-for-like.
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_matrix = tfidf.fit_transform(xtrain)
    test_matrix = tfidf.transform(xtest)

    # vocabulary_ maps term -> column index, and iterating the dict does not
    # follow index order; label the columns by index so names match the data.
    feature_names = _ordered_feature_names(tfidf)
    train_df = pd.DataFrame(train_matrix.toarray(), columns=feature_names)
    test_df = pd.DataFrame(test_matrix.toarray(), columns=feature_names)

    _fit_and_report(model, train_df, test_df, ytrain, ytest)

    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)

    # presumably transform returns a model-ready feature table - TODO confirm
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)

    _fit_and_report(model, train_df, test_df, ytrain, ytest)
    return wcbtfidf,tfidf

# Run the comparison with a 300-term vocabulary cap and keep both fitted
# vectorizers for the vocabulary analysis below.
model = LogisticRegression()
wcbtfidf_object, tfidf_object = check_hypothesis(
    xtrain, xtest, ytrain, ytest, max_feat=300, model=model
)

Running base version
Precision is 0.7953317083509962
Recall is 0.9865260113707318
ROC curve is 0.5434027680892792
              precision    recall  f1-score   support

           0       0.68      0.10      0.17      8586
           1       0.80      0.99      0.88     30429

    accuracy                           0.79     39015
   macro avg       0.74      0.54      0.53     39015
weighted avg       0.77      0.79      0.73     39015

Running my version
Precision is 0.8031068587400559
Recall is 0.9820237273653423
ROC curve is 0.5643871257371784
              precision    recall  f1-score   support

           0       0.70      0.15      0.24      8586
           1       0.80      0.98      0.88     30429

    accuracy                           0.80     39015
   macro avg       0.75      0.56      0.56     39015
weighted avg       0.78      0.80      0.74     39015



## ANALYSIS OF IMPROVEMENT

In [None]:
'''

0 is the minority class here (negative reviews of the movies).
1 is the majority class here (positive reviews of the movies).

Let us look at the two vocabularies used by tfidf and wcbtfidf.


'''

In [10]:
# Length comparison: both vectorizers were given the same max_features cap.

wcbtfidf_vocab = wcbtfidf_object.combine_vocab
tfidf_vocab = tfidf_object.vocabulary_

print(len(wcbtfidf_vocab), len(tfidf_vocab))

300 300


In [11]:
# Words that are present in tfidf vocab but not in wcbtfidf.
# Sorted so the printed list is the same on every run; the iteration order of
# a set of strings can change between kernel restarts.

print(sorted(set(tfidf_vocab) - set(wcbtfidf_vocab)))

['act', 'message', 'thought', 'project', 'quirky', 'actor', 'recent', 'romance', 'turn', 'dead', 'stand', 'fresh', 'effort', 'coming', 'boy', 'manages', 'filmmaker', 'stuff', 'teen', 'directed', 'debut', 'feeling', 'energy', 'just', 'easy', 'storytelling', 'horror', 'likely', 'mind', 'does', 'school', 'classic', 'deep', 'filmmaking', 'sure', 'plays', 'role', 'dramatic', 'melodrama', 'laugh', 'ways', 'home', 'personal', 'suspense', 'rich', 'surprisingly', 'wo', 'leave', 'tragedy', 'eyes', 'sort', 'social', 'despite', 'version', 'level', 'especially', 'intelligence', 'offers', 'truly', 'effects', 'did', 'ideas', 'crime', 'audiences', 'death', 'enjoy', 'live', 'john', 'line', 'mr', 'tone', 'charm', 'past', 'shot', 'deeply', 'written', 'believe', 'sex', 'filmmakers', 'turns']


These words can be categorised as either neutral or leaning towards the positive end, e.g. **charm, quirky, classic, enjoy, fresh, laugh**.

In [12]:
# Words that are present in wcbtfidf but not in tfidf.
# Sorted so the printed list is the same on every run; the iteration order of
# a set of strings can change between kernel restarts.

print(sorted(set(wcbtfidf_vocab) - set(tfidf_vocab)))

['show', 'stupid', 'seems', 'fails', 'already', 'bland', 'left', 'still', 'first', 'every', 'us', 'lacks', 'around', 'thin', 'tired', 'could', 'mess', 'another', 'almost', 'worse', 'though', 'ugly', 'yet', 'dumb', 'anyone', 'cheap', 'see', 'sometimes', 'go', 'find', 'part', 'interest', 'cliches', 'nothing', 'always', 'get', 'seem', 'last', 'keep', 'give', 'least', 'also', 'often', 'back', 'take', 'contrived', 'less', 'rather', 'enough', 'whole', 'might', 'whose', 'well', 'ever', 'two', 'without', 'else', 'much', 'three', 'done', 'since', 'made', 'full', 'gags', 're', 'may', 'boring', 'many', 'something', 'none', 'becomes', 'together', 'would', 'anything', 'barely', 'pretentious', 'even', 'slow', 'never', 'one']


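Words here can be categorised as either neutral or leaning towards the negative side, e.g. **worse, lacks, dumb, barely, slow, stupid, boring, pretentious, mess, cheap, ugly, tired, bland**.
Words such as these cater more towards the negative class, i.e. the minority class.
Note that the baseline TfidfVectorizer was built with `stop_words='english'`, so some of the neutral words above (e.g. *could*, *would*, *also*) may be missing from its vocabulary because of stop-word filtering rather than class weighting.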