In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [15]:
# Read 'train.tsv' as a tab-separated file into the working DataFrame.
df = pd.read_csv('train.tsv',sep='\t')
# Last expression of the cell: show the first rows as the cell output.
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [16]:
# Share of rows per Sentiment label (normalize=True gives proportions, not counts).
df['Sentiment'].value_counts(normalize=True)

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

In [17]:
def change_target(value):
    """Map a sentiment label onto a binary target.

    Labels 2, 3 and 4 become 1; labels 0 and 1 become 0. Any other
    value yields None.
    """
    upper_labels = (2,3,4)
    lower_labels = (0,1)

    if value in upper_labels:
        binary_label = 1
    elif value in lower_labels:
        binary_label = 0
    else:
        # Unrecognised label: no mapping is defined for it.
        binary_label = None
    return binary_label
    

# Add a 'target' column by applying change_target to every Sentiment value.
df['target'] = df['Sentiment'].apply(change_target)
# Show the resulting class proportions as the cell output.
df['target'].value_counts(normalize=True)

1    0.779924
0    0.220076
Name: target, dtype: float64

In [18]:
def preprocess_text(text):
    """Normalise a phrase for vectorisation.

    The text is lowercased, every character other than ``a-z`` and ``0-9``
    is replaced by a space, and each run of whitespace is collapsed to a
    single space. Leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        Raw phrase.

    Returns
    -------
    str
        The cleaned phrase.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9]"," ",text)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r"(\s)+"," ",text)
    return text

In [19]:
# Store the preprocess_text output for every Phrase in a new 'clean_text' column.
df['clean_text'] = df['Phrase'].apply(preprocess_text)

In [20]:
# Preview the first rows of the frame as the cell output.
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,target,clean_text
0,1,1,A series of escapades demonstrating the adage ...,1,0,a series of escapades demonstrating the adage ...
1,2,1,A series of escapades demonstrating the adage ...,2,1,a series of escapades demonstrating the adage ...
2,3,1,A series,2,1,a series
3,4,1,A,2,1,a
4,5,1,series,2,1,series


In [21]:
# Print the shape before and after keeping only the 'clean_text' and 'target' columns.
print(df.shape)
# NOTE: this rebinds df, so the other columns are no longer available below.
df = df[['clean_text','target']]
print(df.shape)

(156060, 6)
(156060, 2)


In [22]:
# Hold out 25% of the rows for testing, stratified on 'target', with a fixed random_state.
xtrain,xtest,ytrain,ytest = train_test_split(df['clean_text'],df['target'],test_size=0.25,random_state=60,stratify=df['target'])

# Print the shapes of the resulting train and test pieces.
print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(117045,) (117045,)
(39015,) (39015,)


In [23]:
def _fit_and_report(model,train_df,test_df,ytrain,ytest):
    """Fit ``model`` on the training features and print test-set metrics."""
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    # NOTE: this score is computed from the hard class predictions returned
    # by predict(), not from probabilities or decision scores.
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))


def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    """Compare a TfidfVectorizer baseline against Wcbtfidf with the same model.

    Both vectorizers are fitted on ``xtrain`` with ``max_feat`` features.
    For each one, ``model`` is fitted on the training features and the
    precision, recall, ROC AUC and classification report on the test set
    are printed.

    Parameters
    ----------
    xtrain, xtest
        Training and test texts.
    ytrain, ytest
        Training and test labels.
    max_feat : int
        Value passed as ``max_features`` to both vectorizers.
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted twice, so
        on return it holds the fit from the Wcbtfidf features.

    Returns
    -------
    Wcbtfidf
        The fitted Wcbtfidf instance.
    """
    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_matrix = tfidf.fit_transform(xtrain).toarray()
    # vocabulary_ maps term -> column index and its iteration order is not
    # the column order, so order the terms by index to label columns correctly.
    feature_names = sorted(tfidf.vocabulary_,key=tfidf.vocabulary_.get)
    train_df = pd.DataFrame(train_matrix,columns=feature_names)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=feature_names)

    _fit_and_report(model,train_df,test_df,ytrain,ytest)

    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)

    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)

    _fit_and_report(model,train_df,test_df,ytrain,ytest)
    return wcbtfidf

In [24]:
# Logistic regression with default settings, shared by both runs inside check_hypothesis.
model = LogisticRegression()
# Run the comparison with max_feat=300 and keep the returned Wcbtfidf object for inspection.
wcbtfidf_object = check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running base version
Precision is 0.7953317083509962
Recall is 0.9865260113707318
ROC curve is 0.5434027680892792
              precision    recall  f1-score   support

           0       0.68      0.10      0.17      8586
           1       0.80      0.99      0.88     30429

    accuracy                           0.79     39015
   macro avg       0.74      0.54      0.53     39015
weighted avg       0.77      0.79      0.73     39015

Running my version
Precision is 0.8042208403994724
Recall is 0.9818265470439383
ROC curve is 0.567374955329563
              precision    recall  f1-score   support

           0       0.70      0.15      0.25      8586
           1       0.80      0.98      0.88     30429

    accuracy                           0.80     39015
   macro avg       0.75      0.57      0.57     39015
weighted avg       0.78      0.80      0.74     39015



In [25]:
# The vocabulary used by the fitted Wcbtfidf object is exposed through its combine_vocab attribute.

print(len(wcbtfidf_object.combine_vocab)) # Size of the vocabulary used
print(wcbtfidf_object.combine_vocab[:10]) # First ten entries of the vocabulary

300
['might', 'come', 'away', 'music', 'll', 'entertaining', 'much', 'quite', 'hour', 'half']


In [27]:
# The class_wise_vocab attribute holds one entry per key; print each key with its value.
for key,value in wcbtfidf_object.class_wise_vocab.items():
    print(f"{key} : {value}")
    # Separator line between entries.
    print('-'*100)

1 : ['might', 'come', 'away', 'music', 'll', 'entertaining', 'much', 'quite', 'hour', 'half', 'lot', 'lrb', 'rrb', 'back', 'love', 'power', 'almost', 'bad', 'one', 'performance', 'real', 'funny', 'action', 'film', 'seen', 'times', 'show', 'stories', 'like', 'middle', 'war', 'nothing', 'camera', 'moving', 'think', 'screen', 're', 'director', 'set', 'art', 'many', 'movie', 'right', 'first', 'performances', 'go', 'even', 'watching', 'place', 'women', 'spirit', 'romantic', 'comedy', 'genre', 'experience', 'every', 'cinematic', 'well', 'sense', 'works', 'part', 'human', 'world', 'special', 'see', 'true', 'fans', 'piece', 'dark', 'style', 'find', 'characters', 'may', 'looking', 'something', 'make', 'small', 'far', 'humor', 'bit', 'american', 'flick', 've', 'ever', 'entertainment', 'fascinating', 'without', 'amusing', 'documentary', 'could', 'long', 'way', 'black', 'kids', 'watch', 'though', 'drama', 'fun', 'emotional', 'engaging', 'two', 'men', 'story', 'family', 'little', 'compelling', 'thi