In [None]:
'''

Hypothesis : In imbalanced class scenarios, when we use normal tfidf or countvect majority class words takes preference.
             Since this class overpowers all other classes, the words associated with that class are majorly choosen in the
             vocab. So the words associated with other classes are not present in the vocab and that hinders the model's
             ability to classify examples belonging to other classes.
             

'''

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train.tsv',sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
df['Sentiment'].value_counts(normalize=True)

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

In [4]:
def change_target(value):
    if value in [2,3,4]:
        return 1
    
    if value in [0,1]:
        return 0
    

df['target'] = df['Sentiment'].apply(change_target)
df['target'].value_counts(normalize=True)

1    0.779924
0    0.220076
Name: target, dtype: float64

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("[^a-z0-9]"," ",text)
    text = re.sub("(\s)+"," ",text)
    return text

In [6]:
df['clean_text'] = df['Phrase'].apply(preprocess_text)

In [7]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,target,clean_text
0,1,1,A series of escapades demonstrating the adage ...,1,0,a series of escapades demonstrating the adage ...
1,2,1,A series of escapades demonstrating the adage ...,2,1,a series of escapades demonstrating the adage ...
2,3,1,A series,2,1,a series
3,4,1,A,2,1,a
4,5,1,series,2,1,series


In [8]:
print(df.shape)
df = df[['clean_text','target']]
print(df.shape)

(156060, 6)
(156060, 2)


In [9]:
class Wcbtfidf:
    
    def __init__(self,max_features):
        self.max_features = max_features
        self.combine_vocab = []
    
    def fit(self,X,y):
        
        label_dict = y.value_counts(normalize=True).to_dict()
        for key,val in label_dict.items():
            new_val = int(np.round(val,1)*self.max_features)
            label_dict[key] = new_val
        
        self.combine_vocab = self.return_total_vocab(X,y,label_dict)
    
    def transform(self,X):
        final_tfidf = TfidfVectorizer(vocabulary=self.combine_vocab,stop_words='english')
        transformed_data = final_tfidf.fit_transform(X)
        transformed_data = pd.DataFrame(transformed_data.toarray(),columns=self.combine_vocab)
        return transformed_data
    
    def return_total_vocab(self,X,y,label_dict):
        
        exclude = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
                 "you", "your", "yours", "yourself", "yourselves", "he", "him", 
                 "his", "himself", "she", "her", "hers", "herself", "it", "its", 
                 "itself", "they", "them", "their", "theirs", "themselves", "what", 
                 "which", "who", "whom", "this", "that", "these", "those", "am", "is", 
                 "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", 
                 "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", 
                 "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", 
                 "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", 
                 "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", 
                 "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
                 "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", 
                 "just", "don", "should", "now"]
        
        total_vocab = []
        for key,val in label_dict.items():
            slice_data = X[y==key]
            tfidf = TfidfVectorizer(max_features=val,stop_words=exclude)
            tfidf.fit(slice_data)
            vocab = tfidf.vocabulary_
            total_vocab.extend(vocab)
            exclude.extend(vocab)
        
        return list(set(total_vocab))
            

In [10]:
xtrain,xtest,ytrain,ytest = train_test_split(df['clean_text'],df['target'],test_size=0.25,random_state=60)

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(117045,) (117045,)
(39015,) (39015,)


In [11]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    
    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)
    
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    
    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)
    
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))

In [12]:
model = LogisticRegression()
check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running base version
Precision is 0.7959124009993089
Recall is 0.9842245374174253
ROC curve is 0.5450349515219404
              precision    recall  f1-score   support

           0       0.65      0.11      0.18      8588
           1       0.80      0.98      0.88     30427

    accuracy                           0.79     39015
   macro avg       0.73      0.55      0.53     39015
weighted avg       0.76      0.79      0.73     39015

Running my version
Precision is 0.803291030329103
Recall is 0.9818910835770861
ROC curve is 0.5650023652631588
              precision    recall  f1-score   support

           0       0.70      0.15      0.24      8588
           1       0.80      0.98      0.88     30427

    accuracy                           0.80     39015
   macro avg       0.75      0.57      0.56     39015
weighted avg       0.78      0.80      0.74     39015

