In [1]:
'''

Hypothesis : In imbalanced class scenarios, when we use normal tfidf or countvect majority class words takes preference.
             Since this class overpowers all other classes, the words associated with that class are majorly choosen in the
             vocab. So the words associated with other classes are not present in the vocab and that hinders the model's
             ability to classify examples belonging to other classes.
             

'''

"\n\nHypothesis : In imbalanced class scenarios, when we use normal tfidf or countvect majority class words takes preference.\n             Since this class overpowers all other classes, the words associated with that class are majorly choosen in the\n             vocab. So the words associated with other classes are not present in the vocab and that hinders the model's\n             ability to classify examples belonging to other classes.\n             \n\n"

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
df.shape

(159571, 8)

In [4]:
df['sum'] = df['toxic'].astype(int) + df['severe_toxic'].astype(int) + df['obscene'].astype(int) + df['threat'].astype(int) \
            + df['insult'].astype(int) + df['identity_hate'].astype(int)
    
df['toxic_or_not'] = np.where(df['sum'] > 0,1,0)
df['toxic_or_not'].value_counts(normalize=True)

0    0.898321
1    0.101679
Name: toxic_or_not, dtype: float64

In [5]:
df['sum'].value_counts()

0    143346
1      6360
3      4209
2      3480
4      1760
5       385
6        31
Name: sum, dtype: int64

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("[^a-z0-9]"," ",text)
    text = re.sub("(\s)+"," ",text)
    return text

In [7]:
df['clean_text'] = df['comment_text'].apply(preprocess_text)

In [8]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum,toxic_or_not,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0,d aww he matches this background colour i m se...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0,hey man i m really not trying to edit war it s...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0,more i can t make any real suggestions on imp...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0,you sir are my hero any chance you remember wh...


In [9]:
print(df.shape)
df = df[['clean_text','toxic_or_not']]
print(df.shape)

(159571, 11)
(159571, 2)


In [10]:
class Wcbtfidf:
    
    def __init__(self,max_features):
        self.max_features = max_features
        self.combine_vocab = []
        self.final_tfidf = None
    
    def fit(self,X,y):
        
        label_dict = y.value_counts(normalize=True).to_dict()
        for key,val in label_dict.items():
            new_val = int(np.round(val,1)*self.max_features)
            label_dict[key] = new_val
        
        self.combine_vocab = self.return_total_vocab(X,y,label_dict)
        self.final_tfidf = TfidfVectorizer(vocabulary=self.combine_vocab,stop_words='english')
        self.final_tfidf.fit(X)
    
    def transform(self,X):
        transformed_data = self.final_tfidf.transform(X)
        transformed_data = pd.DataFrame(transformed_data.toarray(),columns=self.combine_vocab)
        return transformed_data
    
    def return_total_vocab(self,X,y,label_dict):
        
        exclude = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
                 "you", "your", "yours", "yourself", "yourselves", "he", "him", 
                 "his", "himself", "she", "her", "hers", "herself", "it", "its", 
                 "itself", "they", "them", "their", "theirs", "themselves", "what", 
                 "which", "who", "whom", "this", "that", "these", "those", "am", "is", 
                 "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", 
                 "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", 
                 "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", 
                 "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", 
                 "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", 
                 "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
                 "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", 
                 "just", "don", "should", "now"]
        
        total_vocab = []
        for key,val in label_dict.items():
            slice_data = X[y==key]
            tfidf = TfidfVectorizer(max_features=val,stop_words=exclude)
            tfidf.fit(slice_data)
            vocab = tfidf.vocabulary_
            total_vocab.extend(vocab)
            exclude.extend(vocab)
        
        return total_vocab
            

In [11]:
xtrain,xtest,ytrain,ytest = train_test_split(df['clean_text'],df['toxic_or_not'],test_size=0.25,random_state=60,stratify=df['toxic_or_not'])

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(119678,) (119678,)
(39893,) (39893,)


In [12]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    
    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)
    
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    
    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)    
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))

In [13]:
model = LogisticRegression()
check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running base version
Precision is 0.927710843373494
Recall is 0.36070019723865876
ROC curve is 0.6787595636973214
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     35837
           1       0.93      0.36      0.52      4056

    accuracy                           0.93     39893
   macro avg       0.93      0.68      0.74     39893
weighted avg       0.93      0.93      0.92     39893

Running my version
Precision is 0.918452935694315
Recall is 0.4859467455621302
ROC curve is 0.7405317621551757
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     35837
           1       0.92      0.49      0.64      4056

    accuracy                           0.94     39893
   macro avg       0.93      0.74      0.80     39893
weighted avg       0.94      0.94      0.94     39893

