In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [2]:
df = pd.read_csv('sentiment140_data.csv',names=('target','id','date','flag','username','tweet'))
df.shape

(1600000, 6)

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Checking unique ids
print(df['id'].nunique(),df.shape[0])

1598315 1600000


In [5]:
# Removing duplicate ids
df.drop_duplicates(subset=['id'],keep='first',inplace=True)
print(df['id'].nunique(),df.shape[0])

1598315 1598315


In [6]:
# Target data distribution
df['target'].value_counts(normalize=True)

0    0.500527
4    0.499473
Name: target, dtype: float64

In [7]:
# To test our hypothesis let us convert into an imbalance problem with fewer positive samples
# We will take a total of 5 lakh data points with 4.5 lakh belonging to class 4 and 50k  to class 0

negative_samples = df[df['target'] == 0].sample(n=50000,random_state=60)
positive_samples = df[df['target'] == 4].sample(n=450000,random_state=60)

final_df = pd.concat([negative_samples,positive_samples]).sample(frac=1) # A sample operation with full data is 
                                                                         # performed to shuffle the data points

final_df['target'] = final_df['target'].map({0:0,4:1})
final_df['target'].value_counts(normalize=True)

1    0.9
0    0.1
Name: target, dtype: float64

In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("[^a-z0-9]"," ",text)
    text = re.sub("(\s)+"," ",text)
    return text

In [9]:
final_df['clean_text'] = final_df['tweet'].apply(preprocess_text)

In [10]:
print(final_df.shape)
final_df = final_df[['clean_text','target']]
print(final_df.shape)

(500000, 7)
(500000, 2)


In [11]:
xtrain,xtest,ytrain,ytest = train_test_split(final_df['clean_text'],final_df['target'],test_size=0.25,random_state=60,stratify=final_df['target'])

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(375000,) (375000,)
(125000,) (125000,)


In [12]:
# Distribution check in train and test

print(ytrain.value_counts(normalize=True))
print(ytest.value_counts(normalize=True))

1    0.9
0    0.1
Name: target, dtype: float64
1    0.9
0    0.1
Name: target, dtype: float64


In [13]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    
    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)
    
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    
    print('Running my version')
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)
    
    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)
    
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(f'Precision is {precision_score(ytest,preds)}')
    print(f'Recall is {recall_score(ytest,preds)}')
    print(f'ROC curve is {roc_auc_score(ytest,preds)}')
    print(classification_report(ytest,preds))
    return wcbtfidf,tfidf

In [14]:
model = LogisticRegression()
wcbtfidf_object,tfidf_object = check_hypothesis(xtrain,xtest,ytrain,ytest,300,model)

Running base version
Precision is 0.9060150680164761
Recall is 0.9951911111111111
ROC curve is 0.5330355555555556
              precision    recall  f1-score   support

           0       0.62      0.07      0.13     12500
           1       0.91      1.00      0.95    112500

    accuracy                           0.90    125000
   macro avg       0.76      0.53      0.54    125000
weighted avg       0.88      0.90      0.87    125000

Running my version
Precision is 0.9104624814567269
Recall is 0.9928977777777778
ROC curve is 0.5570488888888889
              precision    recall  f1-score   support

           0       0.65      0.12      0.20     12500
           1       0.91      0.99      0.95    112500

    accuracy                           0.91    125000
   macro avg       0.78      0.56      0.58    125000
weighted avg       0.88      0.91      0.88    125000



## ANALYSIS

In [15]:
# Length Comparison

tfidf_vocab = tfidf_object.vocabulary_
wcbtfidf_vocab = wcbtfidf_object.combine_vocab

print(len(wcbtfidf_vocab),len(tfidf_vocab))

300 300


In [16]:
# Words that are present in tfidf vocab but not in wcbtfidf

print(list(set(tfidf_vocab) - set(wcbtfidf_vocab)))

['girls', 'just', 'did', 'tho', 'fm', 'favorite', 'super', 'stay', 'idea', 'leave', 'cause', 'fine', 'ill', 'breakfast', 'hour', 'run', 'kids', 'lots', 'plurk', 'kind', 'does', 'mileycyrus', 'don', 'busy', 'using', 'watched', 'remember', 'totally', 'believe', 'stop', 'crazy', 'end', '100', 'ah', 'quite', 'seen', 'rest', 'guy', 'trip', 'hopefully', 'taking', 'reading', 'outside', 'fan', 'buy', 'beach', 'loved', 'room', 'tv', 'doing', 'weeks', 'pics', 'true', 'meet', 'talking', 'woke', 'send', 'wonderful', 'says', 'saturday', 'rock', 'years', 'probably', 'till', 'having', 'enjoying', 'lmao', 'came']


In [17]:
# Words that are present in wcbtfidf but not in tfidf

print(list(set(wcbtfidf_vocab) - set(tfidf_vocab)))

['call', 'missing', 'already', 'even', 'since', 'least', 'shit', 'around', 'find', 'still', 'one', 'didnt', 'take', 'sick', 'cant', 'headache', 'next', 'get', 'ever', 'everyone', 'would', 'much', 'almost', 'though', 'may', 'go', 'see', 'another', 'us', 'made', 'first', 'two', 'could', 'many', 'yet', 'sucks', 'everything', 'nothing', 'well', 'gone', 'someone', 'name', 'keep', 'show', 'never', 'give', 'back', 'something', 'always', 'please', 'poor', 'found', 'cold', 'iphone', 'stupid', 'lost', 'done', 'ugh', 'wanted', 'might', 'anymore', 're', 'hurts', 'last', 'also', 'put', 'anything', 'must']
