In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

import os
import sys

# Put the parent directory on the import path (once) so that a package living
# one level above this notebook can be imported by the cells below.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
from wcbtfidf import Wcbtfidf

In [2]:
def _load_labelled_sample(csv_path, n_rows, label):
    """Read one sentiment CSV, sample `n_rows` texts from it and tag them with `label`.

    The frame is transposed so that the CSV's column labels become rows; those
    labels are then exposed as a `text` column. Sampling uses a fixed
    `random_state` of 60 so the same rows are drawn on every run.
    """
    texts = pd.read_csv(csv_path).T
    texts = texts.reset_index(level=0).rename(columns={'index':'text'})
    sampled = texts.sample(n=n_rows,random_state=60)
    sampled['target'] = label
    return sampled


# Label encoding: 2 = positive, 0 = negative, 1 = neutral.
# Sample sizes (800 / 400 / 200) make the classes deliberately imbalanced.
positive = _load_labelled_sample('processedPositive.csv', 800, 2)
negative = _load_labelled_sample('processedNegative.csv', 400, 0)
neutral = _load_labelled_sample('processedNeutral.csv', 200, 1)

print(positive.shape,negative.shape,neutral.shape)

(800, 2) (400, 2) (200, 2)


In [3]:
# Stack the three labelled samples row-wise. Row labels from each source frame
# are kept as-is (no ignore_index), so index values may repeat across classes.
df = pd.concat([positive, negative, neutral], axis=0)
df.shape

(1400, 2)

In [4]:
# Peek at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,text,target
431,isn't it terrible that we live in a world wher...,2
414,DRAGON MORE! OMG :D,2
203,I think I'm probably beyond repair by the soun...,2
1022,you change your whole life happy,2
756,TICKETS AVAILABLE NOW for The 401 Festival of ...,2


In [5]:
# Share of each class label, largest first.
df['target'].value_counts(normalize=True, sort=True, ascending=False)

2    0.571429
0    0.285714
1    0.142857
Name: target, dtype: float64

In [6]:
def preprocess_text(text):
    """Lowercase `text` and reduce it to single-space-separated runs of [a-z0-9].

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Lowercased text in which every character outside ``a-z`` / ``0-9`` has
        been replaced by a space and consecutive whitespace collapsed to one
        space. Leading/trailing spaces are NOT stripped, so text that starts or
        ends with a removed character keeps a single space there.
    """
    text = text.lower()
    # Punctuation, symbols and any non-ASCII characters all become spaces.
    text = re.sub(r"[^a-z0-9]", " ", text)
    # Raw string on purpose: "\s" inside a plain literal is an invalid escape
    # sequence (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
    text = re.sub(r"\s+", " ", text)
    return text

In [7]:
# Normalise every raw text element-wise and keep the result next to the original.
df['clean_text'] = df['text'].map(preprocess_text)

In [8]:
# Hold out 25% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['clean_text'],
    df['target'],
    test_size=0.25,
    random_state=60,
    stratify=df['target'],
)

print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(1050,) (1050,)
(350,) (350,)


In [9]:
def check_hypothesis(xtrain,xtest,ytrain,ytest,max_feat,model):
    """Compare a plain TF-IDF baseline against Wcbtfidf features with one classifier.

    `model` is fitted twice: first on `TfidfVectorizer` features, then on
    `Wcbtfidf` features. After each fit a `classification_report` for the test
    split is printed. The last `fit` call on `model` is the Wcbtfidf one.

    Parameters
    ----------
    xtrain, xtest : iterable of str
        Training / test documents.
    ytrain, ytest : array-like
        Labels matching `xtrain` / `xtest`.
    max_feat : int
        Vocabulary cap passed to both vectorizers as `max_features`.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    Wcbtfidf
        The vectorizer instance after `fit(xtrain, ytrain)`.
    """

    print('Running base version')
    tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')
    train_matrix = tfidf.fit_transform(xtrain).toarray()
    # `vocabulary_` maps term -> column index, but iterating the dict yields
    # terms in insertion order, which is not the matrix column order. Order the
    # terms by their stored index so each column gets its real feature name.
    feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
    train_df = pd.DataFrame(train_matrix,columns=feature_names)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=feature_names)

    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(classification_report(ytest,preds))

    print('Running my version')
    # NOTE(review): the baseline removes English stop words; whether Wcbtfidf
    # does the same is not visible here - confirm the two runs are comparable.
    wcbtfidf = Wcbtfidf(max_features=max_feat)
    wcbtfidf.fit(xtrain,ytrain)

    train_df = wcbtfidf.transform(xtrain)
    test_df = wcbtfidf.transform(xtest)

    # Same estimator object is refitted; this relies on `fit` discarding the
    # previous fit (true for scikit-learn estimators without warm_start).
    model.fit(train_df,ytrain)
    preds = model.predict(test_df)
    print(classification_report(ytest,preds))
    return wcbtfidf

In [10]:
# Logistic regression with default settings; 300 is the vocabulary cap that
# check_hypothesis forwards to both vectorizers.
model = LogisticRegression()
wcbtfidf_object = check_hypothesis(
    xtrain, xtest, ytrain, ytest,
    max_feat=300,
    model=model,
)

Running base version
              precision    recall  f1-score   support

           0       0.98      0.79      0.87       100
           1       0.82      0.28      0.42        50
           2       0.77      0.97      0.86       200

    accuracy                           0.82       350
   macro avg       0.86      0.68      0.72       350
weighted avg       0.84      0.82      0.80       350

Running my version
              precision    recall  f1-score   support

           0       0.95      0.79      0.86       100
           1       0.95      0.36      0.52        50
           2       0.79      0.97      0.87       200

    accuracy                           0.83       350
   macro avg       0.90      0.71      0.75       350
weighted avg       0.86      0.83      0.82       350

