In [21]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb

In [2]:
#stopword_list = set(stopwords.words('english'))

stopword_list = set(["i", "me", "my", "myself","we", "our", "ours", "ourselves","you", "your", "yours",
               "their", "they", "his", "her","she", "he", "a", "an", "and","is", "was", "are", "were", 
               "him", "himself", "has", "have", "it", "its", "the", "us"])

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def remove_noise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    return text

In [3]:
#Load the Train, Test, Test_y data
x_data = pd.read_csv("../data/alldata.tsv",sep="\t")
x_train = pd.read_csv("../data/split_0/train.csv")
x_test = pd.read_csv("../data/split_0/test.csv")
test_y = pd.read_csv("../data/split_0/test_y.csv")

In [None]:
#Remove the Noise
#x_data["review"] = x_data["review"].apply(remove_noise_text)

In [4]:
#Remove stopwords
x_data["review"] = x_data["review"].apply(lambda x: " ".join(x for x in x.split() if x.lower() not in stopword_list))

In [5]:
df = (pd.Series(' '.join(x_data['review']).split())).value_counts().reset_index()
df = df.rename(columns={'index':'word',0:'word_count'})

In [6]:
#Remove less frequent words
least_frequent_words = set(df[df['word_count'] < 11]['word'].str.lower())
x_data["review"] = x_data["review"].apply(lambda x: " ".join(x for x in x.split() if x.lower() not in least_frequent_words))


In [12]:
y_train = x_train['sentiment']
y_test = test_y['sentiment']

In [9]:
#cv=CountVectorizer(min_df=0.001,max_df=0.5,ngram_range=(1,3))
cv=CountVectorizer(ngram_range=(1,3))
X_data_sparse = cv.fit_transform(x_data['review'])

X_train_sparse = cv.transform(x_train['review'])
X_test_sparse = cv.transform(x_test['review'])

print (len(cv.vocabulary_))
print (X_test_sparse.shape)
print (X_train_sparse.shape)

3251992
(25000, 3251992)
(25000, 3251992)


In [10]:
#Logistic Regression
logit = LogisticRegression(random_state=17,n_jobs=-1, solver='lbfgs')
logit.fit(X_train_sparse, y_train)
print ("AUC: {}".format(round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))

AUC: 0.89


In [None]:
df = (pd.Series(' '.join(x_data['review']).split())).value_counts().reset_index()
df = df.rename(columns={'index':'word',0:'word_count'})

In [None]:
least_frequent_words = set(df[df['word_count'] < 11]['word'].str.lower())
x_data["review"] = x_data["review"].apply(lambda x: " ".join(x for x in x.split() if x.lower() not in least_frequent_words))

In [7]:
df1 = (pd.Series(' '.join(x_data['review']).split())).value_counts().reset_index()
df1 = df1.rename(columns={'index':'word',0:'word_count'})

In [None]:
df1

In [None]:
#Logistic Regression
logit = LogisticRegression(C=2.0,
                           solver='lbfgs')
logit.fit(X_train_sparse, y_train)
print ("AUC: {}".format(round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))

In [19]:
#Logistic Regression
for i in [0.001,0.002,0.003,0.004,0.006,0.009,0.01,0.02,0.04,0.05,0.07,0.09,0.1,0.2,0.3,0.4,0.5,0.6]:
    logit = LogisticRegression(penalty='l2',
                               C=i,
                               random_state=17,
                               n_jobs=-1,
                               solver='lbfgs')
    logit.fit(X_train_sparse, y_train)
    print ("Sigma:{} AUC: {}".format(i,round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))

Sigma:0.001 AUC: 0.857
Sigma:0.002 AUC: 0.869
Sigma:0.003 AUC: 0.876
Sigma:0.004 AUC: 0.88
Sigma:0.006 AUC: 0.885
Sigma:0.009 AUC: 0.888
Sigma:0.01 AUC: 0.889
Sigma:0.02 AUC: 0.89
Sigma:0.04 AUC: 0.893
Sigma:0.05 AUC: 0.893
Sigma:0.07 AUC: 0.893
Sigma:0.09 AUC: 0.894
Sigma:0.1 AUC: 0.894
Sigma:0.2 AUC: 0.893
Sigma:0.3 AUC: 0.892
Sigma:0.4 AUC: 0.891
Sigma:0.5 AUC: 0.892
Sigma:0.6 AUC: 0.893


In [20]:
#Logistic Regression
for i in [0.7,0.8,0.9,1.0,1.1,1.3,1.5,1.7,1.9,2.0,2.3,2.5,2.7]:
    logit = LogisticRegression(penalty='l2',
                               C=i,
                               random_state=17,
                               n_jobs=-1,
                               solver='lbfgs')
    logit.fit(X_train_sparse, y_train)
    print ("Sigma:{} AUC: {}".format(i,round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))

Sigma:0.7 AUC: 0.891
Sigma:0.8 AUC: 0.892
Sigma:0.9 AUC: 0.892
Sigma:1.0 AUC: 0.89
Sigma:1.1 AUC: 0.891
Sigma:1.3 AUC: 0.891
Sigma:1.5 AUC: 0.891
Sigma:1.7 AUC: 0.89
Sigma:1.9 AUC: 0.89
Sigma:2.0 AUC: 0.889
Sigma:2.3 AUC: 0.89
Sigma:2.5 AUC: 0.89
Sigma:2.7 AUC: 0.891


In [22]:
#Xgboost - Classifier
xg_reg = xgb.XGBClassifier(objective ='reg:squarederror', 
                           colsample_bytree = 0.1, 
                           learning_rate = 0.04,
                           max_depth = 25, 
                           alpha = 1, 
                           n_estimators = 1000)

xg_reg.fit(X_train_sparse,y_train)
print ("Sigma:{} AUC: {}".format(i,round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))

Sigma:2.7 AUC: 0.891


In [None]:
#Logistic Regression
for i in [0.09]:
    logit = LogisticRegression(penalty='l2',
                               C=i,
                               random_state=17,
                               n_jobs=-1,
                               max_iter=1000,
                               solver='lbfgs')
    logit.fit(X_train_sparse, y_train)
    print ("Sigma:{} AUC: {}".format(i,round(roc_auc_score(y_test, logit.predict(X_test_sparse)), 3)))