<a href="https://colab.research.google.com/github/PrasadRaoJammuna/test/blob/master/AMZ_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_curve, auc,accuracy_score,roc_auc_score
from sklearn.metrics import confusion_matrix

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/

from nltk.corpus import stopwords
from gensim.models import Word2Vec

from tqdm import tqdm
from bs4 import BeautifulSoup
from prettytable import PrettyTable


In [17]:
# Load the raw reviews table from 'Reviews.csv' in the working directory and
# preview its first row.
# NOTE(review): the saved output of this cell is a ParserError, i.e. the file
# could not be parsed in the recorded run — confirm the CSV is complete and
# well-formed before relying on any cell below.
amz = pd.read_csv('Reviews.csv')
amz.head(1)

ParserError: ignored

In [0]:
amz.shape

In [0]:
# Keep only the reviews whose Score differs from the neutral value 3.
is_not_neutral = amz['Score'].ne(3)
amz = amz.loc[is_not_neutral]
amz.shape

In [0]:
def score_rate(score):
    """Map a rating to a binary label: 1 when ``score`` is above 3, else 0."""
    return 1 if score > 3 else 0
  

In [0]:
# Smoke-test the labelling helper on a single rating.
# Positional .iloc is used because rows were filtered above without resetting
# the index, so an integer *label* such as 1 is not guaranteed to exist.
score_rate(amz['Score'].iloc[1])

In [0]:
# Replace each rating with the binary label produced by score_rate
# (1 if the rating is above 3, otherwise 0), then preview the first five labels.
# NOTE(review): this overwrites amz['Score'] in place, so the cell is not
# idempotent — running it a second time feeds the 0/1 labels back through
# score_rate and every row becomes 0. Re-run from the load cell instead.
amz['Score'] = amz['Score'].apply(score_rate)
amz['Score'].head(5)

In [0]:
amz.Score.value_counts(normalize=True)

In [0]:
# Deduplication of entries: rows that agree on all four key columns are
# collapsed to their first occurrence.
dedup_keys = ["UserId", "ProfileName", "Time", "Text"]
amz = amz.drop_duplicates(subset=dedup_keys, keep='first')
amz.shape

In [0]:
# Keep rows where HelpfulnessNumerator does not exceed HelpfulnessDenominator.
numerator_within_denominator = amz['HelpfulnessNumerator'].le(amz['HelpfulnessDenominator'])
amz = amz.loc[numerator_within_denominator]
amz.shape

### Preprocessing Review Text
Now that we have finished deduplication, our data requires some preprocessing before we go on with further analysis and building the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

Remove any punctuations or limited set of special characters like , or . or # etc.
Begin by removing the html tags
Check if the word is made up of english letters and is not alpha-numeric
Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
Convert the word to lowercase
Remove Stopwords
Finally, Snowball stemming the word (it was observed to be better than Porter stemming)
After which we collect the words used to describe positive and negative reviews

In [0]:
# Print two sample reviews (positions 0 and 1000), each followed by a divider.
review_texts = amz['Text'].values
sent_0, sent_1000 = review_texts[0], review_texts[1000]
for sample_text in (sent_0, sent_1000):
    print(sample_text)
    print("="*50)

In [0]:
# https://stackoverflow.com/a/47091490/4084039

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The two irregular forms are rewritten first; the generic suffix rules then
    run in the listed order, each one on the output of the previous rule.
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [0]:
# Expand contractions in the second sample review and print it with a divider.
sent_1000 = decontracted(sent_1000)
print(sent_1000, "="*50, sep="\n")

In [0]:
# Stop-word list adapted from https://gist.github.com/sebleier/554280
# 'no', 'nor' and 'not' are deliberately absent from the custom list below.
# <br /><br /> ==> after the earlier cleaning steps these become "br br",
# so 'br' is included in the stop-word list.
# (Tags written as <br/> instead of <br /> would already have been removed in
# the 1st step.)

import nltk
# Imported under an alias because the name `stopwords` is rebound to the custom
# set below; without the alias, re-running this cell would call .words() on a
# set and raise AttributeError.
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords')
stop_words = set(nltk_stopwords.words("english"))

# Custom stop-word set used by the review-cleaning loop.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [0]:
# Positive class: draw 5000 rows (seed 100) from the first 50000 rows labelled 1.
positive_pool = amz.loc[amz['Score'].eq(1)].head(50000)
df_pos = positive_pool.sample(n=5000, random_state=100)
df_pos.shape

In [0]:
# Negative class: draw 5575 rows (seed 1000) from the last 10000 rows labelled 0.
negative_pool = amz.loc[amz['Score'].eq(0)].tail(10000)
df_neg = negative_pool.sample(n=5575, random_state=1000)
df_neg.shape

In [0]:
# Stack the negative sample on top of the positive sample (row labels are kept).
sampled_frames = [df_neg, df_pos]
df = pd.concat(sampled_frames)
df.shape

In [0]:
df.Score.value_counts(normalize=True)

In [0]:
df.head(2)

In [0]:
# Combining all the above steps into one cleaning pass over the sampled reviews.

def clean_review(text):
    """Return the cleaned, lower-cased, stop-word-free form of one review string."""
    text = re.sub(r"http\S+", " ", text)              # drop URLs
    text = BeautifulSoup(text, 'lxml').get_text()     # strip HTML markup
    text = decontracted(text)                         # expand contractions
    # Raw string: "\S" and "\d" are invalid escapes in a normal string literal.
    text = re.sub(r"\S*\d\S*", "", text).strip()      # drop tokens containing a digit
    text = re.sub('[^A-Za-z]+', ' ', text)            # keep ASCII letters only
    # https://gist.github.com/sebleier/554280
    # Only letters and spaces remain here, so lower-casing the whole string once
    # is equivalent to lower-casing each token.
    return ' '.join(word for word in text.lower().split() if word not in stopwords)


# tqdm is for printing the status bar
preprocessed_reviews = [clean_review(review) for review in tqdm(df['Text'].values)]

In [0]:
len(preprocessed_reviews)

In [0]:
preprocessed_reviews[89]

# Applying Logistic Regression

## [1.1] Logistic Regression on BOW, SET 1

### [1.1.1] Applying Logistic Regression with L1 regularization on BOW, SET 1

In [0]:
# Bag-of-words counts. The sparse result is expanded to a dense array because
# the scaling cell further down centres the features, which needs dense input.
bow_vect = CountVectorizer()
bow_words = bow_vect.fit_transform(preprocessed_reviews).toarray()

print("shape of bow",bow_words.shape)
print()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older releases.
if hasattr(bow_vect, "get_feature_names_out"):
    feature_names = bow_vect.get_feature_names_out().tolist()
else:
    feature_names = bow_vect.get_feature_names()
print("some feature names:",feature_names[:10])

In [0]:
# Feature matrix and label vector used by the models below; print a short summary.
x, y = bow_words, np.array(df['Score'])
for summary in (type(x), x.shape, y.shape):
    print(summary)

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
# Standardise every bag-of-words column and overwrite x with the scaled matrix.
# NOTE(review): the scaler is fit on ALL rows here, before train_test_split is
# called in a later cell, so rows that end up in the test set contribute to the
# scaling statistics. Fitting on the training split only and calling
# transform() on the test split would avoid that leak.
# NOTE(review): x is overwritten in place, so re-running this cell rescales
# already-scaled data.
scalar  = StandardScaler()
x = scalar.fit_transform(x)

In [0]:
type(x)

In [0]:
# Hold out 30% of the rows as the test set; random_state fixes the split.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=75)

#### simple Cross validation

In [0]:

# Simple hold-out validation: carve a validation fold out of the training data
# and score an L1-regularised model for each candidate C.
x_tr,x_cv,y_tr,y_cv = train_test_split(x_train,y_train,test_size=0.2,random_state=786)

auc_scores_cv = []   # (validation AUC, C) pairs
auc_scores_tr = []   # (training AUC, C) pairs

# Candidate values for C, scikit-learn's inverse regularisation strength.
lam = [0.0001,0.001,0.01,0.1,1.0,10,100,1000]

for i in lam:
    # liblinear is named explicitly: it supports the L1 penalty, whereas the
    # default solver of scikit-learn >= 0.22 (lbfgs) rejects it.
    lr_bow = LogisticRegression(C=i, penalty='l1', solver='liblinear')
    lr_bow.fit(x_tr, y_tr)

    # AUC is a ranking metric, so score the positive-class probabilities rather
    # than the thresholded 0/1 predictions.
    auc_score_cv = roc_auc_score(y_cv, lr_bow.predict_proba(x_cv)[:, 1])
    auc_score_tr = roc_auc_score(y_tr, lr_bow.predict_proba(x_tr)[:, 1])

    auc_scores_cv.append((auc_score_cv, i))
    auc_scores_tr.append((auc_score_tr, i))

    print("\nCV AUC for C={} is {:.2f}".format(i, auc_score_cv))

print(10*'===')
# max() over (AUC, C) tuples picks the highest AUC, breaking ties by larger C.
print("Best AUC:", max(auc_scores_cv))

In [0]:
# Unpack the (AUC, C) pairs collected by the validation loop into parallel
# sequences for plotting.
cv_auc, cv_c = zip(*auc_scores_cv)
tr_auc, tr_c = zip(*auc_scores_tr)

plt.figure(figsize=(8,5))
plt.style.use('fivethirtyeight')

plt.plot(cv_c, cv_auc, marker='o', label='Validation')
plt.plot(tr_c, tr_auc, marker='o', label='Train')
# The candidates span several orders of magnitude; on a linear axis all but the
# largest values collapse next to zero.
plt.xscale('log')
plt.xlabel("C (inverse regularization strength)")
plt.ylabel('AUC scores')
plt.legend(loc=0)
plt.show()

# The second tuple element is the C that produced the best validation AUC.
print("Best C (validation):", max(auc_scores_cv)[1])

print("Best Lambda-CrossValidation:",max(auc_scores_cv)[1])
#print("Best Lambda-Tr data:",max(auc_scores_tr)[1])

#### GridSearchCV with BOW

In [0]:
# Candidate C values handed to the grid search for the L1 model.
lam = [0.1, 1.0, 10, 100, 1000]
param_grid = {'C': lam}
print(param_grid)

In [0]:
# 3-fold grid search over C for the L1-penalised model, selected by ROC AUC.
# The estimator is constructed here rather than reusing whatever `lr_bow` was
# last bound to in the manual validation loop, so this cell no longer depends
# on that leftover state. liblinear is named because it supports the L1
# penalty; the default solver of scikit-learn >= 0.22 does not.
grid_lr_bow = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
)
grid_lr_bow.fit(x_train, y_train)

In [0]:
# Report the winning estimator, its parameters and its mean cross-validated score.
for caption, value in (
    ("Best Estimators:", grid_lr_bow.best_estimator_),
    ("Best lambda:", grid_lr_bow.best_params_),
    ("Bset Roc-AUC score:", grid_lr_bow.best_score_),
):
    print(caption, value)

In [0]:
# Refit the L1 model on the full training split with the C chosen by the grid
# search, then report accuracy on both splits.
# liblinear is named explicitly because the default solver of
# scikit-learn >= 0.22 does not support the L1 penalty.
lr_bow = LogisticRegression(penalty='l1', solver='liblinear', C=grid_lr_bow.best_params_['C'])
lr_bow.fit(x_train, y_train)
y_pred_bow = lr_bow.predict(x_test)
y_pred_train_bow = lr_bow.predict(x_train)
# Metric functions take (y_true, y_pred) in that order.
print("Accuracy on Test Data:",accuracy_score(y_test, y_pred_bow)*100)
print("Accuracy on Train Data:",accuracy_score(y_train, y_pred_train_bow)*100)

#### AUC Score on Test Data

In [0]:
# roc_curve expects (y_true, y_score). The true labels go first, and the score
# is the positive-class probability so the curve has more than one threshold.
fpr_test_bow, tpr_test_bow, thresholds = roc_curve(y_test, lr_bow.predict_proba(x_test)[:, 1])
auc_score_test_bow = auc(fpr_test_bow, tpr_test_bow)
print("AUC Score on Test data:",(auc_score_test_bow)*100)

In [0]:
# ROC curve points and AUC for the L1 model on both splits. The AUC is computed
# from the same positive-class probabilities as the curve, so the area quoted
# in the plot legend matches the curve that is drawn.
proba_test_bow = lr_bow.predict_proba(x_test)[:, 1]
roc_auc_test_bow = roc_auc_score(y_test, proba_test_bow)
fpr_test_bow, tpr_test_bow, thersholds_test = roc_curve(y_test, proba_test_bow)

proba_train_bow = lr_bow.predict_proba(x_train)[:, 1]
roc_auc_train_bow = roc_auc_score(y_train, proba_train_bow)
fpr_train, tpr_train, thersholds_train = roc_curve(y_train, proba_train_bow)


In [0]:
# ROC curves of the L1 model on the test and train splits, with the chance
# diagonal for reference.
# NOTE: the figure is created before plt.style.use('classic') is called, and
# style.use changes the global matplotlib style for every later cell as well.
plt.figure(figsize=(8,5))
plt.style.use('classic')
plt.plot(fpr_test_bow,tpr_test_bow,label='Model_LR with Test Data (area = %.2f).'% roc_auc_test_bow)
plt.plot(fpr_train,tpr_train,label='Model_LR with Train Data (area = %.2f).'% roc_auc_train_bow)
# Diagonal from (0, 0) to (1, 1).
plt.plot([0,1],[0,1])
# Small margins so the curve ends are not clipped at the axes.
plt.xlim([-0.05,1.05])
plt.ylim([-0.05,1.05])
plt.legend(loc=0)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("RoC -AUC curve Model-1")

plt.show()

#### confusion Matrix

In [0]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
# (rows: true label, columns: predicted label).
class_label = ["negative", "positive"]
cm = confusion_matrix(y_test, y_pred_bow)
df_cm = pd.DataFrame(data=cm, index=class_label, columns=class_label)

plt.figure(figsize=(5, 4))
sns.heatmap(df_cm, annot=True, fmt="d", cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

#### [11.1.1] Calculating sparsity on weight vector obtained using L1 regularization on BOW, SET 1

In [0]:
# Sparsity of the L1-regularised weight vector, i.e. how many coefficients are
# exactly zero. The original cell printed the NON-zero count under the
# "sparsity" label, so both numbers are reported here.
w = lr_bow.coef_
n_nonzero = np.count_nonzero(w)
n_zero = w.size - n_nonzero
print(" sparsity on weight vector obtained using L1 regularization on BOW ",n_zero)
print(" non-zero weights:", n_nonzero, "of", w.size)

### [1.1.2] Applying Logistic Regression with L2 regularization on BOW, SET 1

#### gridSearchCV

In [0]:
# Candidate C values handed to the grid search for the L2 model.
lam = [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]
param_grid = {'C': lam}
print(param_grid)

In [0]:
# Unfitted L2-penalised estimator; rebinds `lr_bow`, which previously held the
# fitted L1 model.
lr_bow = LogisticRegression(penalty='l2')

In [0]:
# 3-fold grid search over C for the L2-penalised model, selected by F1 score.
# NOTE(review): the L1 grid search earlier in this notebook selects on
# 'roc_auc' while this one selects on 'f1'; confirm the different selection
# metric is intended, since the two models are compared on the same ROC plot.
# This rebinds `grid_lr_bow`, replacing the L1 search results.
grid_lr_bow = GridSearchCV(lr_bow, param_grid=param_grid,cv=3,scoring='f1')
grid_lr_bow.fit(x_train,y_train)

In [0]:
# Report the winning estimator, its parameters and its mean cross-validated score.
for caption, value in (
    ("Best Estimators:", grid_lr_bow.best_estimator_),
    ("Best lambda:", grid_lr_bow.best_params_),
    ("Bset f1- score:", grid_lr_bow.best_score_),
):
    print(caption, value)

In [0]:
# Refit the L2 model on the full training split with the grid-search C, predict
# both splits and report accuracy as a percentage.
best_c_l2 = grid_lr_bow.best_params_['C']
lr_bow = LogisticRegression(penalty='l2', C=best_c_l2).fit(x_train, y_train)
y_pred_bow = lr_bow.predict(x_test)
y_pred_train_bow = lr_bow.predict(x_train)
print("Accuracy on Test Data:", 100 * accuracy_score(y_test, y_pred_bow))
print("Accuracy on Train Data:", 100 * accuracy_score(y_train, y_pred_train_bow))

#### AUC roc Curve-model comparison

In [0]:
# ROC curve points and AUC for the L2 model on the test split. The AUC is
# computed from the same positive-class probabilities as the curve, so the
# area quoted in the legend matches the curve that is drawn.
proba_test_bow_l2 = lr_bow.predict_proba(x_test)[:, 1]
roc_auc_test_bow_l2 = roc_auc_score(y_test, proba_test_bow_l2)
fpr_test_bow_l2, tpr_test_bow_l2, thersholds_test = roc_curve(y_test, proba_test_bow_l2)


In [0]:
# Test-split ROC curves of the L1 and L2 models on one figure, with the chance
# diagonal for reference.
# The L1 series (fpr_test_bow, tpr_test_bow, roc_auc_test_bow) comes from the
# earlier L1 cells; the *_l2 series comes from the cell just above.
# NOTE: the figure is created before plt.style.use('classic') is called.
plt.figure(figsize=(8,5))
plt.style.use('classic')
plt.plot(fpr_test_bow,tpr_test_bow,label='Model_LR with Test Data-L1 (area = %.2f).'% roc_auc_test_bow)
plt.plot(fpr_test_bow_l2,tpr_test_bow_l2,label='Model_LR with Test Data -L2(area = %.2f).'% roc_auc_test_bow_l2)
# Diagonal from (0, 0) to (1, 1).
plt.plot([0,1],[0,1])
plt.xlim([-0.05,1.05])
plt.ylim([-0.05,1.05])
plt.legend(loc=0)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("RoC -AUC curve Model-1")

plt.show()

#### Confusion Matrix

In [0]:
# Confusion matrix of the test predictions as an annotated heatmap.
# y_pred_bow was reassigned by the L2 refit cell, so this matrix describes the
# L2 model even though the code matches the earlier L1 confusion-matrix cell.
cm=confusion_matrix(y_test,y_pred_bow)

class_label = ["negative", "positive"]
# Rows are indexed by true label and columns by predicted label, matching the
# axis titles set below.
df_cm = pd.DataFrame(cm, index = class_label, columns = class_label)
plt.figure(figsize=(5,4))
# fmt="d" prints the cell counts as integers.
sns.heatmap(df_cm, annot = True,cbar=False,fmt = "d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

#### [1.1.2.1] Performing perturbation test (multicollinearity check) on BOW, SET 1

In [0]:
# Separate L2 model, fitted on the training split with the grid-search C.
best_c = grid_lr_bow.best_params_['C']
lr_bow_x = LogisticRegression(C=best_c, penalty='l2')
lr_bow_x.fit(x_train, y_train)

In [0]:
lr_bow_x.coef_

In [0]:
# Keep a reference to the fitted coefficient array and show its shape.
# NOTE(review): `a` refers to the model's own coef_ array, not a copy; take
# .copy() if the model is refitted and the old weights are still needed.
a = lr_bow_x.coef_
a.shape