In [59]:
# importing libraries
import warnings 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

import re
import json 
import nltk
import spacy
import string
import unicodedata
from bs4 import BeautifulSoup
from textblob import TextBlob 
from nltk.stem import WordNetLemmatizer

from IPython import display 
display.set_matplotlib_formats('svg')
warnings.filterwarnings('ignore')

In [143]:
#pip install nltk

In [144]:
# load the data 
data = pd.read_csv(r'D:\project\major project verzeo\amazon_alexa.csv')
data.head()

Unnamed: 0,reviews,sentiment
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


Here, we are going to use only 2 columns Independent(reviewText) and Dependent(rating). Let's ignore all other columns.  

In [145]:
# selection columns! 
data = data[['reviews', 'sentiment']]
data

Unnamed: 0,reviews,sentiment
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1
...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1
3146,"Listening to music, searching locations, check...",1
3147,"I do love these things, i have them running my...",1
3148,Only complaint I have is that the sound qualit...,1


In [146]:
# check the shape
data['sentiment'].value_counts()

1    2893
0     257
Name: sentiment, dtype: int64

In [147]:
# check whehter it's having null values or not
data.isnull().sum()

reviews      0
sentiment    0
dtype: int64

### Visualize words using WordCloud

In [148]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image  

# get the text based on ratings! 
rating_zero = data['reviews'][data['sentiment'] == 0]
rating_one = data['reviews'][data['sentiment'] == 1]


# word tokenize the data 
def list_tokenizer(rating): 
    ratings = " ".join(rating)
    ratings = nltk.word_tokenize(ratings)
    return str(ratings)

# visualizer 
def word_cloud(rating, number): 
    wc = WordCloud(background_color = 'black', max_font_size = 50, max_words = 100)  
    wc.generate(rating)  
    plt.figure(figsize=(10,8))
    plt.imshow(wc, interpolation = 'bilinear')  
    plt.title(f'WordCloud for {number}')
    plt.axis('off');

In [149]:
#import nltk
#nltk.download('punkt')

Now, we have suitable columns for model building, but before that we need to pre-process the text. Let's do that! 

### Pre-processing!
If you want a quick recap on pre-processing techniques you can check the notebook [**here**](https://github.com/RAravindDS/Learny/blob/main/NLP/TextCleaning/Cleaning_text_data.ipynb)

In [150]:
# 1. lowering case 
data['reviews'] = data['reviews'].str.lower()  # lowering the case! 
data.head()

Unnamed: 0,reviews,sentiment
0,love my echo!,1
1,loved it!,1
2,"sometimes while playing a game, you can answer...",1
3,i have had a lot of fun with this thing. my 4 ...,1
4,music,1


In [151]:
# 2. remove punctuation! 
data['reviews'] = data['reviews'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))  # it removes the punctuation 
data.head()

Unnamed: 0,reviews,sentiment
0,love my echo,1
1,loved it,1
2,sometimes while playing a game you can answer ...,1
3,i have had a lot of fun with this thing my 4 y...,1
4,music,1


In [152]:
# 3. remove stopwords! (you can check the notebook, if you have any doubt)

from spacy.lang.en.stop_words import STOP_WORDS
data['reviews'] = data['reviews'].apply(lambda x: " ".join([i for i in x.split() if i not in STOP_WORDS]) )

data.head()

Unnamed: 0,reviews,sentiment
0,love echo,1
1,loved,1
2,playing game answer question correctly alexa s...,1
3,lot fun thing 4 yr old learns dinosaurs contro...,1
4,music,1


In [153]:
# 4. remove url and tags! 
data['reviews'] = data['reviews'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
                                              
data.head()

Unnamed: 0,reviews,sentiment
0,love echo,1
1,loved,1
2,playing game answer question correctly alexa s...,1
3,lot fun thing 4 yr old learns dinosaurs contro...,1
4,music,1


In [154]:
# 5. remove html tags! 
data['reviews'] = data['reviews'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

data.head()

Unnamed: 0,reviews,sentiment
0,love echo,1
1,loved,1
2,playing game answer question correctly alexa s...,1
3,lot fun thing 4 yr old learns dinosaurs contro...,1
4,music,1


In [155]:
# 7. remove emails 
data['reviews'] = data['reviews'].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', '', x))  # it will remove te emails 

In [156]:
# 8. remove extra spaces
data['reviews'] = data['reviews'].apply(lambda x: " ".join(x.split()))
data.head()

Unnamed: 0,reviews,sentiment
0,love echo,1
1,loved,1
2,playing game answer question correctly alexa s...,1
3,lot fun thing 4 yr old learns dinosaurs contro...,1
4,music,1


In [157]:
#import nltk
#nltk.download('wordnet')

In [158]:
#import nltk
#nltk.download('omw-1.4')

In [159]:
# 9. lemmatizer 

%time
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

data["reviews"] = data["reviews"].apply(lambda text: lemmatize_words(text))
data.head()

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,reviews,sentiment
0,love echo,1
1,loved,1
2,playing game answer question correctly alexa s...,1
3,lot fun thing 4 yr old learns dinosaur control...,1
4,music,1


### Text to Words

Now, we have pre-processed the file, let's start the model buidling. 

But before model building, we need to convert the text to numbers. So, let's do this by two methods `BOW` & `TF-IDF`~


#### 1. Bag of Words 
If you have any doubt check our [**notebook**](https://github.com/RAravindDS/Learny/blob/main/NLP/BagofWords.ipynb)

But before this, let's split the data!

In [160]:
# data split 
from sklearn.model_selection import train_test_split 
xtrain, xtest, ytrain, ytest = train_test_split(data['reviews'], data['sentiment'], test_size = 0.3)

In [161]:
train = pd.DataFrame(xtrain,columns=['reviews'])
trainx = pd.DataFrame(ytrain,columns=['sentiment'])
train['sentiment'] = trainx
train

Unnamed: 0,reviews,sentiment
2263,worked advertised,1
653,wanted white dot white bathroom black shown kn...,0
993,extremely useful simple thing like spotify aud...,1
620,brought replace moved doesnt range previous no...,0
965,wanting bought prime day absolutely love,1
...,...,...
1782,far fun use learn deal alexa learning finding ...,1
752,great sound easy set,1
1464,love echo awesome speaker love fact listen mus...,1
2019,,0


In [163]:
test = pd.DataFrame(xtest,columns=['reviews'])
testy = pd.DataFrame(ytest,columns=['sentiment'])
test['sentiment'] = testy
test

Unnamed: 0,reviews,sentiment
995,getting use echo shopping list listening music...,1
598,cant figure use,0
783,echo work able sync amazon music device,1
2225,current demand stick high power coming usb por...,1
1071,love way designed choice face use bed drop ech...,1
...,...,...
1078,fault thought wireless price feel like sound q...,1
831,love,1
867,whats like speaker ask alexa,1
320,sound quality good wish alexa answer question,1


In [166]:
# Save train and test DataFrames as .csv files
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)


Unnamed: 0,reviews,sentiment
0,worked advertised,1
1,wanted white dot white bathroom black shown kn...,0
2,extremely useful simple thing like spotify aud...,1
3,brought replace moved doesnt range previous no...,0
4,wanting bought prime day absolutely love,1
...,...,...
2200,far fun use learn deal alexa learning finding ...,1
2201,great sound easy set,1
2202,love echo awesome speaker love fact listen mus...,1
2203,,0


In [116]:
# Convert text to numbers using (BOW)! 
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# let's convert 
xtrain_bow = vectorizer.fit_transform(xtrain).toarray()
xtest_bow = vectorizer.transform(xtest).toarray()


#### 2. TF-IDF 

If you have any doubt you can check our [**notebook**](https://github.com/RAravindDS/Learny/blob/main/NLP/TFIDF/TFIDF.ipynb)

In [117]:
# Convert text to numbers using (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer  

tf_vectorizer = TfidfVectorizer()

# let's convert 
xtrain_tf = tf_vectorizer.fit_transform(xtrain).toarray()
xtest_tf = tf_vectorizer.transform(xtest).toarray()

### Model Building 

Here, we are going to use the `Gaussina NB` model. 

First we will see the results for `BOW` 

In [119]:
from sklearn.naive_bayes import GaussianNB

# USING BOW 
clf_bow = GaussianNB().fit(xtrain_bow, ytrain)  # fitting 
prediction_bow = clf_bow.predict(xtest_bow)  # predictions

# USINGN TFIDF 
clf_tf = GaussianNB().fit(xtrain_tf, ytrain)
prediction_tf = clf_tf.predict(xtest_tf)

# ***Building KNN  model***

In [120]:
from sklearn.neighbors import KNeighborsClassifier

In [121]:
knn = KNeighborsClassifier(n_neighbors=2000)
knn.fit(xtrain_bow,ytrain)
pred_train = knn.predict(xtrain_bow)
pred_test = knn.predict(xtest_bow)

# ***Building Logistic regression model***

In [122]:
from sklearn.linear_model import LogisticRegression


In [123]:
# let's check our model! 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

def metrics(prediction, actual): 
    print('Confusion_matrix \n', confusion_matrix(actual, prediction))
    print('\nAccuracy:', accuracy_score(actual, prediction))
    print('\nclassification_report\n')
    print(classification_report(actual, prediction))
    
    
metrics(prediction_bow, ytest)

Confusion_matrix 
 [[ 46  29]
 [262 608]]

Accuracy: 0.692063492063492

classification_report

              precision    recall  f1-score   support

           0       0.15      0.61      0.24        75
           1       0.95      0.70      0.81       870

    accuracy                           0.69       945
   macro avg       0.55      0.66      0.52       945
weighted avg       0.89      0.69      0.76       945



In [98]:
metrics(prediction_tf, ytest)

Confusion_matrix 
 [[ 36  37]
 [280 592]]

Accuracy: 0.6645502645502646

classification_report

              precision    recall  f1-score   support

           0       0.11      0.49      0.19        73
           1       0.94      0.68      0.79       872

    accuracy                           0.66       945
   macro avg       0.53      0.59      0.49       945
weighted avg       0.88      0.66      0.74       945

