In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the dataset

data = pd.read_csv('amazon_alexa.tsv', sep='\t')

In [3]:
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


## Data Preprocessing

In [4]:
# Keep only the review text and the star rating. The explicit .copy() makes
# `dataset` an independent frame: its 'Sentiment' column is reassigned in a
# later cell, and assigning into a plain column slice of `data` would raise
# pandas' SettingWithCopyWarning.
dataset = data[['verified_reviews', 'rating']].copy()
dataset.columns = ['Review', 'Sentiment']

dataset.head()

Unnamed: 0,Review,Sentiment
0,Love my Echo!,5
1,Loved it!,5
2,"Sometimes while playing a game, you can answer...",4
3,I have had a lot of fun with this thing. My 4 ...,5
4,Music,5


In [5]:
# Map each overall rating to a binary sentiment label (1 = positive, 0 = negative)
def compute_sentiments(labels, threshold=3.0):
    """Convert numeric ratings into binary sentiment labels.

    Parameters
    ----------
    labels : iterable of numbers
        Ratings to convert.
    threshold : float, default 3.0
        Ratings strictly greater than this value are labelled positive.

    Returns
    -------
    list of int
        One entry per rating: 1 if the rating is greater than ``threshold``,
        otherwise 0. A rating for which the comparison is false for any other
        reason (e.g. NaN) is labelled 0 as well, instead of silently reusing
        the label computed for the previous rating.
    """
    # A single conditional expression guarantees every rating gets its own
    # label; there is no code path that leaves the label unset or stale.
    return [1 if label > threshold else 0 for label in labels]

In [6]:
# Derive the labels from the untouched ratings in `data` rather than from
# dataset['Sentiment'] itself, so re-running this cell is idempotent
# (re-labelling already-binary 0/1 values would turn every label into 0).
dataset['Sentiment'] = compute_sentiments(data['rating'])

In [7]:
dataset.head()

Unnamed: 0,Review,Sentiment
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [8]:
dataset['Sentiment'].value_counts()

Sentiment
1    2741
0     409
Name: count, dtype: int64

In [9]:
# check for null values
dataset.isnull().sum()

Review       1
Sentiment    0
dtype: int64

In [10]:
dataset[dataset['Review'].isnull()]

Unnamed: 0,Review,Sentiment
473,,0


In [23]:
# The null check above shows one missing review. Replace it with an empty
# string before the unicode cast; otherwise astype('U') turns the NaN into the
# literal text 'nan', which would be tokenized as if it were a real word.
x = dataset['Review'].fillna('').values.astype('U')
y = dataset['Sentiment']

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

# punct
import string
punct = string.punctuation


from spacy.lang.en.stop_words import STOP_WORDS
stopwords = set(STOP_WORDS)  # own copy as a set: O(1) `in` checks for every token during tokenization

class CustomTokenizer():
    """Tokenizer that lemmatizes text with spaCy and drops stop words and punctuation.

    Relies on the module-level ``nlp`` pipeline, the ``stopwords`` collection
    and the ``punct`` string of punctuation characters defined before this class.
    """

    def __init__(self):
        pass

    def text_data_cleaning(self, sentence):
        """Return the cleaned, lower-cased lemmas of ``sentence``.

        Parameters
        ----------
        sentence : str
            Raw text to tokenize.

        Returns
        -------
        list of str
            Lower-cased lemmas in document order, excluding stop words, empty
            tokens and tokens made up solely of punctuation characters.
        """
        doc = nlp(sentence)  # run the spaCy pipeline on the text

        tokens = []
        for token in doc:
            if token.lemma_ != "-PRON-":
                temp = token.lemma_.lower().strip()
            else:
                # Lemma is the "-PRON-" placeholder (presumably emitted for
                # pronouns by older spaCy releases; confirm for the installed
                # version), so fall back to the lower-cased surface form.
                temp = token.lower_
            tokens.append(temp)

        cleaned_tokens = []
        for token in tokens:
            if token in stopwords:
                continue
            # Check character by character: `token in punct` would be a
            # substring test against the punctuation string, which lets
            # multi-character runs such as "..." or "!!" slip through.
            # all() over an empty token is True, so empty tokens are dropped too.
            if all(char in punct for char in token):
                continue
            cleaned_tokens.append(token)
        return cleaned_tokens

In [13]:
custom_tokenizer = CustomTokenizer()
custom_tokenizer.text_data_cleaning("Hello all, It's a beautiful day outside there!")

['hello', 'beautiful', 'day', 'outside']

In [14]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [19]:
# Tokenization is delegated to custom_tokenizer.text_data_cleaning.
# token_pattern=None states explicitly that the vectorizer's default regex
# pattern is not used when a custom tokenizer is supplied, instead of relying
# on the resulting warning being suppressed.
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer.text_data_cleaning,
    token_pattern=None,
)

## Train the model

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; stratifying on the sentiment
# label keeps the class ratio the same in both partitions, and the fixed seed
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=dataset['Sentiment'],
    random_state=0,
)

In [25]:
# 2520 samples in training dataset and 630 in test dataset
x_train.shape, x_test.shape

((2520,), (630,))

In [26]:
# Fix the seed so repeated runs of the notebook fit the same model
# (same seed value as the train/test split).
classifier = LinearSVC(random_state=0)

In [27]:
# it will first do vectorization and then it will do classification
pipeline = Pipeline([('tfidf',tfidf), ('clf',classifier)])

In [35]:
#fit the model
pipeline.fit(x_train, y_train)

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [30]:
y_pred = pipeline.predict(x_test)

In [31]:
# confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 37,  45],
       [ 10, 538]], dtype=int64)

In [32]:
# classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.45      0.57        82
           1       0.92      0.98      0.95       548

    accuracy                           0.91       630
   macro avg       0.86      0.72      0.76       630
weighted avg       0.91      0.91      0.90       630



In [33]:
round(accuracy_score(y_test, y_pred)*100,2)

91.27

In [34]:
#Example
prediction = pipeline.predict(["Alexa is bad"])

# predict() returns one label per input text. Index the single result instead
# of testing the truth value of the whole array, which only works while the
# input list has exactly one element.
if prediction[0] == 1:
    print("Result: This review is positive")
else:
    print("Result: This review is negative")

Result: This review is negative


In [36]:
import joblib
# Persist the fitted pipeline (vectorizer + classifier) to disk.
# NOTE(review): the vectorizer's tokenizer is the bound method
# custom_tokenizer.text_data_cleaning, so the saved file presumably refers to
# CustomTokenizer (and the globals it uses) by name; code that loads the model
# likely needs the same class defined/importable first. Confirm before deploying.
joblib.dump(pipeline,'sentiment_model.pkl')

['sentiment_model.pkl']