### Amazon Reviews - Sentiment Analysis

#### 1. Sentimental Analysis using VADER ( Valence Aware dictionary and Sentiment Reasoner)

In [1]:
import numpy as np
import pandas as pd
# import the reviews dataset
df = pd.read_csv(r'C:\Users\sanya\Desktop\sanya\sentiment analysis\amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [2]:
# to check the labels
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [3]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [4]:
# remove nan values and empty strings 
df.dropna(inplace = True)
blanks = []
for i, lb, rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace = True)

In [5]:
# check the labels again, they are just like before.This means there were no nan values or empty strings
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [6]:
# import sentiment Inensity Analyzer to run on the tsv file
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [7]:
# this is the polarity score of the first entry in the dataset, we need to this for all entries in the dataset
sid.polarity_scores(df.loc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [8]:
# this is the label of the first entry in the dataset, we need to this for all entries in the dataset 
#and then compare the predicted sentiment to check the accuracy
df.loc[0]['label']

'pos'

In [9]:
# calculating the polarity score of all the entries in the dataset
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [10]:
# calculating the compounded score to ultimately label the statement with the sentiment the model thinks applies on it
df['compound'] = df['scores'].apply(lambda score_dict:score_dict['compound'])
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [11]:
# model assigns the sentiment attached to the statement
df['comp_score'] = df['compound'].apply(lambda c:'pos' if c>=0 else 'neg')
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [12]:
# In the table printed above, we need to compare the label column with the comp_score
#to check the accuracy of the sentiments predicted
from sklearn.metrics import accuracy_score
accuracy_score(df['label'], df['comp_score'])

0.7091

#### 2. Sentimental Analysis using sklearn

In [13]:
# load the reviews dataset
ds = pd.read_csv(r'C:\Users\sanya\Desktop\sanya\sentiment analysis\amazonreviews.tsv', sep='\t')
ds.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [14]:
ds.isnull().sum() #to check for blank statements and nan values

label     0
review    0
dtype: int64

In [15]:
# splitting data into training and testing
from sklearn.model_selection import train_test_split
x=ds['review']
y=ds['label']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.20, random_state = 5)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()), ('clf',MultinomialNB()),])
text_clf_lsvc = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC()),])

In [17]:
text_clf_nb.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [18]:
predictions = text_clf_nb.predict(x_test)

In [19]:
import sklearn.metrics as metrics
print(metrics.accuracy_score(y_test,predictions))

0.8275


In [20]:
text_clf_lsvc.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [21]:
predictions = text_clf_lsvc.predict(x_test)

In [22]:
print(metrics.accuracy_score(y_test, predictions))

0.879


In [23]:
from sklearn.feature_extraction import text
text_clf_lsvc2 = Pipeline([('tfidf',TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)),('clf',LinearSVC()),])
text_clf_lsvc2.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                                                       'anything', 'anyway',
                                                       'anywhere', ...}),
                                 strip_accents=None, sublinear_tf=False,
                                

In [24]:
print(metrics.accuracy_score(y_test,predictions))

0.879
