# Analysis of Sentiment

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np

#### Reading the files of consumer_reviews and validation in csv

In [61]:
s=pd.read_csv("/home/shwetha/Downloads/consumer_reviews.csv")
k=pd.read_csv("/home/shwetha/Downloads/validation.csv")

### #Since reviews.tile does not affect the ratings in validation file we drop reviews.title

In [62]:
ks=k.drop(["reviews.title"],axis=1)
ks

Unnamed: 0,reviews.rating,reviews.text
0,5.0,I realize that many people will believe that t...
1,5.0,Great tablet from Amazon! I works quickly and ...
2,5.0,This is my very first kindle. I am enjoying re...
3,5.0,Small and light weight. Even in a leather case...
4,5.0,Easy to use for books and audio. Will use on v...
5,5.0,"Works as it should, this was a replacement of ..."
6,2.0,"Hard to use, Lots of ads, and Randomly closes ..."
7,3.0,"I'm mostly happy, but like so many electronics..."
8,3.0,After the initial trial questions found our Al...
9,5.0,I brought 3 of them. 2 were gifts. We love the...


#### Same goes for the consumer_review where sub,primary categories and review does not affect the target hence we drop it.

In [None]:
ns=s.drop(["sub-categories","primary-categories","reviews"],axis=1)

# Pre-processing

##### Importing the Preprocessing pipeline for processing documents with Gensim. It easily manages text data to format data frames, run classification, etc. 

In [5]:
import gensim
from gensim.parsing.preprocessing import strip_numeric 
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_punctuation2
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_tags
#from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import preprocess_string

In [6]:
filters=[strip_numeric,strip_multiple_whitespaces,strip_non_alphanum,strip_punctuation,strip_punctuation,strip_punctuation2,strip_tags]

# Data Cleaning

In [7]:
def clean(string):
  words=preprocess_string(string,filters)
  return " ".join(word.lower() for word in words)

In [10]:
cl=[]
for i in ns["title"]:
  cl.append(clean(i))
ns["clean"]=cl
ck=[]
for i in ks["reviews.text"]:
    ck.append(clean(i))
ks["clean"]=ck


#### We convert the review.rating to binary nos. so as to show negative (0) or positive (1)

In [11]:
t=[]
for x in ks["reviews.rating"]:
    if x==5.0:
        t.append(0)
    else:
        t.append(1)
ks["reviews.rating"]=t
ytest=ks["reviews.rating"]

In [12]:
gs=ns.drop("title",axis=1)

### Converting ratings in to binary 

In [13]:
y=[]
for x in ns["rating"]:
    if x>3:
        y.append(1)
    else:
      y.append(0)
ns["rating"]=y
ns

Unnamed: 0,rating,title,clean
0,0,... 3 of them and one of the item is bad quali...,of them and one of the item is bad quality is ...
1,1,... always the less expensive way to go for pr...,always the less expensive way to go for produc...
2,1,... are not Duracell but for the price i am ha...,are not duracell but for the price i am happy
3,1,... as well as name brand batteries at a much ...,as well as name brand batteries at a much better
4,1,... batteries are very long lasting the price ...,batteries are very long lasting the price is g...
5,1,... batteries for Christmas and the AmazonBasi...,batteries for christmas and the amazonbasics c...
6,1,... batteries have ordered them in the past be...,batteries have ordered them in the past been v...
7,1,... batteries that last quite a while then the...,batteries that last quite a while then these a...
8,0,... do not hold the amount of high power juice...,do not hold the amount of high power juice lik...
9,1,... done well by me appear to have a good shel...,done well by me appear to have a good shelf life


In [14]:
gs['rating'].value_counts()

5    19897
4     5648
3     1206
1      965
2      616
Name: rating, dtype: int64

In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


#### NLTK consists of the most common algorithms such as tokenizing, part-of-speech tagging, stemming, sentiment analysis, topic segmentation, and named entity recognition

In [16]:
import nltk

##### Ipmorting libraries like portstemmer and regexptokenizer for nltk

In [17]:
nltk.download("punkt")
nltk.download("stopwords")
from nltk.stem import PorterStemmer 
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to /home/shwetha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shwetha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
st=PorterStemmer()
tk=RegexpTokenizer(r'[a-zA-z\']+')

### Word Tokenization
##### It breaks text paragraph into words.

#### Stemming is a process of linguistic normalization, which reduces words to their word root word or chops off the derivational affixes.

In [19]:
def tokenize(te):
  return[stemmer.stem(word) for word in tk.tokenize(te.lower())]

#### Importing sklearn.feature_extraction which converts a collection of raw documents to a matrix of TF-IDF features.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

#### Removing stopwords such as is, am, are, this, a, an, the, etc.

In [23]:
v=TfidfVectorizer(stop_words='english',analyzer="word")
x=v.fit(gs["clean"])
x="tb"
pickle.dump(v,open("tb","wb"))
x=v.transform(gs["clean"])
xtest=v.transform(ks["clean"])

# Testing and Training 

In [25]:
xtest.shape

(20000, 2769)

In [26]:
y=gs["rating"]
y.shape

(28332,)

In [27]:
xtrain=x
ytrain=y

In [28]:
ytrain.shape
ytest.shape

(20000,)

### Fitting the model using Random forest classifier since this classifier gives the best accuracy

In [29]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
clf.fit(xtrain, ytrain)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Checking the accurcy score of the model

In [64]:
from sklearn.metrics import roc_auc_score

In [65]:
y=clf.predict_proba(xtest)[:,1]
y.shape

(20000,)

In [66]:
roc_auc_score(ytest,y)

0.561904864390129

#### The following code where data cleaning and tokenization is done can be used to check the model

In [67]:
s=np.array(["it is not good product"])
def clean(string):
  words=preprocess_string(string,filters)
  return " ".join(word.lower() for word in words)
clean(s)
def tokenize(te):
  return[stemmer.stem(word) for word in tk.tokenize(te.lower())]
g=v.transform(s)
r=clf.predict(g)
if r==1:
    print("positive")
else:
    print("negative")

negative


### The randomforest classifier is saved and dumped in a file which can be accessed for API

In [68]:
rf="file"

In [69]:
pickle.dump(clf,open("rf","wb"))