# Amazon fine food review analysis

In [None]:
%matplotlib inline

import sqlite3 
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer


In [None]:
# Read the review data from the local SQLite database file.
# NOTE(review): this is an absolute Windows path specific to one machine;
# adjust it before running elsewhere.

con = sqlite3.connect('D:\\Data Science\\Dataset\\amazon-fine-food-reviews\\database.sqlite')


# Load every row of the Reviews table except those with Score == 3, so only
# reviews that can be labelled positive or negative remain.

filtered_data  = pd.read_sql_query("""
select * 
from Reviews
where Score !=3
""", con)

# Last expression of the cell: the notebook renders the DataFrame.
filtered_data

In [None]:
# give the score > 3 positive and score<3 negative

def partition(x):
    """Map a numeric review score to a sentiment label.

    Returns 'Positive' when ``x`` is greater than 3 and 'Negative' for
    every other value.
    """
    return 'Positive' if x > 3 else 'Negative'

# Relabel the Score column in place: scores above 3 become 'Positive' and
# scores below 3 become 'Negative' (rows with Score == 3 were already excluded
# by the SQL query that built filtered_data).
# The original comment here was split across lines and its tail ("rsa") was
# left behind as a bare name, which raised NameError when the cell ran.
filtered_data['Score'] = filtered_data['Score'].map(partition)

In [None]:
# Report the DataFrame's shape and preview its first rows.
print('the shape of the data is : ',filtered_data.shape)
filtered_data.head()

In [None]:
# Data cleaning: deduplication of the data.
# Pull all non-3-star reviews written by one user, ordered by product, as an
# example to inspect when looking for duplicated entries.
# The user id is passed as a bound parameter: in SQLite a double-quoted token
# ("...") is an identifier first and only falls back to a string literal, so
# the original `UserId ="AR5J8UI46CURR"` relied on a compatibility quirk.

display = pd.read_sql_query("""
select * from Reviews
where Score != 3 and UserId = ?
order by ProductId
""", con, params=("AR5J8UI46CURR",))

display

In [None]:
# Sort the reviews by ProductId (ascending, along the rows).

shoted_data = filtered_data.sort_values(by="ProductId")

# Deduplication: among rows that share the same user, profile name, timestamp
# and review text, keep only the first one and return a new DataFrame.
final = shoted_data.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'])
final.shape

In [None]:
# Percentage of the originally loaded rows that survived deduplication.

float(final['Id'].size) / float(filtered_data['Id'].size) * 100

# Observation:
    It was also seen that in the two rows shown below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator,
    which is not practically possible; hence these two rows are also removed from the calculation.

In [None]:
# Fetch the two reviews (Id 44737 and 64422) discussed in the observation.
# `IN (...)` is used because AND binds tighter than OR in SQL: the original
# `score != 3 AND Id = 44737 OR Id = 64422` was evaluated as
# `(score != 3 AND Id = 44737) OR Id = 64422`, so the Score filter was not
# applied to the second Id.
display = pd.read_sql_query("""
select *
from Reviews
where Score != 3 and Id in (44737, 64422)
order by ProductId
""", con)

display

In [None]:
# Keep only rows whose HelpfulnessNumerator does not exceed their
# HelpfulnessDenominator.
# .copy() gives `final` its own data, so adding columns to it later is not an
# assignment on a filtered slice of another frame (which makes pandas emit
# SettingWithCopyWarning).
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator].copy()

print(final.shape)

In [None]:
# How many Positive and Negative reviews are present in our dataset?
# value_counts() tallies the rows per distinct value of the Score column.

final['Score'].value_counts()

# Bag of Word (BoW)

In [None]:
# Bag of Words: fit a CountVectorizer with default settings on the raw review
# text and transform the same text into a document-term count matrix.
count_vect = CountVectorizer() # in scikit learn
final_count = count_vect.fit_transform(final['Text'].values)


In [None]:
# Inspect the type of the object returned by fit_transform.
type(final_count)

In [None]:
# Dimensions of the count matrix (one row per review passed to fit_transform).
final_count.shape

# Text processing:
    Stemming, stop word removal and lemmatization.

In [None]:
import re # python regular expression
# Walk through the reviews in order and print the position and text of the
# first one containing something shaped like an HTML tag, then stop.
i = 0
for sent in final['Text'].values:
    if re.search('<.*?>', sent) is not None:
        print(i)
        print(sent)
        break
    i += 1

In [None]:
import string
import nltk
# Only the stopword corpus is read below, so request it by name. Calling
# nltk.download() with no arguments opens NLTK's interactive downloader and
# stalls a non-interactive run.
nltk.download('stopwords')


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english'))  # set of English stopwords (set => fast membership tests)
sno = nltk.stem.SnowballStemmer('english') # initialising the Snowball stemmer for English

def cleanhtml(sentence):
    """Return ``sentence`` with every HTML-like tag replaced by one space.

    A tag is the shortest run of characters that starts with '<' and ends
    with '>' without crossing a line break.
    """
    return re.sub('<.*?>', " ", sentence)

def cleanpunc(sentence):
    """Strip a fixed set of characters from ``sentence``.

    The characters removed are: ? ! ' " # . , ( ) | / and the space
    character. Everything else (including backslashes, colons, semicolons
    and hyphens) is left untouched.
    """
    # One pass over a single character class covering every character that
    # is stripped; inside [...] each character is matched literally.
    return re.sub(r'[?!\'"#.,()|/ ]', '', sentence)
# Quick sanity check: show the stopword set, a separator line, and the stem
# the Snowball stemmer produces for a sample word.
print(stop)
print("**********************************************************************")
print(sno.stem('tasty'))

In [None]:
# Apply the pre-processing steps to every review, one by one:
#   1. replace HTML tags with spaces (cleanhtml)
#   2. split on whitespace and strip punctuation from each token (cleanpunc)
#   3. keep only purely alphabetic tokens longer than 2 characters
#   4. lower-case, drop stopwords, stem with the Snowball stemmer
# This code takes a while to run as it has to process every review in `final`.
#
# Outputs:
#   final_string       - one cleaned review per row of `final`, in row order;
#                        each is the stemmed tokens joined by b" " (UTF-8 bytes)
#   all_positive_words - stemmed tokens (bytes) from reviews labelled 'Positive'
#   all_negative_words - stemmed tokens (bytes) from reviews labelled 'Negative'

final_string = []
all_positive_words = [] #store words from positive reviews here
all_negative_words = [] #store words from negative reviews here

# Pair every review with its label once, instead of indexing
# final['Score'].values by a hand-maintained counter for every single word.
for sent, score in zip(final['Text'].values, final['Score'].values):
    filtered_sentence = []
    sent = cleanhtml(sent) #remove html tags

    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            # The original tested `cleaned_word.isalpha()`, a name that is never
            # defined (NameError); the loop variable is `cleaned_words`.
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue

            lowered = cleaned_words.lower()
            if lowered in stop:
                continue

            # Tokens are stored as UTF-8 bytes, matching the b" ".join below.
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)

            if score == 'Positive':
                all_positive_words.append(s)  # list of all the words which is positive
            elif score == 'Negative':
                all_negative_words.append(s)  # list of all the words which is negative

    final_string.append(b" ".join(filtered_sentence)) #final string of cleaned words
    



In [None]:
# final_string holds one cleaned entry per review, appended in the same order
# as the rows of `final`, so it can be attached directly as a new column.
final['CleanedText'] = final_string # adding a column of clean text

In [None]:
final.head(3) # preview three rows; the processed text is in the CleanedText column

# Store final table into an Sqlite table for future.
# Persist the cleaned table to a local SQLite file so it can be reloaded
# later; an existing 'Reviews' table in that file is replaced.
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
# The `flavor` argument was removed from DataFrame.to_sql, and passing it
# (even as None) raises TypeError on current pandas, so it is dropped here.
final.to_sql('Reviews', conn, schema=None, if_exists='replace')

# Bi-Grams and n-Grams.

# Motivation:
    Now that we have our lists of words describing positive and negative reviews, let's analyse them.
    We begin the analysis by getting the frequency distribution of the words, as shown below.
    

In [None]:
# Count how often each stemmed word occurs in the positive and in the
# negative reviews respectively.
frequency_dist_positive = nltk.FreqDist(all_positive_words)
frequency_dist_negative = nltk.FreqDist(all_negative_words)

# Show the 20 most frequent words of each class with their counts.
print("most common positive words : ",frequency_dist_positive.most_common(20))
print("most common negative words : ",frequency_dist_negative.most_common(20))

In [None]:
# Bi-grams, tri-grams and n-grams

# Removing stop words like "not" should be avoided before building the
# n-grams; the vectorizer is therefore fitted on the raw Text column.
count_vect = CountVectorizer(ngram_range = (1,2)) # ngram_range=(1, 2): count unigrams and bigrams
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
# Dimensions of the unigram + bigram count matrix.
final_bigram_counts.get_shape() 

# TF-IDF

In [None]:
# TF-IDF over unigrams and bigrams, fitted on the raw review text.
tf_idf_vect = TfidfVectorizer(ngram_range = (1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [None]:
# Dimensions of the TF-IDF matrix.
final_tf_idf.shape

In [None]:
# Names of the TF-IDF features, in the column order of the matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); try the current API first and fall back to the old
# one so the cell runs on either version.
try:
    features = tf_idf_vect.get_feature_names_out()
except AttributeError:
    features = tf_idf_vect.get_feature_names()
len(features)


In [None]:
# Peek at ten consecutive feature names from the middle of the vocabulary.
# NOTE(review): assumes the vocabulary has more than 100000 entries; the slice
# is simply empty otherwise.
features[100000:100010]

In [None]:
# Convert one row of the sparse matrix to a dense 1-D numpy array:
# toarray() yields a 2-D array with a single row, and [0] selects that row.
print(final_tf_idf[3,:].toarray()[0])

In [None]:
def top_tfidf_feats(row,features, top_n =25):
    """Return the ``top_n`` highest-valued entries of one TF-IDF row.

    Parameters
    ----------
    row : 1-D array-like of numbers
        TF-IDF values, one per feature.
    features : sequence
        Feature names, indexable by the same positions as ``row``.
    top_n : int, default 25
        Maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'features'`` and ``'tfidf'``, ordered by descending
        TF-IDF value. Empty (but with both columns present) when nothing is
        selected, e.g. ``top_n=0`` or an empty ``row``.
    """
    # argsort is ascending, so reverse it and take the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises ValueError when top_feats is empty, because the frame then has
    # zero columns to rename.
    return pd.DataFrame(top_feats, columns=['features', 'tfidf'])

# Example: the 25 highest TF-IDF features of the review in row 1, passed as a
# dense 1-D array.
top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features, 25)

# Last expression of the cell: the notebook renders the resulting DataFrame.
top_tfidf

In [None]:
W