In [None]:
# This is a special command used in Jupyter notebooks to display Matplotlib plots directly within the notebook.
%matplotlib inline

# Importing required libraries
import sqlite3  # For working with SQL databases
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computing

# Natural Language Processing toolkit
import nltk
from nltk.stem.porter import PorterStemmer  # For stemming

# String module
import string

# Plotting libraries
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns  # For enhanced statistical data visualizations

# Machine learning libraries from scikit-learn
from sklearn.feature_extraction.text import TfidfTransformer  # For TF-IDF transformation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # For converting text to numerical data
from sklearn.metrics import confusion_matrix  # For creating confusion matrices
from sklearn import metrics  # For evaluating machine learning models
from sklearn.metrics import roc_curve, auc  # For plotting ROC curve and calculating AUC


In [None]:
import os

# Location of the Amazon Fine Food Reviews SQLite file. Set the
# AMAZON_FINE_FOOD_DB environment variable to run the notebook on another
# machine; the original local path is kept as the fallback.
DB_PATH = os.environ.get(
    'AMAZON_FINE_FOOD_DB',
    'C:/Users/nithy/OneDrive/Desktop/Amazon_fine_food/database.sqlite',
)

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as "no such table: Reviews".
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

con = sqlite3.connect(DB_PATH)

In [None]:
# Pull every review except the neutral ones (Score == 3) into a DataFrame,
# running the query over the open connection `con`.
non_neutral_reviews_sql = """
SELECT *
FROM Reviews
WHERE Score != 3
"""
filtered_data = pd.read_sql_query(non_neutral_reviews_sql, con)

def partition(x):
    """Label a star rating: 'negative' when it is below 3, otherwise 'positive'."""
    return 'negative' if x < 3 else 'positive'

# Swap the numeric star rating for its 'positive' / 'negative' label.
actualScore = filtered_data['Score']
filtered_data['Score'] = positivenegative = actualScore.map(partition)




In [None]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly or it is silently discarded.
print(filtered_data.shape)
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Deduplication check: list every non-neutral review written by one user,
# ordered by product, to show the same review text repeated across products.
# The result is deliberately not named `display`, which would shadow
# IPython's built-in display() helper for the rest of the session. The user
# id is passed as a bound parameter rather than a double-quoted SQL token
# (double quotes denote identifiers in SQL).
duplicate_reviews = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId = ?
ORDER BY ProductId
""", con, params=("AR5J8UI46CURR",))
duplicate_reviews



Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [None]:
# Order the reviews by ProductId (ascending) ahead of de-duplication.
sorted_data = filtered_data.sort_values(by='ProductId', ascending=True, axis=0)

In [None]:
# Rows that share user, profile name, timestamp and text count as one
# review; only the first row of each such group is kept.
duplicate_key = ["UserId", "ProfileName", "Time", "Text"]
final = sorted_data.drop_duplicates(subset=duplicate_key, keep='first')
final.shape

(364173, 10)

In [None]:
# Percentage of the filtered reviews that survive de-duplication.
final['Id'].size / filtered_data['Id'].size * 100

69.25890143662969

In [None]:
# Sanity check on helpfulness votes: these two reviews have a
# HelpfulnessNumerator larger than their HelpfulnessDenominator.
# AND binds tighter than OR in SQL, so the earlier condition
# `Score != 3 AND Id = 44737 OR Id = 64422` applied the score filter to the
# first id only; IN (...) applies it to both ids.
# The result is not named `display`, to avoid shadowing IPython's display().
invalid_helpfulness = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id IN (44737, 64422)
ORDER BY ProductId
""", con)
invalid_helpfulness

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [None]:
# Keep only reviews whose helpful-vote count does not exceed the total vote
# count. The explicit copy makes `final` an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator].copy()

In [None]:
# Frame dimensions after cleaning, followed by the class balance.
print(final.shape)
final.Score.value_counts()

(364171, 10)


Score
positive    307061
negative     57110
Name: count, dtype: int64

# Text Preprocessing

In [None]:
import re

# Print the position and text of the first review that contains an HTML
# tag, as evidence that the review text needs tag stripping.
html_tag = re.compile('<.*?>')
for i, sent in enumerate(final['Text'].values):
    if html_tag.search(sent):
        print(i)
        print(sent)
        break

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Snowball stemmer and English stop-word set shared by the cleaning steps.
sno = nltk.stem.SnowballStemmer('english')
stop = set(stopwords.words('english'))

def cleanhtml(sentence):
    """Replace every HTML tag in `sentence` with a single space.

    The pattern must include the closing '>': with '<.*?' alone the lazy
    quantifier matches nothing, so only the '<' character is replaced and
    the tag name and attributes stay in the text.
    """
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence):
    """Delete ? ! ' " # | from `sentence`, then replace . , ) ( / with spaces."""
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Show the stop-word set, a divider line, and a sample stem, one per line.
print(stop, '*****************************', sno.stem('tasty'), sep='\n')


{'my', 'each', 'have', 'up', "isn't", 'because', 'such', "shouldn't", 'why', 'or', 'his', 'him', 'do', 'by', 'against', 'themselves', 'o', 'theirs', 'above', 'has', 'hers', 'who', 'its', 'did', 'over', 'be', 'there', 'aren', 'm', "haven't", "wasn't", 'both', 'through', 'wasn', 'am', 'for', 'own', "mightn't", 'during', 'your', 'y', 'when', 'that', 'few', 'ourselves', 'yourself', 'haven', "that'll", 'mightn', 'further', 'myself', 'their', "you'd", 'wouldn', 'as', 'was', 'me', "aren't", 'himself', 'the', 'under', 'll', "hasn't", "don't", 'yourselves', 'herself', 'should', 'itself', 'whom', 'ma', 'is', 'in', 'it', "it's", 'you', 'isn', 'to', 'mustn', 'at', 'ain', 'where', 'off', 'needn', 'ours', 'any', 'after', 'most', 'what', 'only', 'had', 'these', 'which', 'will', "you're", 'yours', 'her', 'then', 'other', 'they', 'down', 'd', 'nor', 'couldn', "shan't", 'but', 'again', "couldn't", 'on', 'a', "she's", 'now', 'once', 'until', "doesn't", 'hadn', 'more', "mustn't", "hadn't", 'hasn', 'are', 

In [None]:
import numpy as np
import pandas as pd

# Word pools per sentiment class, filled while the reviews are processed.
all_positive_words = []
all_negative_words = []

# Set used for stop-word membership tests during tokenisation.
stop_set = set(stop)

# Strip HTML, then tokenise: keep purely alphabetic words longer than two
# characters whose lower-cased form is not a stop word.
final['cleaned_text'] = final['Text'].apply(cleanhtml)
final['cleaned_words'] = final['cleaned_text'].apply(
    lambda x: [
        word
        for word in cleanpunc(x).split()
        if len(word) > 2 and word.isalpha() and word.lower() not in stop_set
    ]
)

# Memo of stemmer results so each distinct word is stemmed only once.
stem_cache = {}


def get_stemmed_word(word):
    """Return the UTF-8 encoded Snowball stem of `word`, memoised in `stem_cache`."""
    if word not in stem_cache:
        stem_cache[word] = sno.stem(word).encode('utf-8')
    return stem_cache[word]


def process_row(row):
    """Stem one review and record its words under the review's sentiment.

    `row` is any mapping with a 'cleaned_words' entry (list of str) and a
    'Score' entry. The stems are appended, in order, to
    `all_positive_words` or `all_negative_words` according to 'Score', and
    returned joined by single spaces as a bytes object.
    """
    stemmed = [get_stemmed_word(word.lower()) for word in row['cleaned_words']]
    # The score is constant for the whole review, so branch once per row
    # instead of once per word.
    if row['Score'] == 'positive':
        all_positive_words.extend(stemmed)
    elif row['Score'] == 'negative':
        all_negative_words.extend(stemmed)
    return b" ".join(stemmed)


# Iterate over the two needed columns directly. DataFrame.apply(axis=1)
# builds a full Series for every row, which is much slower and is not a
# vectorised operation.
final['final_string'] = [
    process_row({'cleaned_words': words, 'Score': score})
    for words, score in zip(final['cleaned_words'], final['Score'])
]

# Plain-list copy of the processed reviews.
final_string = final['final_string'].tolist()


In [None]:
# Store the processed review strings in a CleanedText column as well
# (final_string was taken from the final_string column).
final['CleanedText'] = final_string

In [None]:
# List each column's dtype, including the columns added during preprocessing.
print(final.dtypes)

Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                     object
Time                       int64
Summary                   object
Text                      object
cleaned_text              object
cleaned_words             object
final_string              object
CleanedText               object
dtype: object


# Bag of Words (BoW)

In [None]:
# Bag of Words: fit a scikit-learn CountVectorizer on the review text so
# that every review row gets a corresponding count vector.
# NOTE(review): this fits on the raw Text column, not on CleanedText, so the
# preprocessing above is not used here - confirm this is intended.
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
# Check which container type fit_transform returned.
type(final_counts)

scipy.sparse._csr.csr_matrix

In [None]:
# (number of reviews, vocabulary size)
final_counts.shape

(364171, 115281)

In [None]:
#This imports the NLTK library, which is a powerful tool for natural language processing tasks.
import nltk
nltk.download('stopwords')
# import regular expression
import re
import string
#This imports the stopwords corpus from NLTK, which was downloaded earlier. It contains common stopwords in English.
from nltk.corpus import stopwords
#This imports the Porter Stemmer algorithm from NLTK, which is used for stemming words (reducing them to their root or base form).
from nltk.stem import PorterStemmer
#NLTK provides WordNetLemmatizer class which is a thin wrapper around the wordnet corpus. This class uses morphy() function to the WordNet CorpusReader class to find a lemma.
from nltk.stem import WordNetLemmatizer

# Snowball stemmer for English.
sno = nltk.stem.SnowballStemmer('english')

# English stop words as a set, for filtering out very common words during
# text preprocessing.
stop = set(stopwords.words('english'))

def cleanhtml(sentence):
    """Replace every HTML tag in `sentence` with a single space.

    The pattern must include the closing '>': with '<.*?' alone the lazy
    quantifier matches nothing, so only the '<' character is replaced and
    the tag name and attributes stay in the text.
    """
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
# Punctuation cleanup helper. Inside a regex character class `|` is a
# literal character rather than "or", so the pipe itself is one of the
# characters matched by both patterns.
def cleanupnc(sentence):
    """Delete ? ! ' " # | from `sentence`, then replace . , ) ( / with spaces."""
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Show the stop-word set, a divider line, and a sample stem, one per line.
print(stop, '***********************************', sno.stem('tasty'), sep='\n')

{'my', 'each', 'have', 'up', "isn't", 'because', 'such', "shouldn't", 'why', 'or', 'his', 'him', 'do', 'by', 'against', 'themselves', 'o', 'theirs', 'above', 'has', 'hers', 'who', 'its', 'did', 'over', 'be', 'there', 'aren', 'm', "haven't", "wasn't", 'both', 'through', 'wasn', 'am', 'for', 'own', "mightn't", 'during', 'your', 'y', 'when', 'that', 'few', 'ourselves', 'yourself', 'haven', "that'll", 'mightn', 'further', 'myself', 'their', "you'd", 'wouldn', 'as', 'was', 'me', "aren't", 'himself', 'the', 'under', 'll', "hasn't", "don't", 'yourselves', 'herself', 'should', 'itself', 'whom', 'ma', 'is', 'in', 'it', "it's", 'you', 'isn', 'to', 'mustn', 'at', 'ain', 'where', 'off', 'needn', 'ours', 'any', 'after', 'most', 'what', 'only', 'had', 'these', 'which', 'will', "you're", 'yours', 'her', 'then', 'other', 'they', 'down', 'd', 'nor', 'couldn', "shan't", 'but', 'again', "couldn't", 'on', 'a', "she's", 'now', 'once', 'until', "doesn't", 'hadn', 'more', "mustn't", "hadn't", 'hasn', 'are', 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nithy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Bi-Grams and n-Grams

In [None]:
# Frequency tables of the stemmed words collected per sentiment class, and
# the 20 most frequent entries of each.
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
for label, dist in (
    ("Most Common Positive Words :", freq_dist_positive),
    ("Most Common negative Words :", freq_dist_negative),
):
    print(label, dist.most_common(20))

Most Common Positive Words : [(b'like', 139150), (b'tast', 128631), (b'good', 112216), (b'flavor', 109473), (b'love', 107034), (b'use', 103627), (b'great', 102818), (b'product', 99504), (b'one', 95360), (b'tri', 86237), (b'tea', 83824), (b'coffe', 78610), (b'make', 74835), (b'get', 71962), (b'food', 64752), (b'amazon', 57832), (b'would', 55297), (b'time', 55225), (b'buy', 53903), (b'realli', 52569)]
Most Common negative Words : [(b'tast', 34489), (b'like', 32284), (b'product', 29504), (b'one', 20420), (b'flavor', 19561), (b'would', 17901), (b'tri', 17676), (b'use', 15275), (b'good', 14977), (b'coffe', 14677), (b'get', 13758), (b'buy', 13690), (b'order', 12846), (b'food', 12742), (b'dont', 11683), (b'tea', 11657), (b'amazon', 11258), (b'even', 10983), (b'box', 10841), (b'make', 9816)]


In [None]:
# Count unigrams and bigrams (ngram_range=(1, 2)) over the review text.
# This rebinds `count_vect`, replacing the vectorizer created in the
# Bag of Words section.
count_vect = CountVectorizer(ngram_range=(1,2) )
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
# (number of reviews, number of unigram + bigram features)
final_bigram_counts.shape

(364171, 2910192)

# TF-IDF

In [None]:
# TF-IDF weights over unigrams and bigrams of the review text.
# NOTE(review): fitted on the raw Text column, not CleanedText - confirm intended.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [None]:
# (number of reviews, number of TF-IDF features)
final_tf_idf.shape

(364171, 2910192)

In [None]:
# Feature names learned by the TF-IDF vectorizer, and how many there are.
features = tf_idf_vect.get_feature_names_out()
len(features)

2910192

In [None]:
# Peek at ten consecutive feature names.
features[100000 : 100010]

array(['ales until', 'ales ve', 'ales would', 'ales you', 'alessandra',
       'alessandra ambrosia', 'alessi', 'alessi added', 'alessi also',
       'alessi and'], dtype=object)

In [None]:
# Dense TF-IDF vector of the review at row position 3.
dense_row = final_tf_idf[3, :].toarray()
print(dense_row[0])

[0. 0. 0. ... 0. 0. 0.]


In [None]:
def top_tfidf_feats(row,features,top_n=25):
    '''Return the `top_n` largest TF-IDF values of one row with their feature names.

    Parameters
    ----------
    row : 1-D array of TF-IDF values.
    features : sequence of feature names, indexed the same way as `row`.
    top_n : maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame with columns 'feature' and 'tfidf', ordered from the
    largest value to the smallest.
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works when nothing is
    # selected; assigning df.columns afterwards raises on an empty frame.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

# Top 25 TF-IDF features of the review at row position 1.
top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [None]:
# Display the feature / tfidf table built in the previous cell.
top_tfidf

Unnamed: 0,feature,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074
