In [1]:
# Inline plotting for Colab
%matplotlib inline

# Core libraries
import re
import string
import sqlite3
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
from math import floor, ceil

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stop = stopwords.words("english")
english_stemmer = SnowballStemmer('english')

# Scikit-learn
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_curve,
    auc
)
from sklearn import metrics
from sklearn.svm import LinearSVC

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

!pip install transformers --quiet

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Example text for summarization
example_text = """
Your long text goes here for summarization testing.
Transformers summarization extracts key insights and returns a summarized version.
This model produces more human-like summaries and supports longer inputs.
"""

# Summarize
summary_output = summarizer(example_text, max_length=100, min_length=30, do_sample=False)

print("Summary:")
print(summary_output[0]['summary_text'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu
Your max_length is set to 100, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


Summary:
Transformers summarization extracts key insights and returns a summarized version. This model produces more human-like summaries and supports longer inputs. Your long text goes here for summarization testing.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

file_path = '/content/drive/My Drive/Merged.json'

review_data = pd.read_json(file_path,lines=True)
review_data


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A10000012B7CGYKOMPQ4L,000100039X,Adam,"[0, 0]",Spiritually and mentally inspiring! A book tha...,5,Wonderful!,1355616000,"12 16, 2012"
1,A2S166WSCFIFP5,000100039X,"adead_poet@hotmail.com ""adead_poet@hotmail.com""","[0, 2]",This is one my must have books. It is a master...,5,close to god,1071100800,"12 11, 2003"
2,A1BM81XB4QHOA3,000100039X,"Ahoro Blethends ""Seriously""","[0, 0]",This book provides a reflection that you can a...,5,Must Read for Life Afficianados,1390003200,"01 18, 2014"
3,A1MOSTXNIO5MPJ,000100039X,Alan Krug,"[0, 0]",I first read THE PROPHET in college back in th...,5,Timeless for every good and bad time in your l...,1317081600,"09 27, 2011"
4,A2XQ5LZHTD4AFT,000100039X,Alaturka,"[7, 9]",A timeless classic. It is a very demanding an...,5,A Modern Rumi,1033948800,"10 7, 2002"
...,...,...,...,...,...,...,...,...,...
320452,A1YMNTFLNDYQ1F,B00LORXVUE,eyeused2loveher,"[0, 0]",Works great just like my original one. I reall...,5,This works just perfect!,1405900800,"07 21, 2014"
320453,A15TX8B2L8B20S,B00LORXVUE,Jon Davidson,"[0, 0]",Great product. Great packaging. High quality a...,5,Great replacement cable. Apple certified,1405900800,"07 21, 2014"
320454,A3JI7QRZO1QG8X,B00LORXVUE,Joyce M. Davidson,"[0, 0]","This is a great cable, just as good as the mor...",5,Real quality,1405900800,"07 21, 2014"
320455,A1NHB2VC68YQNM,B00LORXVUE,Nurse Farrugia,"[0, 0]",I really like it becasue it works well with my...,5,I really like it becasue it works well with my...,1405814400,"07 20, 2014"


In [4]:
review_data.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [5]:
review_data.columns = review_data.columns.str.strip()
review_data=review_data.drop(['helpful', 'reviewTime', 'reviewerName','unixReviewTime'],axis=1)

In [13]:
review_data

Unnamed: 0,reviewerID,asin,reviewText,overall,summary
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,5,Wonderful!
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,5,close to god
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,5,Must Read for Life Afficianados
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,5,Timeless for every good and bad time in your l...
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,5,A Modern Rumi
...,...,...,...,...,...
320452,A1YMNTFLNDYQ1F,B00LORXVUE,Works great just like my original one. I reall...,5,This works just perfect!
320453,A15TX8B2L8B20S,B00LORXVUE,Great product. Great packaging. High quality a...,5,Great replacement cable. Apple certified
320454,A3JI7QRZO1QG8X,B00LORXVUE,"This is a great cable, just as good as the mor...",5,Real quality
320455,A1NHB2VC68YQNM,B00LORXVUE,I really like it becasue it works well with my...,5,I really like it becasue it works well with my...


In [6]:
def data_clean( rev, remove_stopwords=True):


    new_text = re.sub("[^a-zA-Z]"," ", rev)

    words = new_text.lower().split()

    if remove_stopwords:
        sts = set(stopwords.words("english"))
        words = [w for w in words if not w in sts]
    ary=[]
    eng_stemmer = english_stemmer
    for word in words:
        ary.append(eng_stemmer.stem(word))


    return(ary)

In [7]:
clean_reviewData = []
for rev in review_data['reviewText']:
    clean_reviewData.append( " ".join(data_clean(rev)))

clean_summaryData = []
for rev in review_data['summary']:
    clean_summaryData.append( " ".join(data_clean(rev)))

In [8]:
Most_used_Words_Review =pd.Series(' '.join(clean_reviewData).lower().split()).value_counts()[:20]
print (Most_used_Words_Review)

book      268914
phone     198383
one       186944
case      176492
like      150008
read      147058
use       141398
work      107734
great     107185
time      106146
get       103291
good      102910
would     100705
charg      96986
well       93445
love       90503
stori      88633
look       74482
make       73548
realli     72789
Name: count, dtype: int64


In [9]:
Most_used_Words_Summary = pd.Series(' '.join(clean_summaryData).lower().split()).value_counts()[:20]
print (Most_used_Words_Summary)

great      42388
good       26516
case       23337
work       15960
book       15928
love       15211
read       12004
nice        9759
best        8900
phone       8180
product     8162
perfect     7211
excel       6803
one         6769
price       6159
charger     6090
fit         6059
well        5992
stori       5601
like        5546
Name: count, dtype: int64


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
text_vectorizer = TfidfVectorizer(min_df=4, max_features = 1000)
test_vecor = text_vectorizer.fit_transform(clean_reviewData)
tfidf_vector = dict(zip(text_vectorizer.get_feature_names_out(), text_vectorizer.idf_))

In [13]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
sample_review = review_data.reviewText[:10]
for test in sample_review:
    test
    ss = analyser.polarity_scores(test)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]))
    print(test)

compound: 0.7256, 
neg: 0.0, 
neu: 0.757, 
pos: 0.243, 
Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!
compound: 0.8349, 
neg: 0.0, 
neu: 0.835, 
pos: 0.165, 
This is one my must have books. It is a masterpiece of spirituality. I'll be the first to admit, its literary quality isn't much. It is rather simplistically written, but the message behind it is so powerful that you have to read it. It will take you to enlightenment.
compound: 0.4404, 
neg: 0.0, 
neu: 0.927, 
pos: 0.073, 
This book provides a reflection that you can apply to your own life.And, a way for you to try and assess whether you are truly doing the right thing and making the most of your short time on this plane.
compound: 0.9201, 
neg: 0.076, 
neu: 0.763, 
pos: 0.161, 
I first read THE PROPHET in college back in the 60's. The book had a revival as did anything metaphysical in the turbulent 60's. It had a profound effect on me and became a

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [15]:
from sklearn.cluster import MiniBatchKMeans

clusters = 20
kmeans_model = MiniBatchKMeans(n_clusters=clusters, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmodel = kmeans_model.fit(test_vecor)
kmodel_clusters = kmodel.predict(test_vecor)
kmodel_distances = kmodel.transform(test_vecor)
centroids = kmodel.cluster_centers_.argsort()[:, ::-1]
values = text_vectorizer.get_feature_names_out()
for i in range(clusters):
    print("Cluster %d:" % i)
    for j in centroids[i, :5]:
        print(' %s' % values[j])
    print()

Cluster 0:
 fit
 perfect
 case
 phone
 iphon

Cluster 1:
 batteri
 charg
 phone
 use
 charger

Cluster 2:
 book
 read
 one
 like
 seri

Cluster 3:
 product
 great
 good
 price
 recommend

Cluster 4:
 instal
 easi
 screen
 protector
 bubbl

Cluster 5:
 case
 phone
 protect
 like
 iphon

Cluster 6:
 stylus
 pen
 tip
 use
 work

Cluster 7:
 work
 great
 use
 phone
 car

Cluster 8:
 quot
 book
 read
 stori
 one

Cluster 9:
 man
 book
 life
 stori
 read

Cluster 10:
 stori
 book
 charact
 read
 novel

Cluster 11:
 read
 book
 stori
 enjoy
 seri

Cluster 12:
 phone
 protect
 case
 use
 cover

Cluster 13:
 one
 use
 good
 like
 get

Cluster 14:
 screen
 protector
 bubbl
 phone
 appli

Cluster 15:
 sound
 headset
 ear
 bluetooth
 use

Cluster 16:
 charg
 charger
 usb
 devic
 cabl

Cluster 17:
 love
 book
 case
 great
 one

Cluster 18:
 color
 case
 love
 phone
 like

Cluster 19:
 look
 case
 like
 phone
 nice



In [16]:
test_reviewText = review_data.reviewText
test_Ratings = review_data.overall
text_vectorizer = TfidfVectorizer(max_df=.8)
text_vectorizer.fit(test_reviewText)
def rate(r):
    ary2 = []
    for rating in r:
        tv = [0,0,0,0,0]
        tv[rating-1] = 1
        ary2.append(tv)
    return np.array(ary2)

In [17]:
test_reviewText =test_reviewText[:2000]
test_reviewText

Unnamed: 0,reviewText
0,Spiritually and mentally inspiring! A book tha...
1,This is one my must have books. It is a master...
2,This book provides a reflection that you can a...
3,I first read THE PROPHET in college back in th...
4,A timeless classic. It is a very demanding an...
...,...
1995,Great story. Brings you right back the old da...
1996,after reading the book I bought the dvd to se...
1997,Nice enjoyable reading. A very nostalgic look...
1998,Another one of those page turners! Wanted to ...


In [18]:
test_Ratings = test_Ratings[:2000]
test_Ratings

Unnamed: 0,overall
0,5
1,5
2,5
3,5
4,5
...,...
1995,5
1996,5
1997,3
1998,5


In [19]:
X = text_vectorizer.transform(test_reviewText).toarray()
y = rate(test_Ratings.values)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

model = Sequential()
model.add(Dense(128,input_dim=X_train.shape[1]))
model.add(Dense(5,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
model.fit(X_train,y_train,validation_data=(X_test, y_test),epochs=10,batch_size=32,verbose=1)
model.evaluate(X_test,y_test)[1]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 543ms/step - accuracy: 0.6113 - loss: 1.2951 - val_accuracy: 0.6450 - val_loss: 0.9854
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 496ms/step - accuracy: 0.6492 - loss: 0.9505 - val_accuracy: 0.6450 - val_loss: 0.9079
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 501ms/step - accuracy: 0.6742 - loss: 0.7840 - val_accuracy: 0.6500 - val_loss: 0.8658
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 495ms/step - accuracy: 0.7055 - loss: 0.6981 - val_accuracy: 0.6700 - val_loss: 0.8309
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 490ms/step - accuracy: 0.7707 - loss: 0.5810 - val_accuracy: 0.6725 - val_loss: 0.8183
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 464ms/step - accuracy: 0.8622 - loss: 0.4663 - val_accuracy: 0.6800 - val_loss: 0.8128
Epoch 7/10
[1m50/50[

0.6575000286102295

This is the evaluated score for CNN Model on this data that comes upto 65.75%

Rights: Vindhya V