**Load and Clean data**

In [2]:
import pandas as pd
data=pd.read_csv("/kaggle/input/amazon-kindle-book-review-for-sentiment-analysis/preprocessed_kindle_review .csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  12000 non-null  int64 
 1   rating      12000 non-null  int64 
 2   reviewText  12000 non-null  object
 3   summary     11998 non-null  object
dtypes: int64(2), object(2)
memory usage: 375.1+ KB


In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')

# Sample text column name 'review'
# Example: df = pd.DataFrame({'review': ["I love this Kindle!", "This book is terrible.", ...]})

# Download stopwords
nltk.download('stopwords')

# Text preprocessing function
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Assign Sentiment Labels:

In [5]:
# Assign sentiment labels
data['sentiment'] = data['rating'].apply(lambda x: 1 if x >= 3 else 0)

# Display the first few rows of the dataset with sentiment labels
print(data[['reviewText','rating', 'sentiment']].head())

                                          reviewText  rating  sentiment
0  This book was the very first bookmobile book I...       5          1
1  When I read the description for this book, I c...       1          0
2  I just had to edit this review. This book is a...       5          1
3  I don't normally buy 'mystery' novels because ...       5          1
4  This isn't the kind of book I normally read, a...       5          1


In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['reviewText'], data['sentiment'], test_size=0.2, random_state=42)

In [7]:
# Apply preprocessing
X_train_cleaned = X_train.apply(preprocess_text)
X_test_cleaned = X_test.apply(preprocess_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the text using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf_vectorizer.transform(X_test_cleaned)

# Train a Logistic Regression classifier
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

In [9]:
# Evaluate the model
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
report_tfidf = classification_report(y_test, y_pred_tfidf)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.8491666666666666
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.67      0.75       799
           1       0.85      0.94      0.89      1601

    accuracy                           0.85      2400
   macro avg       0.85      0.81      0.82      2400
weighted avg       0.85      0.85      0.84      2400



**Text Summarization**

In [10]:
X_test.info()

<class 'pandas.core.series.Series'>
Index: 2400 entries, 1935 to 4414
Series name: reviewText
Non-Null Count  Dtype 
--------------  ----- 
2400 non-null   object
dtypes: object(1)
memory usage: 37.5+ KB


In [13]:
!pip install  sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting chardet (from breadability>=0.1.20->sumy)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuil

Extractive Summary:

In [14]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [15]:
# For Strings
summarized_review = []
for review in X_test:
    parser = PlaintextParser.from_string(review,Tokenizer("english"))
    summarizer = LexRankSummarizer()
    #Summarize the document with 2 sentences
    summary = summarizer(parser.document, 2)
    combined_summary = ' '.join(str(sentence) for sentence in summary)
    summarized_review.append(combined_summary)

In [16]:
print(f'original: {X_test[0]}\n'
      f'summary: {summarized_review[0]}')

original: This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.
summary: This book was recommended to me and I actually liked it. If you are looking for suspense this is not your book, it is more like a killing time book where you can figure out what happens next.


Evaluate with the same model

In [17]:
summary_cleaned = pd.Series(summarized_review).apply(preprocess_text)
summary_tfidf = tfidf_vectorizer.transform(summary_cleaned)
y_pred_summary = model_tfidf.predict(summary_tfidf)

In [18]:
# Evaluate the model
accuracy_tfidf = accuracy_score(y_test, y_pred_summary)
report_tfidf = classification_report(y_test, y_pred_summary)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.79375
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.53      0.63       799
           1       0.80      0.92      0.86      1601

    accuracy                           0.79      2400
   macro avg       0.79      0.73      0.74      2400
weighted avg       0.79      0.79      0.78      2400



Abstractive Summary:

In [24]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda")
model = model.to("cuda")

In [26]:
# Tokenize and summarize the input text using T5
T5_summary = []
for input_text in X_test:
    inputs = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=2000, truncation=True).to('cuda')
    summary_ids = model.generate(inputs, max_length=100, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    T5_summary.append(summary)

In [27]:
print(f'original: {X_test[0]}\n'
      f'summary: {T5_summary[0]}')

original: This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.
summary: this is not your book, it is more like a killing time book where you can figure out what happens next.


In [28]:
summary_cleaned = pd.Series(T5_summary).apply(preprocess_text)
summary_tfidf = tfidf_vectorizer.transform(summary_cleaned)
y_pred_summary = model_tfidf.predict(summary_tfidf)

In [30]:
# Evaluate the model
accuracy_tfidf = accuracy_score(y_test, y_pred_summary)
report_tfidf = classification_report(y_test, y_pred_summary)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.7825
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.52      0.61       799
           1       0.79      0.91      0.85      1601

    accuracy                           0.78      2400
   macro avg       0.77      0.72      0.73      2400
weighted avg       0.78      0.78      0.77      2400

