Try to improve extractive summarization:

1. split each review into sentences
2. get polarity scores for each sentence
3. only keep the sentences with strong sentiment (not neutral)
4. combine the sentences together

# TF-IDF model for sentiment analysis

Baseline: test accuracy of the TF-IDF model on the original (unsummarized) test reviews.

In [8]:
import pandas as pd
# Location of the Kindle review CSV; the space before ".csv" is kept exactly as in the original path string
KINDLE_REVIEWS_CSV = "/kaggle/input/amazon-kindle-book-review-for-sentiment-analysis/preprocessed_kindle_review .csv"
data = pd.read_csv(KINDLE_REVIEWS_CSV)

In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')

# Sample text column name 'review'
# Example: df = pd.DataFrame({'review': ["I love this Kindle!", "This book is terrible.", ...]})

# Fetch the NLTK stop-word corpus; preprocess_text reads it through stopwords.words('english')
nltk.download('stopwords')

# Text preprocessing function
# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Normalize one review for the TF-IDF vectorizer.

    Steps, in order: strip HTML-like tags, drop every character that is not an
    ASCII letter or whitespace, lowercase, split on whitespace, and remove
    English stop words.

    Args:
        text: the raw review string. ``None`` and float NaN (missing values)
            are treated as an empty review.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    # Missing values would otherwise make re.sub raise a TypeError
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # The stop-word list is looked up once per call from the cache instead of
    # being re-read from the corpus for every single word.
    stop_words = _stopword_set('english')
    # Lowercase, tokenize on whitespace, and drop stop words
    return ' '.join(word for word in text.lower().split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Binary target derived from the star rating: 3 and above -> 1, anything lower -> 0
data['sentiment'] = data['rating'].apply(lambda stars: int(stars >= 3))

# Preview the review text next to its rating and the derived label
print(data[['reviewText','rating', 'sentiment']].head())

                                          reviewText  rating  sentiment
0  This book was the very first bookmobile book I...       5          1
1  When I read the description for this book, I c...       1          0
2  I just had to edit this review. This book is a...       5          1
3  I don't normally buy 'mystery' novels because ...       5          1
4  This isn't the kind of book I normally read, a...       5          1


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['reviewText'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Clean both splits with the same preprocessing function
X_train_cleaned, X_test_cleaned = (
    split.apply(preprocess_text) for split in (X_train, X_test)
)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the text using TF-IDF.
# The vocabulary/IDF weights are fitted on the cleaned training reviews only;
# the test reviews are transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf_vectorizer.transform(X_test_cleaned)

# Train a Logistic Regression classifier (library defaults) on the TF-IDF features.
# The same fitted model is reused by every later evaluation cell.
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)

In [16]:
# Score the held-out reviews with the trained classifier
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Overall accuracy plus the per-class precision / recall / F1 table
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)
report_tfidf = classification_report(y_true=y_test, y_pred=y_pred_tfidf)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.8491666666666666
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.67      0.75       799
           1       0.85      0.94      0.89      1601

    accuracy                           0.85      2400
   macro avg       0.85      0.81      0.82      2400
weighted avg       0.85      0.85      0.84      2400



# Sumy

Accuracy when each test review is first summarized with Sumy (LexRank, 2 sentences) and then classified by the same TF-IDF model.

In [18]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting chardet (from breadability>=0.1.20->sumy)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuild

In [19]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [24]:
# function to summarize the reviews
def sumy_review(reviews, sentences_count=2, language="english"):
    """Summarize each review with LexRank and return the summaries as strings.

    Args:
        reviews: iterable of review texts (strings).
        sentences_count: number of sentences the summarizer is asked to keep
            per review. Defaults to 2, the value that used to be hard-coded.
        language: language name handed to sumy's ``Tokenizer``.

    Returns:
        A list with one string per review, in input order; each string is the
        selected sentences joined by single spaces.
    """
    # Neither object depends on the individual review, so build them once
    # instead of once per loop iteration.
    tokenizer = Tokenizer(language)
    summarizer = LexRankSummarizer()

    summarized_review = []
    for review in reviews:
        parser = PlaintextParser.from_string(review, tokenizer)
        summary = summarizer(parser.document, sentences_count)
        summarized_review.append(' '.join(str(sentence) for sentence in summary))
    return summarized_review

In [25]:
# Summarize the raw (uncleaned) test reviews, then push the summaries through
# the same cleaning step, the already-fitted TF-IDF vectorizer (transform only,
# no refit) and the already-trained classifier.
summarized_review = sumy_review(X_test)
summary_cleaned = pd.Series(summarized_review).apply(preprocess_text)
summary_tfidf = tfidf_vectorizer.transform(summary_cleaned)
y_pred_summary = model_tfidf.predict(summary_tfidf)

In [26]:
# Compare predictions made from the summaries against the true test labels.
# Note: this rebinds accuracy_tfidf / report_tfidf from the baseline cell.
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_summary)
report_tfidf = classification_report(y_true=y_test, y_pred=y_pred_summary)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.79375
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.53      0.63       799
           1       0.80      0.92      0.86      1601

    accuracy                           0.79      2400
   macro avg       0.79      0.73      0.74      2400
weighted avg       0.79      0.79      0.78      2400



# Improving the accuracy of Sumy

In [27]:
import nltk
# Fetch the Punkt sentence-tokenizer data
nltk.download('punkt')

# sent_tokenize splits a text into a list of sentence strings
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['Hello there!', 'How are you doing today?', "I hope you're doing well.", "It's a sunny day, isn't it?"]


In [70]:
# One list of sentence strings per raw test review, in X_test order
split_sentence = [sent_tokenize(review) for review in X_test]

In [31]:
import nltk
# Fetch the VADER lexicon
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# Initialize the VADER sentiment analyzer.
# `sia` is a notebook-level object; the sentence-filter functions below call
# sia.polarity_scores(...) and read the 'compound' entry of the result.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [92]:
# Function to filter non-neutral sentences and ensure at least two sentences are kept
def filter_non_neutral_sentences(sentences, min_sentences=2, threshold=0.1, analyzer=None):
    """Keep the sentences with a clearly positive or negative compound score.

    A sentence counts as non-neutral when its compound score is below
    ``-threshold`` or above ``threshold``. If fewer than ``min_sentences``
    sentences qualify, the earliest neutral sentences are added until the
    minimum is reached or the review runs out of sentences.

    Unlike the previous version, the result preserves the original sentence
    order: top-up sentences stay at their own position instead of being
    appended after the polar ones.

    Args:
        sentences: list of sentence strings belonging to one review.
        min_sentences: minimum number of sentences to return when the review
            has that many (default 2).
        threshold: absolute compound-score cut-off for "neutral" (default 0.1).
        analyzer: object with a ``polarity_scores(text)`` method returning a
            mapping with a ``'compound'`` entry. Defaults to the notebook-level
            ``sia`` analyzer.

    Returns:
        list of the kept sentence strings, in their original order.
    """
    if analyzer is None:
        analyzer = sia  # notebook-level VADER analyzer
    sentences = list(sentences)

    # Analyze polarity for each sentence
    scores = [analyzer.polarity_scores(sentence)['compound'] for sentence in sentences]

    # Mark the non-neutral sentences based on the compound score
    keep = [score < -threshold or score > threshold for score in scores]

    # Top up with the earliest neutral sentences until the minimum is met
    shortfall = min_sentences - sum(keep)
    if shortfall > 0:
        for position, kept in enumerate(keep):
            if shortfall == 0:
                break
            if not kept:
                keep[position] = True
                shortfall -= 1

    return [sentence for sentence, kept in zip(sentences, keep) if kept]

In [93]:
# Run the sentence filter over every tokenized review, keeping review order
filtered_lists = [filter_non_neutral_sentences(review) for review in split_sentence]

In [94]:
# Spot check: the tokenized sentences of one test review before filtering
split_sentence[3]

['La Malinche is also known as Dona Marina, the native woman who aided Hernan Cortes in his conquest of Mexico.',
 "I knew quite a bit about that history, and when I know something of the subject, I'm afraid that the author will mess it up.",
 "I didn't have to worry.",
 'Mr. Hepple knows his subject very well.Treasure is more than an historical novel.',
 'Mr. Hepple blends present day Mexico with the past, as well as using a "Talk Radio" show to present other information from outside sources in a very entertaining way.I spent half my time sitting on the edge of my seat, biting my fingernails or holding my breath.',
 "The other half was spent in fascination with the history and politics of Mexico or laughing at Linc's problems with women (Vol.",
 '2).As I was getting toward the end, I thought I would take a break and read something else before reading Vol.',
 '2.',
 'No such luck.',
 'It was not to be.',
 'Vol.',
 '1 ended with breath-taking cliffhanger and I immediately started readin

In [95]:
# Spot check: the same review after the non-neutral sentence filter
filtered_lists[3]

["I knew quite a bit about that history, and when I know something of the subject, I'm afraid that the author will mess it up.",
 "I didn't have to worry.",
 'Mr. Hepple knows his subject very well.Treasure is more than an historical novel.',
 'Mr. Hepple blends present day Mexico with the past, as well as using a "Talk Radio" show to present other information from outside sources in a very entertaining way.I spent half my time sitting on the edge of my seat, biting my fingernails or holding my breath.',
 "The other half was spent in fascination with the history and politics of Mexico or laughing at Linc's problems with women (Vol.",
 'No such luck.',
 '2I highly recommend this book.']

In [96]:
# Glue each review's kept sentences back into one space-separated string
combined_texts = list(map(' '.join, filtered_lists))

Now apply the Sumy summarizer again, this time to the sentiment-filtered reviews.

In [97]:
# Summarize the sentiment-filtered reviews, then clean, vectorize (transform
# only, with the vectorizer fitted on the training data) and classify them.
new_summarized_review = sumy_review(combined_texts)
new_summary_cleaned = pd.Series(new_summarized_review).apply(preprocess_text)
new_summary_tfidf = tfidf_vectorizer.transform(new_summary_cleaned)
new_y_pred_summary = model_tfidf.predict(new_summary_tfidf)

In [98]:
# Compare predictions made from the filtered summaries against the true test labels
new_accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=new_y_pred_summary)
new_report_tfidf = classification_report(y_true=y_test, y_pred=new_y_pred_summary)

print("TF-IDF Model Accuracy:", new_accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", new_report_tfidf)

TF-IDF Model Accuracy: 0.7958333333333333
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.53      0.63       799
           1       0.80      0.93      0.86      1601

    accuracy                           0.80      2400
   macro avg       0.79      0.73      0.75      2400
weighted avg       0.79      0.80      0.78      2400



Change the Sumy parameter: keep 3 sentences per summary instead of 2.

In [99]:
# function to summarize the reviews
def sumy_review_3(reviews, sentences_count=3, language="english"):
    """Summarize each review with LexRank, keeping three sentences by default.

    Args:
        reviews: iterable of review texts (strings).
        sentences_count: number of sentences the summarizer is asked to keep
            per review. Defaults to 3, the value that used to be hard-coded.
        language: language name handed to sumy's ``Tokenizer``.

    Returns:
        A list with one string per review, in input order; each string is the
        selected sentences joined by single spaces.
    """
    # Neither object depends on the individual review, so build them once
    # instead of once per loop iteration.
    tokenizer = Tokenizer(language)
    summarizer = LexRankSummarizer()

    summarized_review = []
    for review in reviews:
        parser = PlaintextParser.from_string(review, tokenizer)
        summary = summarizer(parser.document, sentences_count)
        summarized_review.append(' '.join(str(sentence) for sentence in summary))
    return summarized_review

Result for Sumy applied to the original reviews (3-sentence summaries):

In [100]:
# Three-sentence summaries of the raw test reviews, pushed through the same
# cleaning step, fitted vectorizer and trained classifier as before
summarized_review = sumy_review_3(X_test)
summary_cleaned = pd.Series(summarized_review).apply(preprocess_text)
summary_tfidf = tfidf_vectorizer.transform(summary_cleaned)
y_pred_summary = model_tfidf.predict(summary_tfidf)

# Score the summary-based predictions against the true test labels
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_summary)
report_tfidf = classification_report(y_true=y_test, y_pred=y_pred_summary)

print("TF-IDF Model Accuracy:", accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", report_tfidf)

TF-IDF Model Accuracy: 0.815
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.58      0.68       799
           1       0.82      0.93      0.87      1601

    accuracy                           0.81      2400
   macro avg       0.81      0.76      0.77      2400
weighted avg       0.81      0.81      0.81      2400



Result for Sumy applied to the non-neutral (sentiment-filtered) reviews (3-sentence summaries):

In [103]:
# Function to filter non-neutral sentences and ensure at least three sentences are kept
def filter_non_neutral_sentences_3(sentences, min_sentences=3, threshold=0.1, analyzer=None):
    """Keep the sentences with a clearly positive or negative compound score.

    A sentence counts as non-neutral when its compound score is below
    ``-threshold`` or above ``threshold``. If fewer than ``min_sentences``
    sentences qualify, the earliest neutral sentences are added until the
    minimum is reached or the review runs out of sentences.

    Unlike the previous version, the result preserves the original sentence
    order: top-up sentences stay at their own position instead of being
    appended after the polar ones.

    Args:
        sentences: list of sentence strings belonging to one review.
        min_sentences: minimum number of sentences to return when the review
            has that many (default 3).
        threshold: absolute compound-score cut-off for "neutral" (default 0.1).
        analyzer: object with a ``polarity_scores(text)`` method returning a
            mapping with a ``'compound'`` entry. Defaults to the notebook-level
            ``sia`` analyzer.

    Returns:
        list of the kept sentence strings, in their original order.
    """
    if analyzer is None:
        analyzer = sia  # notebook-level VADER analyzer
    sentences = list(sentences)

    # Analyze polarity for each sentence
    scores = [analyzer.polarity_scores(sentence)['compound'] for sentence in sentences]

    # Mark the non-neutral sentences based on the compound score
    keep = [score < -threshold or score > threshold for score in scores]

    # Top up with the earliest neutral sentences until the minimum is met
    shortfall = min_sentences - sum(keep)
    if shortfall > 0:
        for position, kept in enumerate(keep):
            if shortfall == 0:
                break
            if not kept:
                keep[position] = True
                shortfall -= 1

    return [sentence for sentence, kept in zip(sentences, keep) if kept]

In [104]:
# Run the three-sentence variant of the filter over every tokenized review
filtered_lists = [filter_non_neutral_sentences_3(review) for review in split_sentence]

# Glue each review's kept sentences back into one space-separated string
combined_texts = list(map(' '.join, filtered_lists))

In [105]:
# Three-sentence summaries of the sentiment-filtered reviews, then clean,
# vectorize (transform only, with the vectorizer fitted on the training data)
# and classify them.
new_summarized_review = sumy_review_3(combined_texts)
new_summary_cleaned = pd.Series(new_summarized_review).apply(preprocess_text)
new_summary_tfidf = tfidf_vectorizer.transform(new_summary_cleaned)
new_y_pred_summary = model_tfidf.predict(new_summary_tfidf)

In [106]:
# Compare predictions made from the filtered 3-sentence summaries against the true test labels
new_accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=new_y_pred_summary)
new_report_tfidf = classification_report(y_true=y_test, y_pred=new_y_pred_summary)

print("TF-IDF Model Accuracy:", new_accuracy_tfidf)
print("TF-IDF Model Classification Report:\n", new_report_tfidf)

TF-IDF Model Accuracy: 0.8116666666666666
TF-IDF Model Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.58      0.67       799
           1       0.82      0.93      0.87      1601

    accuracy                           0.81      2400
   macro avg       0.81      0.75      0.77      2400
weighted avg       0.81      0.81      0.80      2400

