# Data Science 346 Project Stellenbosch University
### Team:
- David Nicolay
- Kellen Mossner
- Matthew Holm

In [96]:
# Libraries
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

from langdetect import detect, DetectorFactory
import re
from sklearn.feature_extraction.text import TfidfVectorizer

import random
import time

# Set Random seed for reproducible results
random_seed = 100

In [53]:
# Ensure NLTK packages downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kelle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kelle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kelle\AppData\Roaming\nltk_data...


True

# Data Pre-processing

Load data:

In [54]:
# Import Data
reviews = pd.read_csv("../WebScrapingExplore/data/goodreads_reviews_all.csv")
book_genres = pd.read_csv("../WebScrapingExplore/data/book_info.csv")

reviews = pd.merge(reviews, book_genres, on='Book Title', how='inner')
reviews.head()

Unnamed: 0,Book Title,Link,Review Text,Review Date,Review Stars,Review Likes,Genres,First Published Date,Author
0,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,This book is based on a television series whic...,"September 29, 2014",5,513,"Art, Nonfiction, Philosophy, Essays, Art Histo...","January 1, 1972",John Berger
1,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"I am not the audience for this book, mainly be...","June 3, 2014",3,216,"Art, Nonfiction, Philosophy, Essays, Art Histo...","January 1, 1972",John Berger
2,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"Way of Seeing, John Berger Ways of Seeing is a...","October 21, 2021",4,0,"Art, Nonfiction, Philosophy, Essays, Art Histo...","January 1, 1972",John Berger
3,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"First of all, this entire book is set in bold....","May 25, 2008",4,106,"Art, Nonfiction, Philosophy, Essays, Art Histo...","January 1, 1972",John Berger
4,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,This was a great introduction to the work of J...,"March 12, 2020",4,80,"Art, Nonfiction, Philosophy, Essays, Art Histo...","January 1, 1972",John Berger


In [4]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29519 entries, 0 to 29518
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Book Title    29519 non-null  object
 1   Link          29519 non-null  object
 2   Review Text   29247 non-null  object
 3   Review Date   29519 non-null  object
 4   Review Stars  29519 non-null  int64 
 5   Review Likes  29519 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.4+ MB


Through investiagting our database further

In [64]:
# Function to detect language
def is_english(review):
    try:
        return detect(review) == 'en'
    except:
        return False

In [65]:
# Filter only English reviews- This may take a while
reviews = reviews[reviews['Review Text'].apply(is_english)]

# Part 1: Summarization

## Transformers

Initializing the pipeline will take a while to run at first, since this function downloads the model weights (about 1.6gb).

In [13]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
summarizer = pipeline("summarization", model=model_name)



Due to restricted input length of the summarizer the reviews text needs to be divided into chunks.

In [16]:
def chunk_text(text, max_chunk_size=500):
    words = text.split()
    chunks = []
    current_chunk = []
    current_size = 0
    for word in words:
        if current_size + len(word) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = len(word)
        else:
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for space
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def summarize_text(text, max_summary_length=150):
    chunks = chunk_text(text)
    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=max_summary_length, min_length=10)[0]['summary_text']
        summaries.append(summary)
    
    final_summary = ' '.join(summaries)
    if len(final_summary) > max_summary_length:
        final_summary = summarizer(final_summary, max_length=max_summary_length, min_length=30)[0]['summary_text']
    return final_summary

Begin by summarizing 1 book's reviews - "Ways of Seeing"

Here we can have a look at how the model does a good job of summarizing (but it essentially picks important sentences), however we still need to present it in a format that explains the overall sentiment from the readers.

In [36]:
summary_test = summarizer(book_df.loc[3]['Review Text'], max_length=50, min_length=10)

In [37]:
summary_test

[{'summary_text': '4 essays and 3 pictorial essays. It seems like museums are doing a lot of things wrong as well as right. Chapter on oil-painting was particularly interesting but it was the last one about advertising (or "publicity"'}]

In [38]:
book_df.loc[3]['Review Text']

'First of all, this entire book is set in bold. I don\'t know what crazy crazyman let that through the gate at Penguin but I just felt I had to point it out right away. It\'s still worth reading. 4 essays and 3 pictorial essays. Really interesting stuff cutting away some of the bullshit associated with our appreciation of art. It seems like museums are doing a lot of things wrong as well as right. Chapter on oil-painting was particularly interesting but it was the last one about advertising (or "publicity" as it\'s exclusively referred to in this book) that has me thinking. Advertising not only needs you to want this shirt, this car, the entire industry must endeavor to narrow the scope of your desires to make you amenable to the culture. The mindset must always be a future, better you achieved through important purchases. The essay is horrifying enough until you realise that it\'s thirty years old, and this is now only one facet of a business that\'s grown much more insidious. The ads

## Encoder-Decoder Approach

In [97]:
# ***************************************************************************
def preprocess(reviews):
    """
    Replaces newline characters with spaces
    """
    n_reviews = len(reviews)
    print(f"Number of reviews: {n_reviews}")
    for i in range(n_reviews):
        review = reviews[i]
        # Replace newlines with spaces
        reviews[i] = review.replace('\n', ' ').strip()
        
        
def split_sentences(reviews):
    """
    Splits the reviews into individual sentences.
    """
    n_reviews = len(reviews)
    for i in range(n_reviews):
        review = reviews[i]
        # import nltk
        # nltk.download('punkt')
        sentences = sent_tokenize(review)  # Tokenize into sentences
        # Remove empty sentences and strip spaces
        sentences = [sent.strip() for sent in sentences if sent.strip()]
        reviews[i] = sentences
        
        
def encode_sentences(reviews):
    """
    Obtains sentence embeddings for each sentence in the reviews
    using Sentence-BERT from the sentence-transformers library.
    """
    enc_reviews = [None] * len(reviews)
    cum_sum_sentences = [0]
    sent_count = 0
    
    # Flatten reviews into a list of sentences
    for review in reviews:
        sent_count += len(review)
        cum_sum_sentences.append(sent_count)

    all_sentences = [sent for review in reviews for sent in review]
    print('Loading pre-trained Sentence-BERT model...')
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print('Encoding sentences...')
    enc_sentences = model.encode(all_sentences, show_progress_bar=True)

    # Group back encoded sentences by reviews
    for i in range(len(reviews)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i+1]
        enc_reviews[i] = enc_sentences[begin:end]
        
    return enc_reviews
        
    
def summarize(reviews):
    """
    Performs summarization of book reviews.
    """
    n_reviews = len(reviews)
    summary = [None] * n_reviews
    print('Preprocessing...')
    preprocess(reviews)
    
    print('Splitting into sentences...')
    split_sentences(reviews)
    
    print('Starting to encode...')
    enc_reviews = encode_sentences(reviews)
    print('Encoding Finished')
    
    for i in range(n_reviews):
        enc_review = enc_reviews[i]
        n_clusters = int(np.ceil(len(enc_review) ** 0.5))  # Number of clusters
        
        # Perform KMeans clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
        kmeans = kmeans.fit(enc_review)
        
        avg = []
        closest = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, enc_review)
        
        # Ordering sentences by clusters
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        summary[i] = ' '.join([reviews[i][closest[idx]] for idx in ordering])
    
    print('Clustering Finished')
    return summary


#### What is happening here?
Encoding:
- Sentence-BERT: The function uses a pre-trained Sentence-BERT model ('all-MiniLM-L6-v2') to convert each sentence into a vector embedding. - This embedding is a numerical representation of the sentence that captures its semantic meaning.
- It first flattens all the sentences from the reviews into a single list and then encodes them.
- After encoding, it restructures the embeddings back into their original review groups.

Clustering:
- For each review, the number of clusters is determined using the square root of the number of sentences (rounded up).
- KMeans clustering is performed on the sentence embeddings to group similar sentences.
- For each cluster, **the sentences closest to the cluster center (based on the distance between sentence embeddings) are selected.**

### Applying to a single book's review

In [91]:
reviews_subset = reviews[reviews['Book Title'] == 'Ways of Seeing']

In [92]:
way_of_seeing_reviews = reviews_subset['Review Text'].tolist()
summaries = summarize(way_of_seeing_reviews)

Preprocessing...
Number of reviews: 116
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding sentences...


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Encoding Finished


















Clustering Finished


In [93]:
summaries[1]

'I agree. Which makes it breathtaking to see the authors get so many things wrong, either intentionally (cutting short the bible verse in which God punishes Eve *and Adam*); stupidly (non-Western art forms show women as active participants in sex, so that are isn\'t morally dubious); or in ways that are, ahem, temporally bound ("Hair is associated with sexual power, with passion." In general, the lesson of this book is that all art is bad for you, except the pieces that the authors of this book like. They don\'t discuss the 20th century at all (I know they know that twentieth century art exists; perhaps, as good Benjaminian Marxists, they don\'t like abstraction or difficulty). I am not the audience for this book, mainly because I\'ve already read and more or less digested the handful of essays and ideas on which it is based. Good for what it is, but extremely narrow in scope, and quite harmful for anyone who swallows it whole rather than taking a few minutes to worry away at its assum

In [37]:
reviews['Review Text'][1]

'I am not the audience for this book, mainly because I\'ve already read and more or less digested the handful of essays and ideas on which it is based. The seven chapters break down fairly simply. 1: Benjamin\'s \'Work of Art\'--the ability to reproduce images alters the way we encounter works of art. This seems reasonable. Nobody gets to see a Giotto without having seen a reproduction first, except someone who has no interest in the Giotto in the first place. But Berger et al* go a step further: we need to use the fact that we encounter works of art differently to undermine the ruling class\'s privilege and the "specialized experts who are the clerks of the nostalgia of a ruling class in decline." That\'s on page 32. Part of me, a large part, laments the fact that you\'d never get that published today, not even on a website. Another part of me laments the stupidity of intellectuals who put their faith in the inherent goodness of The People. The People does not have a good track record

## Full Book Review Summary Generation
Now with the ability to create summaries for each review individually, we can generate one last summary - one that describes how most readers perceive the book (themes, flaws, strengths, etc). First, let's generate another set of summarized reviews.

---


#### Applying Review Summaries to Born A Crime by Trevor Noah

In [36]:
import warnings
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak on Windows with MKL")

In [65]:
born_a_crime_subset = reviews[reviews['Book Title'] == 'Born a Crime: Stories From a South African Childhood']
born_a_crime_reviews = born_a_crime_subset['Review Text'].tolist()
born_a_crime_summaries = summarize(born_a_crime_reviews)

Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished


In [89]:
born_a_crime_summaries[45]

"Noah sums it up very well: in America we had 1) forced removal of the native population to reservations, 2) slavery, followed by 3) segregation. Like in the best books, you learn some important history in the telling of his young life's story. One of the many things I didn't know about South Africa is that as most countries were trying to fix racist policies after the World War 2 holocaust had shown us where discrimination could lead, the South Africans or Afrikaners (as the Dutch colonists called themselves) were running towards institutionalized racism. This man's talent knows no ceiling."

In [91]:
born_a_crime_reviews[45]

["One of the best memoirs I've ever read, Born a Crime is so funny and wise and sad and reveals much about the writer.",
 "Trevor Noah is an exceptional young man raised by a strong, formidable mother who never gave up on him and loved him with a fierce pride.He never shrinks from complete honesty in the telling, even in those areas that don't reflect well on his teenaged and early adult self.",
 'One very amusing incident tells of how as a kid his rejection of going to the outside toilet in the rain led to a most unfortunate incident with his blind grandma.',
 "Like in the best books, you learn some important history in the telling of his young life's story.",
 'Apartheid was made officially part of South African government in 1948; whereas, before you had casual, implied racism, now it was a system of specific laws that rated you as a person.',
 'There were different rules for whites, colored (people descended from the first white settlers and the natives), blacks, Indians and Asians

---

So far so good - the summaries skip over some information here and there, but at a glance, they look quite accurate. Now let's take it a step further and generate a final summary that should encapsulate the core themes and ideas mentioned in the reviews.

---


In [94]:
def summarize_final(all_summaries):
    """
    Performs summarization of the combined summaries.
    """
    all_summaries = [summary.replace('\n', ' ').strip() for summary in all_summaries]
    
    from nltk.tokenize import sent_tokenize
    all_sentences = []
    for summary in all_summaries:
        sentences = sent_tokenize(summary)
        sentences = [sent.strip() for sent in sentences if sent.strip()]
        all_sentences.extend(sentences)
    
    print('Loading pre-trained Sentence-BERT model...')
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print('Encoding sentences...')
    enc_sentences = model.encode(all_sentences, show_progress_bar=True)

    n_clusters = int(np.ceil(len(enc_sentences) ** 0.5))
    print('Clustering sentences...')
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans = kmeans.fit(enc_sentences)
    
    avg = []
    closest = []
    for j in range(n_clusters):
        idx = np.where(kmeans.labels_ == j)[0]
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, enc_sentences)
    
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    final_summary = ' '.join([all_sentences[closest[idx]] for idx in ordering])
    
    print('Final summary generated')
    return final_summary

In [99]:
final_born_a_crime_summary = summarize_final(born_a_crime_summaries)

Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated


In [101]:
final_born_a_crime_summary

"He was just Robert to Trevor, someone who was nice to him on Sundays and Christmas holidays, and on all those days when he was not giving in to hungry black women his Swiss, precious, and fair sperm. If you have a chance to listen to the book, instead of reading it, I recommend this experience highly. Noah's love and respect for his mother & the way she raised him shines through on nearly every page. Eye-opening and perspective changing in a way that's funny and deeply vulnerable, you'll feel educated and entertained at the same time. Trevor Noah is a remarkable person. I LOVED this book! I liked reading about his childhood. But Trevor Noah expertly maneuvers through different scenes--much of it due to his mixed background--and explains so much history with apartheid and the different tribes and the colored thing and life in the townships and life in the homeland and so on... South Africa does of course have it's problems, yet after this book one has to be at least more empathetic. . 

---

Ignoring the **extremely** odd first sentence, this final generated summary mentions essentially every aspect that you can find in most reviews (audiobook being better, Noah's relationship with his mother, apartheid and the disconnect that some readers have with it, etc). 

---

For the sake of coherency, we can now parse this final summary to `facebook/bart-large-cnn` with a prompt to reframe the summary as a book description.

`facebook/bart-large-cnn` is a fine-tuned version of BART (Bidirectional and Auto-Regressive Transformer), a transformer-based sequence-to-sequence model introduced by Facebook AI, combining the advantages of bidirectional and auto-regressive models. The model uses an encoder-decoder architecture: the encoder processes the entire input (like BERT), and the decoder generates output token-by-token (like GPT).

Pretrained on various denoising tasks to learn language patterns, BART was fine-tuned specifically on the CNN/DailyMail dataset to improve its summarization capabilities. During fine-tuning, the decoder attends to the encoder's output to generate contextually appropriate summaries (Attention Mechanisms). Beam search and length penalty were also used to avoid overly short summaries.

The weights were initialized from the original BART model that was pretrained on a large corpus of text using the denoising autoencoding tasks mentioned earlier.

---

In [40]:
def generate_book_description(summary):
    """
    Takes the final summary of reviews and generates a book description.
    """
    description_pipeline = pipeline("text2text-generation", model="facebook/bart-large-cnn")

    prompt = (
        "Convert the following summary into a structured book description that describes the book's content, "
        "themes, and significance: " + summary
    )

    result = description_pipeline(prompt, max_length=250, min_length=100, do_sample=False)

    book_description = result[0]['generated_text']
    #print(book_description)
    return book_description

In [120]:
book_description = generate_book_description(final_born_a_crime_summary)

In [109]:
book_description

"Trevor Noah's love and respect for his mother & the way she raised him shines through on nearly every page. Eye-opening and perspective changing in a way that's funny and deeply vulnerable, you'll feel educated and entertained at the same time. For an enhanced experience, I highly recommend the audiobook version. Moved out of the house at the age of 17 because of his step-dad and was even jailed for using a fake license plate. Imagine being born from a black mother and a white father in a country where interracial relationships were against the law."

---

We can now apply this to any book we like to obtain an overview/description of the contents and themes soley based off of the reviews.

---

## Generating Final Summaries For 10 Books

In [80]:
# Start timer for the whole process
total_start_time = time.time()

unique_books = reviews['Book Title'].unique()
random_books = random.sample(list(unique_books), 20)
book_descriptions_dict = {}

for book in random_books:
    print(book)

print("\n")

for book in random_books:

    book_subset = reviews[reviews['Book Title'] == book]

    book_reviews = book_subset['Review Text'].dropna().tolist()

    if not book_reviews:
        continue
        
    book_summaries = summarize(book_reviews)
    final_book_summary = summarize_final(book_summaries)

    book_description = generate_book_description(final_book_summary)

    book_descriptions_dict[book] = book_description

total_end_time = time.time()
total_duration = total_end_time - total_start_time

print(f"\nTotal processing time: {total_duration:.2f} seconds")

for book, description in book_descriptions_dict.items():
    print("\n")
    print(f"Book Title:\n{book}")
    print(f"Generated Description:\n{description}")
    print("\n")


The Wind in the Willows
The Voyage of the Dawn Treader
Drive: The Surprising Truth About What Motivates Us
Team of Rivals: The Political Genius of Abraham Lincoln
The Art Forger
Winnie-the-Pooh
The Lean Startup (Hardcover)
Harry Potter and the Prisoner of Azkaban
Goodnight Moon
Vincent Van Gogh: The Complete Paintings
Charlie and the Chocolate Factory
Anne of Green Gables
Alexander Hamilton
Wild: From Lost to Found on the Pacific Crest Trail
Purple Cow: Transform Your Business by Being Remarkable
Crossing the Chasm: Marketing and Selling High-Tech Products to Mainstream Customers
On Photography
The Cost of Discipleship
The Work of Art in the Age of Its Technological Reproducibility, and Other Writings on Media
Concerning the Spiritual in Art


Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 240
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 119
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 119
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 119
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 119
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 120
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated
Preprocessing...
Number of reviews: 119
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Encoding Finished
Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Clustering sentences...
Final summary generated

Total processing time: 1374.19 seconds


Book Title:
The Wind in the Willows
Generated Description:




Book Title:
The Voyage of the Dawn Treader
Generated Description:
My favorite in the series so far! I love the whole Narnia series, but this was my favorite. I had such a literary crush on Caspian, and I'd got a lot fonder of Edmund, too. The story is fun for children of all ages if you don't mind the overtness of the Christian allegory in this one. And of course there's the part about Eustace and the dragon, which involves transparently Christian imagery but it works better in this novel than in some of the others.




Book Title:
Drive: The Surprising Truth About What Motivates Us
Generated Description:
Drive takes a deeper look at businesses and owners that that use "motivation 3.0" to appeal towards employees real drives. It offers ideas on how people can inject real motivation in all aspects of their lives: work, child-rearing, pe

---
This process (obviously) took quite a while. To remedy this, we can make use of cloud computing in the form of Google Colab. Using their T4 GPU, it only took 2 minutes (almost 10x faster) to generate the following for another 20 random books:

Total processing time: **141.88 seconds**

**Book Title:**<br>
Zero to One: Notes on Startups, or How to Build the Future<br>
**Generated Description:**<br>
Peter Thiel is a massive contrarian (and I'm against extreme contrarianism and extreme conformity), but his ideas are grounded and logical. He claims, for example, that nothing but information technology and communications technology have evolved much since the 70s. Instead of trying to find a niche in a monopolistic market, startups are supposed to find entirely different ideas. He also provides a number of other areas of thought for the entrepreneur when starting and running a business. I wouldn't generalize Thiel's wisdom to fields outside of startups (just like the case with Paul Graham) -- indeed he made some claims that were not well thought out.

**Book Title:**<br>
The Autobiography of Malcolm X<br>
**Generated Description:**<br>
Malcolm X's life was a struggle. May Allah forgive all his shortcomings and allow us to spread Islam with even a morsel or his enthusiasm, determination and commitment. It is a long and powerful story that reveals another facet of the fight for racial justice and equality. My copy of this book is already in tatters for the work I have done on it but I will buy a new one, frame it perhaps in a glass ceiling, and look at it day and night just to remind myself what a dedicated man can become but most importantly what he can overcome.

**Book Title:**<br>
Bonhoeffer: Pastor, Martyr, Prophet, Spy<br>
**Generated Description:**<br>
The author Eric Metaxas must have spent years researching for this book. Bonhoeffer is an example of a man who loved the Word of God, the Church, and the person of Christ. His ideas/ theology. Was 3-stars, now 2-Stars Comment 20 September, 2023: I first read this book sometime in 2010-2011 - maybe prior to joining Goodreads, but certainly before I commenced writing any reviews. The book is so well-written and compelling. A fascinating life.

**Book Title:**<br>
Surprised by Joy: The Shape of My Early Life<br>
**Generated Description:**<br>
"I was equally angry with Him for creating a world." A young man who wishes to remain a sound Atheist cannot be too careful of his reading. A fairly dull middle, sandwiched between an interesting start and an interesting end. The book deepened my understanding of and appreciation for Lewis as a person, scholar, and author (not to mention adding new layers and depth of his other books). Reading how his life effected what he thought and wrote about is truly interesting. But I really loved this book.

**Book Title:**<br>
The Agony and the Ecstasy<br>
**Generated Description:**<br>
Author Irving Stone is a fantastic writer. I found myself reading slower and slower towards the end, because I did not want to finish reading the book! I felt the author devoted enough time to each event in Michelangelo's life to give it meaning and purpose, but was sure to move on when it was time. By the end of the book, you will feel that you know Michelangelo: family issues, rivalries, the popes, his friends, and his delight and obsession for his art. It also gives a excellent account of the the history and life in Italy in the 16th century.

**Book Title:**<br>
Color and Light: A Guide for the Realist Painter (Volume 2)<br>
**Generated Description:**<br>
James Gurney's book is a clear and concise collection of information about the science of light color as they relate to observational (and imagined!) The lessons are observations of light and colors from James Gurny's years of painting experience. Convert the following summary into a structured book description that describes the book's content, themes, and significance: I learned more about color theory reading this book than I did studying illustration for two years at Pratt Institute. I will definitely be revisiting this book often!

**Book Title:**<br>
The Art Thief: A True Story of Love, Crime, and a Dangerous Obsession<br>
**Generated Description:**<br>
This true story of a one of a kind criminal reads like fiction and I thoroughly enjoyed it. Police estimates he's stolen between $1 to 2 billion worth of art. He steals because he loves art. Obsession, madness, narcissism. He lived in an attic in his mother’s house with Anne-Catherine for most of his life where he accumulated his stolen art like a pack-rat. I could not stop listening to this audiobook! Nonfiction account of a prolific art heist.

**Book Title:**<br>
Shoe Dog: A Memoir by the Creator of Nike<br>
**Generated Description:**<br>
The book is unexpectedly enjoyable to read - well written and intriguing - and is not your standard entrepreneurship story. The story of Phil Knight is everything books like "Art of the Deal" are not: the humble story of a dream, framed more as an accidental life journey than the story of rags to riches. He came off too proud, full of elitism and arrogance, he’s mysoginistic and extremely privileged. HOWEVER this book was pretty good and inspirational - message to work hard and believe in yourself.

**Book Title:**<br>
How to Win Friends & Influence People<br>
**Generated Description:**<br>
D.C. by Carnegie makes following his principles sound too easy. I think I expected a lot of dated advice since this was published so long ago, but I found myself enjoying this and also understanding why it remains so popular after so many decades. Some other things give me this weird feeling of 'fake superficiality' and it decribes a world where people can't disagree or be frank with eachother. But that's really just me trying to find something positive (using the "principles") in a book that I am still trying to UNlearn.

**Book Title:**<br>
Anne of Green Gables<br>
**Generated Description:**<br>
This book made me cry. My daughter loved this book over the summer, one of the classics I’ve never read and it’s now firmly on my To Read list. I loved the writing so much and was surprised how lovely it is. Marilla and Rachel were great as well and I feel like Marilla had the most satisfying character development. This will be my comfort read, I'm obsessed and can't wait to watch the show Also Gilbert has my whole heart, so do all the other characters omg

**Book Title:**
The Practice of the Presence of God<br>
**Generated Description:**<br>
The life of Brother Lawrence is testimony to his writings; his single-minded concern for God, far from leading him away from love of people, brought him closer to them. The book is less practical than I was expecting, in terms of specific tips for spiritual disciplines. There were also times when his theology seemed a bit heavy on the idea that God sends all inflictions and suffering and sickness to purge/cure our soul. “We ought not to be weary of doing little things for the love of God, who regards not the greatness of the work, but with which it is performed”

**Book Title:**<br>
The Horse and His Boy<br>
**Generated Description:**<br>
This is by far the best Narnia book. As a lover of redemption arcs, I find this story very satisfying. I love the relationship between Shasta and the talking horse, Bree. So many good scenes with Aslan. Several of his moments moved me almost to tears. The prince agrees, because he's so certain his evil plan will work. And yes, I love it. But a boy in battle is a danger only to his own side. I should have read this book sooner >.<

**Book Title:**<br>
Alexander Hamilton<br>
**Generated Description:**<br>
At over 800 pages, it’s a hefty read, and there were times when I felt bogged down by the sheer volume of detail. The author sometimes went so far in depth in creating supporting characters histories that I felt, at times, that I was reading someone else's biography. It's dense and has some passages that are somewhat dull, and even the succinct writing style of Chernow didn't ease it much. I would only recommend this to history buffs and someone researching Hamilton's life. Though it took 36 hours, it engaged my interest throughout.

**Book Title:**<br>
Dreams from My Father: A Story of Race and Inheritance<br>
**Generated Description:**<br>
The book traces Obama's quest for self and purports that as a mixed race person with his family ties split between two continents and two cultures, he is confused and torn. I both understood and was puzzled by some of his feelings of loathing and anger towards himself and US society. I personally could connect with various aspects of his struggle and the larger struggle of the black community. The Kenya story is beautiful, but becomes a telling by his grandmother for 40 to 50 pages. I’m giving it 4 stars only because I felt as though the middle portion of the book dragged on a bit.

**Book Title:**<br>
Color: A Natural History of the Palette<br>
**Generated Description:**<br>
Victoria Finlay is a very interesting person. She is lucky, too late for me to ever see them now, the extent of her journeys in the book is remarkable. I was expecting to read more of a history book, but it turned out to be a travelogue/memoir. The author is definitely part of the story, but for me, it lent a human aspect and interest that may otherwise have been lacking. Although a fan of micro-histories could certainly enjoy this book, they shouldn’t go in expecting a Simon Winchester style non fiction book.

**Book Title:**<br>
The Art Book<br>
**Generated Description:**<br>
This isn't an in-depth exploration. The pictures are of a very good quality and the book itself is a monster. Lots of diversity in terms of styles of art shown (not so much diversity in the artists) This is a wonderful book for art fans, as well as those who are art curious. Not for the academics, but that's part of the appeal. I used to flip through a copy of this and the companion Photography book at Superstore while my Mom shopped for groceries.

**Book Title:**<br>
The Intelligent Investor<br>
**Generated Description:**<br>
Graham is known for inspiring Warren Buffett, and many other major investors. He goes through different types of investors, starting from the defensive investor who is someone a lot more careful to speculate. Majority of stock buybacks are done so to counteract the execution of employee options. The key to value investing is purchasing stocks that are selling well below their “intrinsic” value. This is the best book on investing. A definitive read for those looking for a disciplined approach to investment. Many claim that it is still entirely relevant, despite its age.

**Book Title:**<br>
Deep Work: Rules for Focused Success in a Distracted World<br>
**Generated Description:**<br>
"Deep Work" is about why and how to manage to work deeply, i.e., producing precious value. To learn quickly, you need to study for long periods of time consistently. Social media, if used without limit, can be particularly devastating to your quest to work deeper. To conclude, I’d like to recommend ‘Deep Work’ to anyone seeking to develop a more productive work routine. Read David Allen instead, whose ideas permeate this book to a degree, but who cannot be quoted every second page.

**Book Title:**<br>
Mere Christianity<br>
**Generated Description:**<br>
"I truly loved this book and recommend to all Christians ... even people who aren’t Christians will get a great grasp of what Christianity is all about" "I was alight with curiosity. The concepts and thoughts he introduced were really interesting, because what Lewis does is he takes normal, everyday concepts for a Christian and examines them to the point that you completely understand where and how and why they are" "His style of explaining things in such a blunt, straightforward manner was so relatable at times"

---
Generating summaries from book reviews is a powerful approach to capturing the essence of a book through the perspectives of multiple readers. By condensing a range of opinions, experiences, and highlights from reviews, we can create a more balanced and comprehensive description that goes beyond a typical synopsis. This method of summary generation can be applied to many other forms of reviews (product, location, etc) and has been done by companies like Amazon for their product reviews.

---

# Part 2: Manual Deep Learning

## Star Prediction

Users give a book a _Star Rating_ along with leaving a _text review_ on a book, we investigate the question: "Can a Neural Network be used to accurately predict the number of stars of a review based on the text?".

In [12]:
reviews['Review Text'].isnull().sum()

272

Some reviews texts are empty, it is quite few. Therefore we can safely omit these reviews from the dataset.

In [15]:
reviews_noNA = reviews.dropna(subset=['Review Text'])

In [16]:
# Setup target and predictor datasets
X = reviews_noNA['Review Text'].values
y = reviews_noNA['Review Stars'].values

We now use the `TfidfVectorizer` from `sklearn` to vectorize the text into term-frequency inverse-document frequency matrix form. This approach helps capture the importance of words across the dataset, where terms that occur frequently in a document but rarely across all documents are given higher weight, while common terms across the corpus are downweighted. The `LabelEncoder` converts categorical labels into numerical ones to feed into the neural net.

In [40]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=random_seed)

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Convert labels to categorical
num_classes = len(np.unique(y_train))
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [41]:
model = tf.keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax') # Softmax for categorical classification
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.3, verbose=1)

Epoch 1/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.4791 - loss: 1.2619 - val_accuracy: 0.5450 - val_loss: 1.0771
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6591 - loss: 0.8290 - val_accuracy: 0.5325 - val_loss: 1.1296
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.8196 - loss: 0.5126 - val_accuracy: 0.5027 - val_loss: 1.4720
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9540 - loss: 0.1712 - val_accuracy: 0.5071 - val_loss: 2.1783
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9884 - loss: 0.0439 - val_accuracy: 0.4813 - val_loss: 2.8878
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9955 - loss: 0.0197 - val_accuracy: 0.4751 - val_loss: 3.2752
Epoch 7/10
[1m512/512[0m 

In [42]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test.toarray(), y_test, verbose=0)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.499


A test accuracy around 0.5 means we could have predicted 2.5 stars for every review and gotten the same accuracy. The training accuracy seems to increase drastically however validation accuracy remains steady at around 0.51. This indicates overfitting.

What appears to be interesting is that despite the model statistically indicating overfitting, the predictions it makes can actually be quite reliable.
 Consider an example of a bad review. "This book was absolutely terrible! How could you think this was a good idea."

In [29]:
# BAD review example
sample_review = ["This book was absolutely terrible! How could you think this was a good idea."]
sample_review_tfidf = vectorizer.transform(sample_review)
prediction = model.predict(sample_review_tfidf.toarray())
predicted_rating = np.argmax(prediction) + 1
print(f'Predicted rating: {predicted_rating}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted rating: 1


Consider an example of a long mixed review:

In [30]:
sample_review = ["I had high hopes for The Infinite Horizon after hearing so much about it. From the beginning, the premise seemed promising, and for the most part, it delivers on its intriguing concept. The plot revolves around a futuristic world where society grapples with the boundaries of artificial intelligence, humanity, and survival—concepts that have always fascinated me. The world-building is impressive, with detailed landscapes and a unique societal structure that keeps you hooked initially. The author has clearly put a lot of thought into constructing the futuristic world, and it shows in the vivid descriptions and creative technologies. However, while the world-building is rich, the characters left much to be desired. The protagonist, Lila, felt underdeveloped. I found myself frustrated at several points because her motivations were either unclear or inconsistent. In the beginning, she starts off as a strong, determined character, but midway through, her actions seem erratic and her growth stagnates. The dialogue, too, felt stilted at times, making it hard to connect with the characters emotionally. There were a few moments where I felt the conversations between key characters were forced, almost like they were inserted to explain plot points rather than feeling organic.On the flip side, I have to give credit where it’s due—the pacing of the story is solid for the most part. There are intense moments where you’re on the edge of your seat, particularly during the battle scenes. These scenes were written with such vivid detail that I could easily imagine them playing out in a movie. The action sequences are well thought out, and they definitely add excitement to the narrative. That being said, there were also moments where the pacing lagged, especially in the middle sections. Some chapters felt like filler, dragging on with unnecessary exposition and side plots that didn’t add much to the overarching story."]
sample_review_tfidf = vectorizer.transform(sample_review)
prediction = model.predict(sample_review_tfidf.toarray())
predicted_rating = np.argmax(prediction) + 1
print(f'Predicted rating: {predicted_rating}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Predicted rating: 3


The rating prediction appears to be somewhat reliable even with longer mixed reviews.

#### Alternative models to reduce overfitting

In [43]:
# Alternative model incorporating Dropout
model = tf.keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.3, verbose=1)

Epoch 1/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.4549 - loss: 1.3119 - val_accuracy: 0.5340 - val_loss: 1.1000
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5988 - loss: 0.9584 - val_accuracy: 0.5446 - val_loss: 1.1051
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6959 - loss: 0.7445 - val_accuracy: 0.5276 - val_loss: 1.2130
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7953 - loss: 0.5510 - val_accuracy: 0.5144 - val_loss: 1.4226
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8697 - loss: 0.3718 - val_accuracy: 0.4957 - val_loss: 1.7808
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9055 - loss: 0.2650 - val_accuracy: 0.5031 - val_loss: 2.1641
Epoch 7/10
[1m512/512[0m 

In [44]:
test_loss, test_accuracy = model.evaluate(X_test.toarray(), y_test, verbose=0)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.492


Adding dropout layers does not improve the statistical overfitting. Z

In [45]:
# Use L2 Regularization to attempt to improve overfitting through weight decay
model = tf.keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [46]:
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.3, verbose=1)
test_loss, test_accuracy = model.evaluate(X_test.toarray(), y_test, verbose=0)
print(f'Test accuracy: {test_accuracy:.3f}')

Epoch 1/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.4636 - loss: 1.9923 - val_accuracy: 0.4724 - val_loss: 1.3309
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4699 - loss: 1.3186 - val_accuracy: 0.4724 - val_loss: 1.3181
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4777 - loss: 1.3026 - val_accuracy: 0.4724 - val_loss: 1.3107
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4710 - loss: 1.3012 - val_accuracy: 0.4724 - val_loss: 1.3085
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4808 - loss: 1.2941 - val_accuracy: 0.4724 - val_loss: 1.3074
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4784 - loss: 1.2939 - val_accuracy: 0.4724 - val_loss: 1.3067
Epoch 7/10
[1m512/512[0m 

When applying L2 regularization the result is the model tends to _underfit_. We see this because both the training and validation accuracies are low and nearly identical, meaning the model is not complex enough to capture the underlying patterns in the data.

# Part 3: Aspect-Based Sentiment Analysis

Books are often reviewed based on certain aspects, for example, a fiction book on the plot, characters or maybe the setting. Readers have diverse tastes and some may prioritise plot over characters or maybe emotion over humour. We therefore attempt an Aspect-Based Sentiment Analysis (ABSA) of our book reviews. This can help identify strengths and can guide authors and publishers towards understanding the reader's perspective. By highlighting both positive and negative aspects, we can propose a balanced critique with a well-rounded perspective. This also enhances decision-making for readers who do not want to sift through hundreds of wordy reviews to gauge a sentiment on a specific aspect of the book.

## Simple Approach

A simple and interpretable approach to this problem is to:
1. Pre-define certain aspects.
2. Select sentences containing that aspect.
3. Compute sentiment of those sentences.

In [57]:
# Define aspects we're interested in
aspects = ["plot", "characters", "writing", "pacing", "setting", "structure", "emotion", "humor"]

def preprocess_lemmatize(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

def extract_aspects(tokens):
    extracted_aspects = []
    for aspect in aspects:
        if aspect in tokens:
            extracted_aspects.append(aspect)
    return extracted_aspects

def analyze_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

def extract_aspect_sentences(text, aspect):
    sentences = sent_tokenize(text)
    return [sent for sent in sentences if aspect in sent.lower()]

def analyze_review(review):
    tokens = preprocess_lemmatize(review)
    extracted_aspects = extract_aspects(tokens)
    results = {}
    
    for aspect in extracted_aspects:
        aspect_sentences = extract_aspect_sentences(review, aspect)
        if aspect_sentences:
            sentiment = sum(analyze_sentiment(sent) for sent in aspect_sentences) / len(aspect_sentences)
            results[aspect] = {
                "sentiment": sentiment,
                "sentences": aspect_sentences
            }
    
    return results

In [58]:
# Best case example
example_review = """
The book had an intriguing plot that kept me guessing until the end. 
The characters were well-developed and relatable. 
However, the pacing was a bit slow in the middle sections. 
The writing style was eloquent, and the author's descriptions of the setting were vivid.
"""

results = analyze_review(example_review)

for aspect, data in results.items():
    print(f"Aspect: {aspect}")
    print(f"Sentiment: {data['sentiment']:.2f}")
    print("Supporting sentences:")
    for sentence in data['sentences']:
        print(f"- {sentence}")
    print()

Aspect: plot
Sentiment: 0.30
Supporting sentences:
- 
The book had an intriguing plot that kept me guessing until the end.

Aspect: writing
Sentiment: 0.12
Supporting sentences:
- The writing style was eloquent, and the author's descriptions of the setting were vivid.

Aspect: pacing
Sentiment: -0.15
Supporting sentences:
- However, the pacing was a bit slow in the middle sections.

Aspect: setting
Sentiment: 0.12
Supporting sentences:
- The writing style was eloquent, and the author's descriptions of the setting were vivid.



In [66]:
# Ensure we have the correct subset
born_a_crime_subset = reviews[reviews['Book Title'] == 'Born a Crime: Stories From a South African Childhood']

In [44]:
born_a_crime_subset['Review Text'][6959]



In [32]:
born_a_crime_subset['Review Text'][7919]
results = analyze_review(born_a_crime_subset['Review Text'][7919])

for aspect, data in results.items():
    print(f"Aspect: {aspect}")
    print(f"Sentiment: {data['sentiment']:.2f}")
    print("Supporting sentences:")
    for sentence in data['sentences']:
        print(f"- {sentence}")
    print()

Aspect: humor
Sentiment: -0.02
Supporting sentences:
- I inherently knew that they wouldn't pick someone with a sense of humor and style identical to Stewart's, but I felt that Noah was so different that his selection meant the show would have a really different feel, which might not appeal to long-time fans of the show.
- While the book does include some of the wry humor that has begun endearing him to fans, this is an emotional, brutal, and educational story of a life which flourished despite the odds stacked against it.



## Advanced Approach

The ABSA field is an extremely active area of research. A popular dataset which is often used as a benchmark for ABSA models is the [SemEval 2014 Task 4](https://paperswithcode.com/sota/aspect-based-sentiment-analysis-on-semeval). We considered many alternative papers and datasets, and finally settled on [InstructABSA](https://arxiv.org/abs/2302.08624v6) due to its incredible performance on benchmarks and ease of use.




### InstructABSA Architecture
This model introduces positive, negative, and neutral examples to each training sample, and instruction tune the model [Tk-Instruct](https://aclanthology.org/2022.emnlp-main.340/) for ABSA subtasks.

Let $S_i$ represent the $i^{th}$ review sentence in the training sample, where $S_i = {w_{i}^1, w_{i}^2, ..., w_{i}^n}$ with $n$ as the number of tokens in the sentence. 
Each $S_i$ contains a set of aspect terms denoted by $A_i = {a_{i}^1, a_{i}^2, ..., a_{i}^m} | m \le n$, and the corresponding opinion terms and sentiment polarities for each aspect term are denoted by $O_{i} = {o_{i}^1, o_{i}^2, ..., o_{i}^m}$ and $SP_{i} = {sp_{i}^1, sp_{i}^2, ..., sp_{i}^m}$ respectively, where $sp_i^k \in [ positive, negative, neutral ]$. 
\
\
The ABSA tasks are described as follows:\
ATE: $A_i = LM_{ATE}(S_i)$\
ATSC: $sp_i^k = LM_{ATSC}(S_i, a_i^k)$\
ASPE: $[A_i, SP_i] = LM_{ASPE}(S_i)$\
AOOE: $o_{i}^k = LM_{AOOE}(S_i, a_i^k)$\
AOPE: $[A_i, O_i] = LM_{AOPE}(S_i)$\
AOSTE: $[A_i, O_i, SP_i] = LM_{AOSTE}(S_i)$\
\
In these equations, $LM$ represents the language model, and the corresponding inputs and outputs are defined accordingly. As part of our approach, we instruction tune $LM_{subtask}$ by prepending task-specific prompts to each input sample to arrive at $LM_{subtask}^{Inst}$ 

![ABSASubtasks.png](images/ABSASubtasks.png)

Image Source: [InstructABSA](https://arxiv.org/abs/2302.08624v6) 

![overview.png](images/overview.png)
Image Source: [InstructABSA](https://arxiv.org/abs/2302.08624v6) 

In [83]:
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [102]:
tokenizer = AutoTokenizer.from_pretrained("kevinscaria/atsc_tk-instruct-base-def-pos-neg-neut-combined")
model = AutoModelForSeq2SeqLM.from_pretrained("kevinscaria/atsc_tk-instruct-base-def-pos-neg-neut-combined")

# Predefine prompt for aspect based sentiment analysis
bos_instruct = """Definition: The output will be the aspects and their sentiment polarity for book reviews. The aspects to be considered are exclusively: plot, characters, writing, pacing, setting, structure, emotion, and humor. In cases where none of these aspects are mentioned, the output should be noaspectterm:none. The sentiment can be positive, negative, or neutral.
Positive examples:
Input: The characters were so well-developed, I felt like I knew them personally by the end of the book.
Output: characters:positive
Input: The author's writing style was captivating, and the plot kept me guessing until the very end.
Output: writing:positive, plot:positive
Input: The pacing was perfect, with just the right balance of action and reflection, and the Victorian London setting was vividly described.
Output: pacing:positive, setting:positive
Negative examples:
Input: The plot had too many holes and inconsistencies to be enjoyable.
Output: plot:negative
Input: While the concept was interesting, the writing felt amateurish and the characters were one-dimensional.
Output: writing:negative, characters:negative
Input: The book's structure was confusing, jumping between timelines without any clear purpose.
Output: structure:negative
Neutral examples:
Input: The setting was a typical fantasy world, neither particularly innovative nor disappointing.
Output: setting:neutral
Input: The author attempted to inject humor into the story, but it didn't particularly stand out.
Output: humor:neutral
Input: The emotional content of the book was present but didn't significantly impact my reading experience.
Output: emotion:neutral
Mixed sentiment example:
Input: The plot was gripping, but the pacing felt off, with some sections dragging on too long.
Output: plot:positive, pacing:negative
No relevant aspect example:
Input: The book arrived on time and in good condition.
Output: noaspectterm:none
Now complete the following example-
Input:"""
delim_instruct = ' The aspect is '
eos_instruct = '.\noutput:'

def book_absa(book_title):
    book_subset = reviews[reviews['Book Title'] == book_title]
    book_reviews = book_subset['Review Text'].tolist()
    book_summaries = summarize(book_reviews)
    
    book_final_summary = summarize_final(book_summaries)
    
    text = f'''{book_final_summary}'''
    aspect_sentiment_dict = {}
    
    for aspect_term in aspects:
        tokenized_text = tokenizer(bos_instruction + text + delim_instruct + aspect_term + eos_instruct, return_tensors="pt")
        output = model.generate(tokenized_text.input_ids)
        sentiment_output = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # Store the aspect term and its corresponding sentiment in the dictionary
        aspect_sentiment_dict[aspect_term] = sentiment_output
        print(f'Model output for {aspect_term}: ', sentiment_output)
    
    return aspect_sentiment_dict

#### Print the ABSA for the highest rated book

In [103]:
reviews['Review Stars'] = pd.to_numeric(reviews['Review Stars'], errors='coerce')

# Group by 'Book Title' and calculate the average rating
average_ratings = reviews.groupby('Book Title')['Review Stars'].mean()

# Find the book with the highest average rating
highest_rated_book = average_ratings.idxmax()
highest_average_rating = average_ratings.max()

print(f'Book with the highest average rating: {highest_rated_book} (Average Rating: {highest_average_rating})')

Book with the highest average rating: Night (Average Rating: 4.762711864406779)


In [104]:
absa_sentiments = book_absa('Night')

Preprocessing...
Number of reviews: 118
Splitting into sentences...
Starting to encode...
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

Encoding Finished












Clustering Finished
Loading pre-trained Sentence-BERT model...
Encoding sentences...


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Clustering sentences...


Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors


Final summary generated




Model output for plot:  positive
Model output for characters:  positive
Model output for writing:  positive
Model output for pacing:  positive
Model output for setting:  neutral
Model output for structure:  neutral
Model output for emotion:  positive
Model output for humor:  positive


In [105]:
print(absa_sentiments)

{'plot': 'positive', 'characters': 'positive', 'writing': 'positive', 'pacing': 'positive', 'setting': 'neutral', 'structure': 'neutral', 'emotion': 'positive', 'humor': 'positive'}
