# Data Science 346 Project Stellenbosch University
### Team:
- David Nicolay
- Kellen Mossner
- Matthew Holm

In [12]:
# Libraries
import pandas as pd
from transformers import pipeline, AutoTokenizer

# Part 1: Summarization

Load data:

In [2]:
# Load the Goodreads reviews CSV into a DataFrame and preview the first rows
reviews_csv_path = "../WebScrapingExplore/data/goodreads_reviews_all.csv"
reviews = pd.read_csv(reviews_csv_path)
reviews.head()

Unnamed: 0,Book Title,Link,Review Text,Review Date,Review Stars,Review Likes
0,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,This book is based on a television series whic...,"September 29, 2014",5,513
1,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"I am not the audience for this book, mainly be...","June 3, 2014",3,216
2,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"Way of Seeing, John Berger Ways of Seeing is a...","October 21, 2021",4,0
3,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,"First of all, this entire book is set in bold....","May 25, 2008",4,106
4,Ways of Seeing,https://www.goodreads.com/book/show/2784.Ways_...,This was a great introduction to the work of J...,"March 12, 2020",4,80


In [3]:
# Print column dtypes and non-null counts to spot columns with missing values
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29519 entries, 0 to 29518
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Book Title    29519 non-null  object
 1   Link          29519 non-null  object
 2   Review Text   29247 non-null  object
 3   Review Date   29519 non-null  object
 4   Review Stars  29519 non-null  int64 
 5   Review Likes  29519 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.4+ MB


Initializing the pipeline takes a while on the first run, because it downloads the model weights (about 1.6 GB).

In [13]:
# Model identifier used for both the tokenizer and the summarization pipeline
model_name = "facebook/bart-large-cnn"
# Load the tokenizer that belongs to `model_name`
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Build the summarization pipeline for `model_name`; the functions and cells below call it
summarizer = pipeline("summarization", model=model_name)



Because the summarizer has a restricted input length, the review text needs to be divided into chunks.

In [16]:
def chunk_text(text, max_chunk_size=500):
    """Split text into whitespace-delimited chunks of bounded character length.

    Words are never split. Consecutive words are joined with a single space
    and packed greedily, so every chunk is at most ``max_chunk_size``
    characters long. The only exception is a single word that is itself
    longer than ``max_chunk_size``: it is emitted as a chunk of its own.

    Args:
        text: The string to split. Any run of whitespace counts as one separator.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list if ``text`` contains
        no words.
    """
    chunks = []
    current_chunk = []
    # Length of ' '.join(current_chunk) plus one trailing separator, i.e. the
    # offset at which the next word would start inside the joined chunk.
    current_size = 0
    for word in text.split():
        # Only flush when there is something to flush: an oversized first word
        # must not produce an empty chunk.
        if current_chunk and current_size + len(word) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(word)
        current_size += len(word) + 1  # +1 for the joining space
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def summarize_text(text, max_summary_length=150):
    """Summarize long text by summarizing chunks, then the combined result.

    The text is split with ``chunk_text``, each chunk is summarized with the
    module-level ``summarizer`` pipeline, and the partial summaries are joined
    with spaces. If the joined text is longer than ``max_summary_length``
    characters, it is summarized once more.

    Args:
        text: The text to summarize.
        max_summary_length: Passed to the pipeline as ``max_length`` for every
            call, and also used as the character threshold that triggers the
            second summarization pass.

    Returns:
        The summary string; an empty string if ``text`` contains no words.
    """
    chunks = chunk_text(text)
    summaries = []
    for chunk in chunks:
        # truncation=True asks the pipeline to cut an over-long input to the
        # model's limit instead of failing on it.
        summary = summarizer(
            chunk,
            max_length=max_summary_length,
            min_length=10,
            truncation=True,
        )[0]['summary_text']
        summaries.append(summary)

    final_summary = ' '.join(summaries)
    # NOTE: this compares a character count with the value that is also passed
    # to the pipeline as max_length, so the two uses are not the same unit.
    if len(final_summary) > max_summary_length:
        # The joined partial summaries grow with the number of chunks and can
        # become longer than the model accepts. Truncate rather than crash;
        # text beyond the model's input limit is not seen by this pass.
        final_summary = summarizer(
            final_summary,
            max_length=max_summary_length,
            min_length=30,
            truncation=True,
        )[0]['summary_text']
    return final_summary

Begin by summarizing the reviews of a single book: "Ways of Seeing".

In [35]:
# TODO change model to be able to handle longer length summarize inputs

# Select the reviews of one book. This selection stays active because later
# cells read `book_df`; only the full-book summarization below is disabled.
book_title = "Ways of Seeing"
book_df = reviews[reviews['Book Title'] == book_title]

# # Concatenate all reviews for the book
# all_reviews = ' '.join(book_df['Review Text'].dropna())

# # Generate a summary of the concatenated reviews
# try:
#     summary = summarize_text(all_reviews)
# except Exception as e:
#     print(f"An error occurred: {e}")
#     summary = "Error generating summary"

# # Calculate average rating
# avg_rating = book_df['Review Stars'].mean()

# # Create a new dataframe with the results
# result_df = pd.DataFrame({
#     'Book Title': [book_title],
#     'Review Summary': [summary],
#     'Average Rating': [avg_rating],
#     'Number of Reviews': [len(book_df)]
# })

# # Display the results
# print(result_df)

# # Print some statistics
# print(f"\nTotal length of all reviews: {len(all_reviews)} characters")
# print(f"Length of summary: {len(summary)} characters")

Here we can see that the model does a good job of summarizing, although it essentially just picks out the important sentences. We still need to present the result in a format that explains the readers' overall sentiment.

In [36]:
# Summarize one review (row label 3) to inspect what the model produces
sample_review_text = book_df.loc[3, 'Review Text']
summary_test = summarizer(sample_review_text, max_length=50, min_length=10)

In [37]:
# Display the raw pipeline result for the sample review
summary_test

[{'summary_text': '4 essays and 3 pictorial essays. It seems like museums are doing a lot of things wrong as well as right. Chapter on oil-painting was particularly interesting but it was the last one about advertising (or "publicity"'}]

In [38]:
# Show the full text of the same review (row label 3) for comparison with its summary
book_df.loc[3, 'Review Text']

'First of all, this entire book is set in bold. I don\'t know what crazy crazyman let that through the gate at Penguin but I just felt I had to point it out right away. It\'s still worth reading. 4 essays and 3 pictorial essays. Really interesting stuff cutting away some of the bullshit associated with our appreciation of art. It seems like museums are doing a lot of things wrong as well as right. Chapter on oil-painting was particularly interesting but it was the last one about advertising (or "publicity" as it\'s exclusively referred to in this book) that has me thinking. Advertising not only needs you to want this shirt, this car, the entire industry must endeavor to narrow the scope of your desires to make you amenable to the culture. The mindset must always be a future, better you achieved through important purchases. The essay is horrifying enough until you realise that it\'s thirty years old, and this is now only one facet of a business that\'s grown much more insidious. The ads

### Summarize all book reviews

# Part 2: Category Generation