In [12]:
import boto3
import gzip
from io import BytesIO
import pandas as pd
import json

# Initialize the S3 client
s3 = boto3.client('s3')

# Define your bucket name and file path
bucket_name = 'abbynlpproject'
file_key = 'goodreads_reviews_fantasy_paranormal.json.gz'

# Download the file from S3 to local memory
obj = s3.get_object(Bucket=bucket_name, Key=file_key)

# Decompress the .gz file
with gzip.GzipFile(fileobj=BytesIO(obj['Body'].read()), mode='rb') as f:
    # Read the file line by line (assuming each line is a separate JSON object)
    data = []
    for line in f:
        try:
            # Decode each line, parse it as JSON and append to the data list
            data.append(json.loads(line.decode('utf-8')))
        except json.JSONDecodeError:
            continue  # Skip any malformed lines or invalid JSON objects

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Check the columns of the book metadata dataframe
print(df.columns)

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')


In [None]:
# Keep only the 'book_id' and 'review_text' columns
df_cleaned = df[['book_id', 'review_text', 'rating']]

# Display the cleaned dataframe
print(df_cleaned.head())

    book_id                                        review_text  rating
0  18245960  This is a special book. It started slow for ab...       5
1   5577844  A beautiful story. Neil Gaiman is truly a uniq...       5
2  17315048  Mark Watney is a steely-eyed missile man. A ma...       5
3  13453029  A fun fast paced book that sucks you in right ...       4
4  13239822  This book has a great premise, and is full of ...       3


In [10]:
# Define your file key for the book metadata
book_metadata_key = 'goodreads_books_fantasy_paranormal.json.gz'

# Download the book metadata file from S3
book_metadata_obj = s3.get_object(Bucket=bucket_name, Key=book_metadata_key)

# Decompress and load the book metadata (similar to the reviews data)
with gzip.GzipFile(fileobj=BytesIO(book_metadata_obj['Body'].read()), mode='rb') as f:
    book_metadata = []
    for line in f:
        try:
            # Decode and parse the JSON data
            book_metadata.append(json.loads(line.decode('utf-8')))
        except json.JSONDecodeError:
            continue  # Skip any malformed lines

# Convert the book metadata list to a DataFrame
df_books = pd.DataFrame(book_metadata)

# Check the columns of the book metadata dataframe
print(df_books.columns)

# Assuming the column name is 'title' (or whatever is appropriate), use that to merge
df_with_book_names = pd.merge(df, df_books[['book_id', 'title']], on='book_id', how='left')

# Display the first few rows of the merged DataFrame
print(df_with_book_names.head())


Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')
                            user_id   book_id  \
0  8842281e1d1347389f2ab93d60773d4d  18245960   
1  8842281e1d1347389f2ab93d60773d4d   5577844   
2  8842281e1d1347389f2ab93d60773d4d  17315048   
3  8842281e1d1347389f2ab93d60773d4d  13453029   
4  8842281e1d1347389f2ab93d60773d4d  13239822   

                          review_id  rating  \
0  dfdbb7b0eb5a7e4c26d59a937e2e5feb       5   
1  52c8ac49496c153e4a97161e36b2db55       5   
2  885c772fb033b041f42d57cef5be0a43       5   
3  46a6e1a14e8afc82d221fec0a

In [12]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the stopwords and lemmatizer if you haven't already
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text (split it into words)
    words = text.split()
    
    # Remove stopwords and lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    return ' '.join(words)

# Apply the preprocessing function to the review_text column
df_with_book_names['cleaned_review_text'] = df_with_book_names['review_text'].apply(preprocess_text)

# Check the first few rows after preprocessing
print(df_with_book_names[['review_text', 'cleaned_review_text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abbyeast/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abbyeast/nltk_data...


                                         review_text  \
0  This is a special book. It started slow for ab...   
1  A beautiful story. Neil Gaiman is truly a uniq...   
2  Mark Watney is a steely-eyed missile man. A ma...   
3  A fun fast paced book that sucks you in right ...   
4  This book has a great premise, and is full of ...   

                                 cleaned_review_text  
0  special book started slow first third middle t...  
1  beautiful story neil gaiman truly unique story...  
2  mark watney steelyeyed missile man man man bad...  
3  fun fast paced book suck right away doesnt let...  
4  book great premise full beautifully written pr...  


In [14]:
cleaned_df = df_with_book_names[['title', 'book_id', 'cleaned_review_text']]
cleaned_df.head()

Unnamed: 0,title,book_id,cleaned_review_text
0,The Three-Body Problem (Remembrance of Earth’s...,18245960,special book started slow first third middle t...
1,Stardust,5577844,beautiful story neil gaiman truly unique story...
2,The Martian,17315048,mark watney steelyeyed missile man man man bad...
3,"Wool Omnibus (Silo, #1)",13453029,fun fast paced book suck right away doesnt let...
4,Alif the Unseen,13239822,book great premise full beautifully written pr...


In [15]:
import boto3
import pandas as pd
from io import StringIO

# Initialize S3 client
s3 = boto3.client('s3')

# Define S3 bucket and filename
bucket_name = 'abbynlpproject'
file_key = 'cleaned_goodreads_reviews.csv'

# Convert DataFrame to CSV in memory
csv_buffer = StringIO()
cleaned_df.to_csv(csv_buffer, index=False)

# Upload to S3
s3.put_object(Bucket=bucket_name, Key=file_key, Body=csv_buffer.getvalue())

print(f"File saved to S3: s3://{bucket_name}/{file_key}")


File saved to S3: s3://abbynlpproject/cleaned_goodreads_reviews.csv


In [1]:
'''
READ IN CLEANED FILE FROM S3
'''

import boto3
import pandas as pd
from io import StringIO

# Initialize S3 client
s3 = boto3.client('s3')

# Define S3 bucket and file key
bucket_name = 'abbynlpproject'
file_key = 'cleaned_goodreads_reviews.csv'

# Download the file
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
df = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))

# Check the first few rows
print(df.head())

                                               title   book_id  \
0  The Three-Body Problem (Remembrance of Earth’s...  18245960   
1                                           Stardust   5577844   
2                                        The Martian  17315048   
3                            Wool Omnibus (Silo, #1)  13453029   
4                                    Alif the Unseen  13239822   

                                 cleaned_review_text  
0  special book started slow first third middle t...  
1  beautiful story neil gaiman truly unique story...  
2  mark watney steelyeyed missile man man man bad...  
3  fun fast paced book suck right away doesnt let...  
4  book great premise full beautifully written pr...  


In [5]:
# Count the number of NaN values in the 'cleaned_review_text' column
nan_count = df['cleaned_review_text'].isna().sum()
print(f"Number of NaN values in 'cleaned_review_text': {nan_count}")

# Calculate the proportion with higher precision
nan_proportion = df['cleaned_review_text'].isna().sum() / len(df)
print(f"Proportion of NaN values in 'cleaned_review_text': {nan_proportion:.4f}")


Number of NaN values in 'cleaned_review_text': 4894
Proportion of NaN values in 'cleaned_review_text': 0.0014


In [2]:
# Sample a subset of the data (e.g., 10% of the data)
sample_fraction = 0.05  # Adjust this based on your available resources
df_sampled = df.sample(frac=sample_fraction, random_state=42)

# Check the new size
print(df_sampled.shape)

(171232, 3)


In [3]:
from langdetect import detect
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Function to detect if a review is in English
def is_english(text):
    try:
        return detect(text) == 'en'  # Return True if the text is in English
    except:
        return False  # Handle empty or error cases

# Drop missing reviews and filter non-English reviews
df_sampled = df_sampled.dropna(subset=['cleaned_review_text'])
df_sampled = df_sampled[df_sampled['cleaned_review_text'].apply(is_english)]

print(f"Remaining English reviews: {len(df_sampled)}")

Remaining English reviews: 149169


In [4]:
'''
TOPICS for genre/dataset in GENERAL, not by book
before removing common words
'''

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Drop missing reviews
df_sampled = df_sampled.dropna(subset=['cleaned_review_text'])

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df_sampled['cleaned_review_text'])

# Function to compute coherence score (top words per topic)
def compute_coherence_score(model, vectorizer):
    topics = model.components_
    terms = vectorizer.get_feature_names_out()
    coherence = []

    for topic in topics:
        topic_terms = [terms[i] for i in topic.argsort()[-10:]]  # Get top 10 words
        coherence.append(' '.join(topic_terms))

    return coherence

# Train LDA with 5 topics
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)

# Compute Perplexity
perplexity_score = lda_model.perplexity(X)

# Compute Coherence
coherence_score = compute_coherence_score(lda_model, vectorizer)

# Print results
print(f"Perplexity Score: {perplexity_score}")
print("Top Words in Each Topic:")
for i, topic in enumerate(coherence_score):
    print(f"Topic {i+1}: {topic}")


Perplexity Score: 3372.0660480062384
Top Words in Each Topic:
Topic 1: life fantasy time novel like read world character story book
Topic 2: character love good loved fun series great story read book
Topic 3: im series didnt good story read character like really book
Topic 4: amazing harry come love loved wait read series review book
Topic 5: loved know character read really story like series love book


In [7]:
'''
TOPICS for genre/dataset in GENERAL, not by book
after removing common words
'''

# Define a custom list of words to remove
custom_stop_words = {'book', 'story', 'series', 'read', 'novel', 'character', 'love', 'really', 'like', 'time', 'good', 'know', 'im', 'didnt', 'wait'}

# Initialize TF-IDF Vectorizer with improved parameters
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_df=0.8, 
                             min_df=5,  # Ignore words that appear in fewer than 5 reviews
                             max_features=10000,  # Increase vocab size for richer topics
                             ngram_range=(1,2))  # Capture word pairs

X = vectorizer.fit_transform(df_sampled['cleaned_review_text'])

# Function to filter out generic words from topics
def clean_topics(topics, stop_words):
    cleaned_topics = []
    for topic in topics:
        topic_terms = topic.split()
        filtered_terms = [word for word in topic_terms if word not in stop_words]
        cleaned_topics.append(' '.join(filtered_terms))
    return cleaned_topics

# Train LDA with more topics
lda_model = LatentDirichletAllocation(n_components=7, random_state=42)  # More topics for diversity
lda_model.fit(X)

# Compute Coherence
coherence_score = compute_coherence_score(lda_model, vectorizer)

# Remove generic words from topics
filtered_topics = clean_topics(coherence_score, custom_stop_words)

# Print results
print("Filtered Topics:")
for i, topic in enumerate(filtered_topics):
    print(f"Topic {i+1}: {topic}")


Filtered Topics:
Topic 1: felt liked
Topic 2: dont
Topic 3: favorite looking looking forward fantasy great forward
Topic 4: way life world
Topic 5: great come review come loved review
Topic 6: great amazing loved
Topic 7: fairy harry potter potter tale short fun harry


In [9]:
from joblib import Parallel, delayed
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Set minimum number of reviews per book
min_reviews = 10  

# Process only the first 500 books for proof of concept
subset_books = df_sampled['book_id'].value_counts().index[:500]  
df_filtered = df_sampled[df_sampled['book_id'].isin(subset_books)]

# Function to process a single book
def process_book(book_id, group):
    if len(group) < min_reviews:
        return book_id, []  # Skip books with too few reviews

    vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
    X = vectorizer.fit_transform(group['cleaned_review_text'])

    lda_model = LatentDirichletAllocation(n_components=3, random_state=42)  # Only 3 topics per book for speed
    lda_model.fit(X)

    # Extract topics
    terms = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [terms[i] for i in topic.argsort()[-10:]]  # Top 10 words per topic
        topics.append(" ".join(top_words))

    return book_id, topics

# Run topic modeling in parallel
book_groups = df_filtered.groupby('book_id')
results = Parallel(n_jobs=-1)(delayed(process_book)(book_id, group) for book_id, group in book_groups)

# Convert results to dictionary
book_topics = {book_id: topics for book_id, topics in results if topics}

# Print results for the first 5 books
for book, topics in list(book_topics.items())[:5]:
    print(f"Book ID: {book}")
    for i, topic in enumerate(topics):
        print(f"  Topic {i+1}: {topic}")
    print("\n")


Book ID: 1
  Topic 1: forever need dumbledore 2013 15th ring professor excited sir slow
  Topic 2: really review great read movie potter dumbledore reading harry book
  Topic 3: really read love best time favorite potter series harry book


Book ID: 2
  Topic 1: good character umbridge make love series story harry like book
  Topic 2: stupid ride angst like baby teen came didnt review httpswwwyoutubecomwatchv68ne
  Topic 3: sirius really loved series time favorite potter read harry book


Book ID: 3
  Topic 1: 2nd coming enjoying reathon beautiful writer thsi special shes chapter
  Topic 2: reread good great time potter series read harry love book
  Topic 3: loved series like potter time im harry reading read book


Book ID: 6
  Topic 1: love year second harry series loved time read favorite book
  Topic 2: love know page far series like favourite potter harry book
  Topic 3: story exciting time best love potter read series harry book


Book ID: 11
  Topic 1: clever time funny reread g

In [20]:
from joblib import Parallel, delayed
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Set minimum number of reviews per book
min_reviews = 10  

# Process only the first 500 books for proof of concept
subset_books = df_sampled['book_id'].value_counts().index[:1000]  
df_filtered = df_sampled[df_sampled['book_id'].isin(subset_books)]

# Define a custom list of generic words to remove
custom_stopwords = set([
    "book", "read", "series", "time", "love", "story", "like", 
    "really", "good", "great", "best", "favorite", "review"
])

# Function to process a single book
def process_book(book_id, group):
    if len(group) < min_reviews:
        return book_id, None  # Skip books with too few reviews

    vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
    X = vectorizer.fit_transform(group['cleaned_review_text'])

    lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
    lda_model.fit(X)

    # Extract topics
    terms = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [terms[i] for i in topic.argsort()[-15:]]  # Get more words per topic
        filtered_words = [word for word in top_words if word not in custom_stopwords]
        topics.append(" ".join(filtered_words[:10]))  # Keep only top 10 after filtering

    # Get book title
    title = group['title'].iloc[0]  # Assuming all reviews for a book have the same title

    return book_id, title, topics

# Run topic modeling in parallel
book_groups = df_filtered.groupby('book_id')
results = Parallel(n_jobs=-1)(delayed(process_book)(book_id, group) for book_id, group in book_groups)

# Convert results to dictionary
book_topics = {book_id: (title, topics) for book_id, title, topics in results if topics}

# Print results for the first 5 books
for book_id, (title, topics) in list(book_topics.items())[:15]:
    print(f"Book Title: {title} (Book ID: {book_id})")
    for i, topic in enumerate(topics):
        print(f"  Topic {i+1}: {topic}")
    print("\n")


Book Title: Harry Potter and the Half-Blood Prince (Harry Potter, #6) (Book ID: 1)
  Topic 1: 1st spoiler snape better voldemort started tyrant dumbledore excited movie
  Topic 2: setup prince know im rowling againgeez potter definitely harry dumbledore
  Topic 3: end im magic think eternity believe reread harry
  Topic 4: right enjoyed think rereading second dumbledore feel potter harry
  Topic 5: obviously reading favourite absolute far potter harry


Book Title: Harry Potter and the Order of the Phoenix (Harry Potter, #5) (Book ID: 2)
  Topic 1: long make old end year character harry reading
  Topic 2: reread actually 5th fail spoiler attention came sad capture believe
  Topic 3: ive character im sirius loved novel potter harry
  Topic 4: finished darker excellent thing definitely reread potter harry
  Topic 5: awesome make potter umbridge teenage rowling new angst movie harry


Book Title: Harry Potter and the Sorcerer's Stone (Harry Potter, #1) (Book ID: 3)
  Topic 1: come classro