In [None]:
import boto3
import gzip
from io import BytesIO
import pandas as pd
import json

# Initialize the S3 client
s3 = boto3.client('s3')

# S3 location of the gzipped JSON-lines file that holds the reviews
bucket_name = 'abbynlpproject'
file_key = 'goodreads_reviews_fantasy_paranormal.json.gz'

# Download the object; its full compressed payload is read into memory below
obj = s3.get_object(Bucket=bucket_name, Key=file_key)

# Decompress the .gz payload and parse it line by line
# (each line is expected to be one JSON object)
data = []
skipped_review_lines = 0
with gzip.GzipFile(fileobj=BytesIO(obj['Body'].read()), mode='rb') as f:
    for line in f:
        if not line.strip():
            continue  # Blank lines carry no record and are not counted as malformed
        try:
            data.append(json.loads(line.decode('utf-8')))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A bad line is dropped instead of aborting the load. Invalid UTF-8
            # raises UnicodeDecodeError, which is not a JSONDecodeError, so it
            # must be caught explicitly for the "skip malformed lines" intent to hold.
            skipped_review_lines += 1

# Report dropped lines so data loss is visible instead of silent
if skipped_review_lines:
    print(f"Skipped {skipped_review_lines} malformed line(s) in s3://{bucket_name}/{file_key}")

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Check the columns of the reviews dataframe
print(df.columns)

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')


In [None]:
# Narrow the reviews to the three columns of interest:
# the book ID, the review text, and the rating
df_cleaned = df.loc[:, ['book_id', 'review_text', 'rating']]

# Preview the first rows of the narrowed frame
print(df_cleaned.head())

    book_id                                        review_text  rating
0  18245960  This is a special book. It started slow for ab...       5
1   5577844  A beautiful story. Neil Gaiman is truly a uniq...       5
2  17315048  Mark Watney is a steely-eyed missile man. A ma...       5
3  13453029  A fun fast paced book that sucks you in right ...       4
4  13239822  This book has a great premise, and is full of ...       3


In [None]:
# S3 key of the gzipped JSON-lines file that holds the book metadata
book_metadata_key = 'goodreads_books_fantasy_paranormal.json.gz'

# Download the book metadata file from S3
book_metadata_obj = s3.get_object(Bucket=bucket_name, Key=book_metadata_key)

# Decompress and parse the metadata line by line
# (each line is expected to be one JSON object)
book_metadata = []
skipped_book_lines = 0
with gzip.GzipFile(fileobj=BytesIO(book_metadata_obj['Body'].read()), mode='rb') as f:
    for line in f:
        if not line.strip():
            continue  # Blank lines carry no record and are not counted as malformed
        try:
            book_metadata.append(json.loads(line.decode('utf-8')))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A bad line is dropped instead of aborting the load. Invalid UTF-8
            # raises UnicodeDecodeError, which is not a JSONDecodeError, so it
            # must be caught explicitly for the "skip malformed lines" intent to hold.
            skipped_book_lines += 1

# Report dropped lines so data loss is visible instead of silent
if skipped_book_lines:
    print(f"Skipped {skipped_book_lines} malformed line(s) in s3://{bucket_name}/{book_metadata_key}")

# Convert the book metadata list to a DataFrame
df_books = pd.DataFrame(book_metadata)

# Check the columns of the book metadata dataframe
print(df_books.columns)

# Build a one-row-per-book lookup of titles. Deduplicating on 'book_id' guarantees
# the left join below cannot multiply review rows if the metadata repeats a book.
book_titles = df_books[['book_id', 'title']].drop_duplicates(subset='book_id')

# Attach each review's title; a left join keeps reviews whose book has no
# metadata row (their 'title' is left missing)
df_with_book_names = pd.merge(df, book_titles, on='book_id', how='left')

# The join must neither add nor drop reviews
assert len(df_with_book_names) == len(df), "merge changed the number of review rows"

# Display the first few rows of the merged DataFrame
print(df_with_book_names.head())


Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')
                            user_id   book_id  \
0  8842281e1d1347389f2ab93d60773d4d  18245960   
1  8842281e1d1347389f2ab93d60773d4d   5577844   
2  8842281e1d1347389f2ab93d60773d4d  17315048   
3  8842281e1d1347389f2ab93d60773d4d  13453029   
4  8842281e1d1347389f2ab93d60773d4d  13239822   

                          review_id  rating  \
0  dfdbb7b0eb5a7e4c26d59a937e2e5feb       5   
1  52c8ac49496c153e4a97161e36b2db55       5   
2  885c772fb033b041f42d57cef5be0a43       5   
3  46a6e1a14e8afc82d221fec0a

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stop-word lists and WordNet
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Lemmatizer and English stop-word set shared by the preprocessing step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call to preprocess_text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized words.

    Steps: lowercase, split on whitespace, drop stop words, delete ASCII
    punctuation, lemmatize with the module-level ``lemmatizer``.

    Stop words are checked *before* the punctuation inside a token is deleted.
    The stop-word list contains apostrophe contractions (e.g. "doesn't"); if the
    apostrophe is removed first, the token becomes "doesnt", no longer matches,
    and leaks into the output.

    Args:
        text: The raw review text. Any non-string value (e.g. None or NaN)
            is treated as an empty review.

    Returns:
        The cleaned text as a single string; '' when nothing remains.
    """
    # Missing values would otherwise fail on .lower()
    if not isinstance(text, str):
        return ''

    cleaned_words = []
    for token in text.lower().split():
        # Strip only the surrounding punctuation so "doesn't," is recognised
        # as the stop word "doesn't" while its inner apostrophe is still present
        if token.strip(string.punctuation) in stop_words:
            continue

        # Delete all punctuation, including characters inside the token
        word = token.translate(_PUNCT_TABLE)

        # Skip tokens that were pure punctuation, or that only become a
        # stop word once punctuation is gone (e.g. "a.m." -> "am")
        if not word or word in stop_words:
            continue

        cleaned_words.append(lemmatizer.lemmatize(word))

    # Join the words back into a single string
    return ' '.join(cleaned_words)

# Run every review through preprocess_text and store the result in a new column
df_with_book_names['cleaned_review_text'] = (
    df_with_book_names['review_text'].map(preprocess_text)
)

# Show the raw and cleaned text side by side for the first few reviews
print(df_with_book_names.loc[:, ['review_text', 'cleaned_review_text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abbyeast/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abbyeast/nltk_data...


                                         review_text  \
0  This is a special book. It started slow for ab...   
1  A beautiful story. Neil Gaiman is truly a uniq...   
2  Mark Watney is a steely-eyed missile man. A ma...   
3  A fun fast paced book that sucks you in right ...   
4  This book has a great premise, and is full of ...   

                                 cleaned_review_text  
0  special book started slow first third middle t...  
1  beautiful story neil gaiman truly unique story...  
2  mark watney steelyeyed missile man man man bad...  
3  fun fast paced book suck right away doesnt let...  
4  book great premise full beautifully written pr...  


In [None]:
# Keep just the title, the book ID, and the cleaned review text
cleaned_df = df_with_book_names.loc[:, ['title', 'book_id', 'cleaned_review_text']]
cleaned_df.head(5)

Unnamed: 0,title,book_id,cleaned_review_text
0,The Three-Body Problem (Remembrance of Earth’s...,18245960,special book started slow first third middle t...
1,Stardust,5577844,beautiful story neil gaiman truly unique story...
2,The Martian,17315048,mark watney steelyeyed missile man man man bad...
3,"Wool Omnibus (Silo, #1)",13453029,fun fast paced book suck right away doesnt let...
4,Alif the Unseen,13239822,book great premise full beautifully written pr...


In [None]:
import boto3
import pandas as pd
from io import StringIO

# S3 client used for the upload
s3 = boto3.client('s3')

# Destination bucket and object key for the cleaned reviews
bucket_name, file_key = 'abbynlpproject', 'cleaned_goodreads_reviews.csv'

# Serialize the frame as CSV text into an in-memory buffer (row index omitted)
csv_buffer = StringIO()
cleaned_df.to_csv(path_or_buf=csv_buffer, index=False)

# Send the buffered CSV text to S3 as the object body
s3.put_object(
    Body=csv_buffer.getvalue(),
    Bucket=bucket_name,
    Key=file_key,
)

print(f"File saved to S3: s3://{bucket_name}/{file_key}")


File saved to S3: s3://abbynlpproject/cleaned_goodreads_reviews.csv
