In [2]:
import pandas as pd
import gcsfs

# Import and join review/product data

In [3]:
# Initialize GCS filesystem (also used by the later cell that loads the reviews)
fs = gcsfs.GCSFileSystem()

items_metadata_file_path = 'gs://amazon-home-and-kitchen/meta_Home_and_Kitchen.jsonl'

# Number of JSONL rows to load; only this first slice of the metadata file is kept.
META_ROWS_TO_LOAD = 600000

# Open the file from the GCS bucket
with fs.open(items_metadata_file_path, 'r') as f:
    # With `chunksize` set, read_json returns an iterator that yields
    # DataFrames of up to `chunksize` rows each.
    json_reader = pd.read_json(f, lines=True, chunksize=META_ROWS_TO_LOAD)

    # Keep the first chunk only. The `None` default makes an empty file
    # detectable instead of silently leaving `meta_df` unset (or stale from
    # an earlier run of this cell).
    meta_df = next(iter(json_reader), None)

if meta_df is None:
    raise ValueError(f'No rows could be read from {items_metadata_file_path}')

In [4]:
# Preview the first five rows of the loaded metadata frame
meta_df.head(n=5)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Amazon Home,Set of 4 Irish Coffee Glass Mugs Footed 10.5 o...,4.6,18,[☕PERFECT IRISH COFFEE MUG: With our clear gla...,[Set of 12 Footed 10.5 oz. Irish coffee mug th...,24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Irish Coffee Glass Coffee Mugs Reg...,LavoHome,"[Home & Kitchen, Kitchen & Dining, Dining & En...","{'Brand': 'LavoHome', 'Material': 'Glass', 'Co...",B07R3DYMH6,,,
1,Amazon Home,Foaming Soap Dispenser Thick Ceramic Foam Hand...,4.4,135,[Saving money: You can DIY foam soap which wil...,[],24.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Foaming Soap Dispenser Ceramic Foa...,rejomiik,"[Home & Kitchen, Bath]",{'Package Dimensions': '7.32 x 6.14 x 3.94 inc...,B0BNZ8Q7YT,,,
2,Amazon Home,Tapestry Trading 558W90 90 in. European Lace T...,5.0,3,"[Polyester,lace, European Lace Tablecloth, 100...",[Features. European Lace Tablecloth. 100 Polye...,45.64,[{'thumb': 'https://m.media-amazon.com/images/...,[],Tapestry Trading,"[Home & Kitchen, Kitchen & Dining, Kitchen & T...","{'Brand': 'Tapestry Trading', 'Color': 'White'...",B01508WQC6,,,
3,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"[Sleek chrome metal base, seat covered in Red ...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,"[Home & Kitchen, Furniture, Game & Recreation ...","{'Color': 'Red', 'Frame Material': 'Metal', 'S...",B00KKU8HTG,,,
4,Amazon Home,Chisander 20 Inches Grey with White Super Soft...,4.6,67,[High-Quality Material: Made of high quality s...,[],9.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],Chisander,"[Home & Kitchen, Seasonal Décor, Stockings & H...",{'Package Dimensions': '9.65 x 5.43 x 1.85 inc...,B0B61RJ848,,,


In [None]:
# Set up your GCS file path
user_reviews_train_file_path = 'gs://amazon-home-and-kitchen/Home_and_Kitchen_Train.jsonl'

# Rows parsed per chunk while reading the reviews file
REVIEWS_CHUNK_ROWS = 500000

# Open the file from the GCS bucket
with fs.open(user_reviews_train_file_path, 'r') as f:
    # Read the JSONL file REVIEWS_CHUNK_ROWS rows at a time
    json_reader = pd.read_json(f, lines=True, chunksize=REVIEWS_CHUNK_ROWS)

    # Collect all chunks and concatenate once afterwards. Calling pd.concat
    # inside the loop re-copies the accumulated frame on every iteration,
    # which grows quadratically with the number of chunks.
    review_chunks = list(json_reader)

# Fail with a clear message instead of leaving `reviews_df` undefined (or
# stale from an earlier run) when the file yields no rows.
if not review_chunks:
    raise ValueError(f'No rows could be read from {user_reviews_train_file_path}')

reviews_df = pd.concat(review_chunks, ignore_index=True)

# Drop the per-chunk frames so they do not stay alive in the kernel
del review_chunks

In [None]:
# Inner-join the metadata frame with the reviews on their shared 'parent_asin' key
reviews_with_meta_df = meta_df.merge(reviews_df, on='parent_asin', how='inner')

# Pre-process the review text column ('text')

In [14]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (782 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.9.11


In [18]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used for tokenizing, stop-word filtering and lemmatizing
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Build the English stop-word set and the lemmatizer used by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every ASCII punctuation character; built once
# here rather than on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize raw text into a cleaned, space-joined string of tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK, drop stop
    words, lemmatize each remaining token, and re-join with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a float NaN
            standing in for a missing value) are treated as empty text.

    Returns:
        str: The processed tokens joined by spaces; ``''`` for non-string input.
    """
    # Missing values arrive as None/NaN rather than str; return an empty
    # string for them instead of failing on `.lower()`.
    if not isinstance(text, str):
        return ''

    # Lowercase first so stop-word matching is case-insensitive, then strip
    # punctuation.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenization
    tokens = nltk.word_tokenize(cleaned)

    # Remove stop words and lemmatize what is left
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back to string
    return ' '.join(kept)

# NOTE(review): `df` is not defined in any cell visible here — presumably a
# reviews DataFrame with a 'text' column created elsewhere; confirm it exists
# on a fresh kernel before relying on Restart & Run All.
# Run the text preprocessing over every value of the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# Show the raw and processed text side by side for the first few rows
df.loc[:, ['text', 'processed_text']].head()

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,text,processed_text
0,Livid. Once again received an obviously used ...,livid received obviously used item food scratc...
1,I purchased these for multiple reasons. The ma...,purchased multiple reason main reason moving m...
2,[[VIDEOID:c87e962bc893a948856b0f1b285ce6cc]] I...,videoidc87e962bc893a948856b0f1b285ce6cc wanted...
3,If you live at a higher elevation like me (5k ...,live higher elevation like 5k colorado know bu...
4,I use these to store yarn. They easily hold 12...,use store yarn easily hold 12 105 ounce bernat...
