This notebook illustrates how to load data from the Amazon Product Review data set into a pandas DataFrame.  
---
Data set from: [https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz](https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz) 

The JSON data is loaded into a pandas DataFrame. This particular file contains the following columns:

- overall: The overall rating.
- verified: Whether the review is verified.
- reviewTime: The time of the review.
- reviewerID: The ID of the reviewer.
- asin: The product ID.
- style: A dictionary of product variant attributes, such as size and color.
- reviewerName: The name of the reviewer.
- reviewText: The text of the review.
- summary: The summary of the review.
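- unixReviewTime: The time of the review in Unix time format. 
- vote: The number of helpful votes the review received (missing when there are none).
- image: Links to images the reviewer attached to the review (missing when there are none).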


In [1]:
import tensorflow as tf

import pandas as pd
import requests
import gzip
import io
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def read_gzipped_json_from_url(url, timeout=30):
    """Download a gzip-compressed JSON Lines file and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the gzipped JSON Lines file to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so that a stalled connection cannot hang the notebook
        indefinitely (``requests`` applies no timeout unless one is given).

    Returns
    -------
    pandas.DataFrame or None
        One row per JSON line when the server answers with HTTP 200;
        ``None`` (after printing the status code) for any other status.

    Raises
    ------
    requests.exceptions.RequestException
        Network-level failures, including timeouts, are not caught here and
        propagate from ``requests.get``.
    """
    # Send a HTTP request to the URL; the timeout bounds how long we wait
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported and signalled to the caller as None
    if response.status_code != 200:
        print(f"Failed to retrieve data: status code {response.status_code}")
        return None

    # Decompress the downloaded bytes in memory and parse one JSON record per line
    with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
        return pd.read_json(gz, lines=True)

In [3]:
# URL to the gzipped JSON file
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz'
#url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Luxury_Beauty_5.json.gz'
df = read_gzipped_json_from_url(url)

# The loader returns None when the server does not answer with HTTP 200.
# Stop here with a clear message instead of failing on `.reset_index`
# with an opaque AttributeError.
if df is None:
    raise RuntimeError(f"Could not download the data set from {url}")

# Give the frame a clean 0-based index
df = df.reset_index(drop=True)

In [4]:
# Display the first few rows of the DataFrame
# Preview the top five rows, but only when the download produced a DataFrame
if df is not None:
    print(df.head(n=5))

   overall  verified  reviewTime     reviewerID        asin  \
0        5      True  09 4, 2015  ALJ66O1Y6SLHA  B000K2PJ4K   
1        5      True  09 4, 2015  ALJ66O1Y6SLHA  B000K2PJ4K   
2        5      True  09 4, 2015  ALJ66O1Y6SLHA  B000K2PJ4K   
3        5      True  09 4, 2015  ALJ66O1Y6SLHA  B000K2PJ4K   
4        5      True  09 4, 2015  ALJ66O1Y6SLHA  B000K2PJ4K   

                                               style reviewerName  \
0   {'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}     Tonya B.   
1  {'Size:': ' Big Boys', 'Color:': ' Black (3746...     Tonya B.   
2  {'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...     Tonya B.   
3  {'Size:': ' Big Boys', 'Color:': ' Blue (37867...     Tonya B.   
4     {'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}     Tonya B.   

                 reviewText     summary  unixReviewTime  vote image  
0  Great product and price!  Five Stars      1441324800   NaN   NaN  
1  Great product and price!  Five Stars      1441324800   NaN   Na

In [5]:
# Illustration only: work with a few columns rather than the full frame
# Target variable: the overall rating
y = df['overall']
# Features: the reviewer ID plus the two free-text fields
df_small = df[['reviewerID', 'reviewText', 'summary']]

# Stratified 90/10 train/test split; every piece gets a fresh 0-based index
df_Xtrain, df_Xtest, df_ytrain, df_ytest = (
    piece.reset_index(drop=True)
    for piece in train_test_split(
        df_small, y, test_size=0.1, random_state=42, stratify=y
    )
)

In [6]:
# Show the top five rows of the training features
print(df_Xtrain.head(n=5))

       reviewerID                                         reviewText  \
0  A2YZERYQTLB8NG  Best tennis shoes I've had all my life. Very c...   
1   AG1CF6PFNBOQH  The shoes are amazing,very comfortable and fit...   
2   A9KR8PT0Z47CL  They're ugly: the toe is rounded and the mater...   
3  A13JGYKUU10QKH  I ordered a 1/2 size smaller than my usual siz...   
4  A3A90ECS7ALV4T                   love the lightweight and the fit   

                                             summary  
0                                   Very Comfortable  
1                                            Amazing  
2  the Nike Womens Flex Trainer 6 is way better l...  
3                                         Good shoes  
4                                         Five Stars  


In [7]:
# Show the top five training labels
print(df_ytrain.head(n=5))

0    5
1    5
2    3
3    5
4    5
Name: overall, dtype: int64


In [8]:
# Look up the first training example once, then report its text, summary and label
first_text = df_Xtrain.loc[0, "reviewText"]
first_summary = df_Xtrain.loc[0, "summary"]

print(f'First review = {first_text}')
print(f'First review has length = {len(first_text)}\n ')
print(f'First review summary= {first_summary}')
print(f'First review summary has length = {len(first_summary)}\n ')

print(f'First review overall rating = {df_ytrain.loc[0]}')

First review = Best tennis shoes I've had all my life. Very comfortable out the box. I would buy 10 pairs of these shoes if had the money.
First review has length = 123
 
First review summary= Very Comfortable
First review summary has length = 16
 
First review overall rating = 5


In [9]:
# Label 6 on the reset 0-based index is the seventh training row, so the
# printed text says "Seventh" to match the row that is actually shown.
row_label = 6
seventh_text = df_Xtrain.loc[row_label, "reviewText"]
seventh_summary = df_Xtrain.loc[row_label, "summary"]

print(f'Seventh review = {seventh_text}')
print(f'Seventh review has length = {len(seventh_text)}\n ')
print(f'Seventh review summary= {seventh_summary}')
print(f'Seventh review summary has length = {len(seventh_summary)}\n ')

print(f'Seventh review overall rating = {df_ytrain.loc[row_label]}')

Sixth review = Super comfortable. I joined Cross Fit and this is the perfect shoe.
Sixth review has length = 67
 
Sixth review summary= Super comfortable. I joined Cross Fit and this is ...
Sixth review summary has length = 53
 
Sixth review overall rating = 4


In [None]:
# Scratch cell cleared: it held only an undefined bare name, which raises NameError on "Run All".