In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import json


In [3]:
!wget 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz'

--2024-04-23 06:05:30--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95357493 (91M) [application/x-gzip]
Saving to: ‘All_Beauty.jsonl.gz’


2024-04-23 06:05:38 (12.4 MB/s) - ‘All_Beauty.jsonl.gz’ saved [95357493/95357493]



In [4]:
!gunzip All_Beauty.jsonl.gz

In [5]:
df = pd.read_json('All_Beauty.jsonl',lines=True)

In [6]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,True
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,True
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,True


In [6]:
df = df.drop(['title','text','images','timestamp'],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   rating             701528 non-null  int64 
 1   asin               701528 non-null  object
 2   parent_asin        701528 non-null  object
 3   user_id            701528 non-null  object
 4   helpful_vote       701528 non-null  int64 
 5   verified_purchase  701528 non-null  bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 27.4+ MB


In [None]:
# Load JSON data into DataFrame
with open('All_Beauty.jsonl', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)


In [8]:
# Define categorical features and their vocabularies
categorical_features = ['user_id', 'asin', 'parent_asin','verified_purchase']
categorical_vocab_sizes = [df[cat].nunique() for cat in categorical_features]

In [9]:
print(categorical_vocab_sizes)

[631986, 115709, 112565, 2]


In [10]:



for col in categorical_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Define numerical features
numerical_features = ['helpful_vote']
print(categorical_features, numerical_features)
df.info()
# Combine all features
all_features = categorical_features + numerical_features


# Split data into features and target
X = df[all_features]
y = df['rating']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Create embedding layers for categorical features
embedding_layers = []
input_layers = []
reduction_factor = 128 * 32
for i, cat in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=cat)
    embedding_layer = Embedding(input_dim=categorical_vocab_sizes[i] + 1, output_dim=int(categorical_vocab_sizes[i]/reduction_factor)+1)(input_layer)
    embedding_layer = Flatten()(embedding_layer)  # Flatten the embeddings
    embedding_layers.append(embedding_layer)
    input_layers.append(input_layer)

for i, num in enumerate(numerical_features):
    num_layer = Input(shape=(1,), name=num)
    input_layers.append(num_layer)
    embedding_layers.append(num_layer)
print(input_layers)

# Concatenate all input embeddings and numerical features
concatenated_inputs = concatenate(embedding_layers )



['user_id', 'asin', 'parent_asin', 'verified_purchase'] ['helpful_vote']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   rating             701528 non-null  int64
 1   asin               701528 non-null  int64
 2   parent_asin        701528 non-null  int64
 3   user_id            701528 non-null  int64
 4   helpful_vote       701528 non-null  int64
 5   verified_purchase  701528 non-null  int64
dtypes: int64(6)
memory usage: 32.1 MB
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'user_id')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'asin')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'parent_asin')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'verified_purchase')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'helpful_vote')>]


In [11]:
# Build MLP model
dense_layer = Dense(128, activation='relu')(concatenated_inputs)
dense_layer = Dropout(0.3)(dense_layer)
dense_layer = Dense(128, activation='relu')(dense_layer)
dense_layer = Dropout(0.3)(dense_layer)
output_layer = Dense(6, activation='softmax')(dense_layer)

# Create the model
model = Model(inputs=input_layers , outputs=output_layer)
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_id (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 asin (InputLayer)           [(None, 1)]                  0         []                            
                                                                                                  
 parent_asin (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 verified_purchase (InputLa  [(None, 1)]                  0         []                            
 yer)                                                                                         

In [None]:

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
es = EarlyStopping(monitor='val_loss',
                   mode='min',
                   verbose=1,
                   patience=5,
                   restore_best_weights=True)

# Train the model
history = model.fit([X_train[cat].values for cat in categorical_features] + [X_train[num].values for num in numerical_features],
          y_train,
          validation_data=([X_test[cat].values for cat in categorical_features] + [X_test[num].values for num in numerical_features], y_test),
          epochs=100,
          batch_size=128,callbacks=[es])

Epoch 1/100