In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, concatenate, Dropout, Lambda , Layer
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import json


In [5]:
!wget 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz'

--2024-04-26 16:42:48--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95357493 (91M) [application/x-gzip]
Saving to: ‘All_Beauty.jsonl.gz’


2024-04-26 16:43:11 (3.95 MB/s) - ‘All_Beauty.jsonl.gz’ saved [95357493/95357493]



In [6]:
!wget 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz'

--2024-04-26 16:43:11--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40415458 (39M) [application/x-gzip]
Saving to: ‘meta_All_Beauty.jsonl.gz’


2024-04-26 16:43:12 (37.8 MB/s) - ‘meta_All_Beauty.jsonl.gz’ saved [40415458/40415458]



In [7]:
!gunzip All_Beauty.jsonl.gz

In [8]:
!gunzip meta_All_Beauty.jsonl.gz

In [9]:
df_ratings = pd.read_json('All_Beauty.jsonl',lines=True)

In [10]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   rating             701528 non-null  int64         
 1   title              701528 non-null  object        
 2   text               701528 non-null  object        
 3   images             701528 non-null  object        
 4   asin               701528 non-null  object        
 5   parent_asin        701528 non-null  object        
 6   user_id            701528 non-null  object        
 7   timestamp          701528 non-null  datetime64[ns]
 8   helpful_vote       701528 non-null  int64         
 9   verified_purchase  701528 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 48.8+ MB


In [11]:
df_meta = pd.read_json('meta_All_Beauty.jsonl',lines = True)

In [12]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112590 entries, 0 to 112589
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   main_category    112590 non-null  object 
 1   title            112590 non-null  object 
 2   average_rating   112590 non-null  float64
 3   rating_number    112590 non-null  int64  
 4   features         112590 non-null  object 
 5   description      112590 non-null  object 
 6   price            17704 non-null   float64
 7   images           112590 non-null  object 
 8   videos           112590 non-null  object 
 9   store            101259 non-null  object 
 10  categories       112590 non-null  object 
 11  details          112590 non-null  object 
 12  parent_asin      112590 non-null  object 
 13  bought_together  0 non-null       float64
dtypes: float64(3), int64(1), object(10)
memory usage: 12.0+ MB


In [13]:
# prompt: print first 5 values from df_meta details column

print(df_meta['details'].head(10))


0    {'Package Dimensions': '7.1 x 5.5 x 3 inches; ...
1    {'Item Form': 'Powder', 'Skin Type': 'Acne Pro...
2           {'Manufacturer': 'Levine Health Products'}
3    {'Brand': 'Cherioll', 'Item Form': 'Powder', '...
4                              {'UPC': '644287689178'}
5    {'Color': 'As Shown', 'Size': 'Large', 'Materi...
6    {'Brand': 'Edoneery', 'Material': 'Silk', 'Num...
7    {'Package Dimensions': '14.49 x 11.26 x 2.36 i...
8    {'Brand': 'Balmain', 'Item Form': 'Spray', 'It...
9    {'Package Dimensions': '12.49 x 9.97 x 1.46 in...
Name: details, dtype: object


In [14]:
# prompt: merge df_ratings and df_meta on parent_asin

df_merged = pd.merge(df_ratings, df_meta, on='parent_asin')


In [15]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   rating             701528 non-null  int64         
 1   title_x            701528 non-null  object        
 2   text               701528 non-null  object        
 3   images_x           701528 non-null  object        
 4   asin               701528 non-null  object        
 5   parent_asin        701528 non-null  object        
 6   user_id            701528 non-null  object        
 7   timestamp          701528 non-null  datetime64[ns]
 8   helpful_vote       701528 non-null  int64         
 9   verified_purchase  701528 non-null  bool          
 10  main_category      701528 non-null  object        
 11  title_y            701528 non-null  object        
 12  average_rating     701528 non-null  float64       
 13  rating_number      701528 non-null  int64   

In [16]:
# prompt: only keep user_id , parent_asin, average_rating, rating_number, price, store, rating columns of df_merged

df = df_merged[['user_id', 'parent_asin', 'average_rating', 'rating_number', 'price', 'store', 'rating']]


In [17]:
df.head()

Unnamed: 0,user_id,parent_asin,average_rating,rating_number,price,store,rating
0,AGKHLEW2SOWHNMFQIJGBECAF7INQ,B00YQ6X8EO,4.3,384,,HERBIVORE,5
1,AGF2RF5HBCXKEOLTF2LXZTVFYFVQ,B00YQ6X8EO,4.3,384,,HERBIVORE,1
2,AGPWM36OKQCL3PM4MXZNLP4VVJKA,B00YQ6X8EO,4.3,384,,HERBIVORE,5
3,AGVVMZD2DJQ7Z3KJJXHFKQHSX7HA,B00YQ6X8EO,4.3,384,,HERBIVORE,5
4,AE5VLXANR3ZCUZCQAOBJTIEGMZGA,B00YQ6X8EO,4.3,384,,HERBIVORE,2


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         701528 non-null  object 
 1   parent_asin     701528 non-null  object 
 2   average_rating  701528 non-null  float64
 3   rating_number   701528 non-null  int64  
 4   price           185623 non-null  float64
 5   store           651636 non-null  object 
 6   rating          701528 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 37.5+ MB


In [19]:
# Define categorical features and their vocabularies
categorical_features = ['user_id', 'parent_asin']
categorical_vocab_sizes = [df[cat].nunique() for cat in categorical_features]

In [20]:
print(categorical_vocab_sizes)

[631986, 112565]


In [37]:



for col in categorical_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Define numerical features
numerical_features = ['average_rating','rating_number','price']
print(categorical_features, numerical_features)
df.info()
# Combine all features
all_features = categorical_features


# Split data into features and target
X = df[all_features]
y = df['rating']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize numerical features
# scaler = StandardScaler()
# X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
# X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# X_train[numerical_features] = X_train[numerical_features]
# X_test[numerical_features] = X_test[numerical_features]


# Create embedding layers for categorical features
embedding_layers = []
input_layers = []
reduction_factor = 128 * 16
for i, cat in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=cat)
    embedding_layer = Embedding(input_dim=categorical_vocab_sizes[i] + 1, output_dim=int(categorical_vocab_sizes[i]/reduction_factor)+1)(input_layer)
    embedding_layer = Flatten()(embedding_layer)  # Flatten the embeddings
    embedding_layers.append(embedding_layer)
    input_layers.append(input_layer)

# for i, num in enumerate(numerical_features):
#     num_layer = Input(shape=(1,), name=num)
#     input_layers.append(num_layer)
#     embedding_layers.append(num_layer)
# print(input_layers)

# Concatenate all input embeddings and numerical features
concatenated_inputs = concatenate(embedding_layers )



['user_id', 'parent_asin'] ['average_rating', 'rating_number', 'price']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         701528 non-null  int64  
 1   parent_asin     701528 non-null  int64  
 2   average_rating  701528 non-null  float64
 3   rating_number   701528 non-null  int64  
 4   price           185623 non-null  float64
 5   store           651636 non-null  object 
 6   rating          701528 non-null  int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 37.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = encoder.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = encoder.fit_transform(df[col])


In [52]:
# Custom layer for ceil operation
class CeilLayer(Layer):
    def __init__(self, **kwargs):
        super(CeilLayer, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.math.ceil(inputs)

In [70]:
# Build MLP model
dense_layer = Dense(128, activation='relu')(concatenated_inputs)
dense_layer = Dropout(0.3)(dense_layer)
dense_layer = Dense(128, activation='relu')(dense_layer)
dense_layer = Dropout(0.3)(dense_layer)
output_layer = Dense(1)(dense_layer)

# Create the model
model = Model(inputs=input_layers , outputs=output_layer)
model.summary()



Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_id (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 parent_asin (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 1, 309)               1952839   ['user_id[0][0]']             
                                                          83                                      
                                                                                                  
 embedding_3 (Embedding)     (None, 1, 55)                6191130   ['parent_asin[0][0]']   

In [69]:

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['accuracy'])

# Set up early stopping
es = EarlyStopping(monitor='val_loss',
                   mode='min',
                   verbose=1,
                   patience=2,
                   restore_best_weights=True)

# Train the model
history = model.fit([X_train[cat].values for cat in all_features] ,
          y_train,
          validation_data=([X_test[cat].values for cat in all_features], y_test),
          epochs=100,
          batch_size=128,callbacks=[es])

Epoch 1/100


ResourceExhaustedError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1154, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 544, in minimize
        self.apply_gradients(grads_and_vars)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 1223, in apply_gradients
        return super().apply_gradients(grads_and_vars, name=name)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 638, in apply_gradients
        self.build(trainable_variables)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/adam.py", line 145, in build
        self.add_variable_from_reference(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 1125, in add_variable_from_reference
        return super().add_variable_from_reference(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 508, in add_variable_from_reference
        initial_value = tf.zeros(
    File "/usr/local/lib/python3.10/dist-packages/tensorflow/dtensor/python/api.py", line 64, in call_with_layout
        return fn(*args, **kwargs)

    ResourceExhaustedError: {{function_node __wrapped__Fill_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[631987,309] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill] name: 


In [25]:
# prompt: predict for 10 rows in X_test

predictions = model.predict([X_test[cat].values for cat in all_features])

print(predictions[:10])


[[5.26747606e-08 2.27931157e-01 8.24532732e-02 9.00309458e-02
  9.50279012e-02 5.04556596e-01]
 [1.47319653e-07 2.05296800e-01 8.46464187e-02 1.00816697e-01
  1.16539970e-01 4.92699891e-01]
 [2.34571361e-19 1.33706816e-02 9.67753679e-03 1.59384739e-02
  6.14820756e-02 8.99531186e-01]
 [7.84372531e-08 2.03086033e-01 8.31154436e-02 9.02381539e-02
  1.12511717e-01 5.11048555e-01]
 [5.62877534e-15 5.34754507e-02 2.66343169e-02 3.59413922e-02
  8.40365961e-02 7.99912274e-01]
 [3.50690743e-06 2.39207491e-01 1.00417607e-01 1.04504794e-01
  1.41990691e-01 4.13875937e-01]
 [8.25197510e-09 9.91539359e-02 5.89881092e-02 9.67371315e-02
  2.09231809e-01 5.35889030e-01]
 [9.71840880e-13 6.41562641e-02 3.78980599e-02 5.01685552e-02
  1.06397316e-01 7.41379738e-01]
 [1.41329979e-08 1.80056751e-01 7.50098526e-02 8.89093429e-02
  1.01976320e-01 5.54047763e-01]
 [1.24813465e-10 1.25693887e-01 5.47065809e-02 6.53673559e-02
  8.89517143e-02 6.65280461e-01]]


In [26]:
print(y_test[:10])

558261    5
533565    4
275352    5
658564    5
48163     5
85182     3
185354    5
177877    4
474314    5
226539    5
Name: rating, dtype: int64


In [None]:
# prompt: print first few values of x_train

print(X_train.head())


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [60]:
# Assuming your validation/test data is stored in X_val and y_val
y_pred = model.predict([X_test[cat].values for cat in all_features])
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = y_test.to_numpy()
#y_true = np.argmax(y_test, axis=0)





In [63]:
print(y_pred[:5])

[[2.8743045]
 [3.3117304]
 [4.5930643]
 [3.3962812]
 [4.686592 ]]


In [64]:
# prompt: take ceil values of y_pred

y_pred_ceil = np.ceil(y_pred)
print(y_pred_ceil[:5])


[[3.]
 [4.]
 [5.]
 [4.]
 [5.]]


In [65]:

accuracy = accuracy_score(y_true, y_pred_ceil)
precision = precision_score(y_true, y_pred_ceil, average='weighted')
recall = recall_score(y_true, y_pred_ceil, average='weighted')
f1 = f1_score(y_true, y_pred_ceil, average='weighted')
#auc = roc_auc_score(y_true,y_pred_classes, multi_class="ovo")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
#print("auc Score:", auc)

Accuracy: 0.41867061993072285
Precision: 0.4528781672255683
Recall: 0.41867061993072285
F1 Score: 0.423768387676994


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
