In [None]:
import tensorflow as tf
from tensorflow.keras import layers, metrics
import pandas as pd
import requests
import gzip
import io
import numpy as np
from sklearn.model_selection import train_test_split
import wandb
from wandb.keras import WandbCallback

Set the random seed so that random operations give the same results across multiple experiments.

In [None]:
# Fix the global random seed so repeated experiment runs are comparable.
tf.keras.utils.set_random_seed(seed=812)

In [None]:
def read_gzipped_json_from_url(url, timeout=60):
    """Download a gzip-compressed JSON Lines file and load it into a DataFrame.

    Args:
        url: Address of the gzipped JSON Lines file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A DataFrame with one row per JSON line, or None when the server
        answers with a status code other than 200 (a message is printed).
        Callers must check for None before using the result.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request itself fails (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: report the failure and bail out early.
    if response.status_code != 200:
        print(f"Failed to retrieve data: status code {response.status_code}")
        return None

    # Decompress in memory and parse the JSON Lines payload.
    with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
        return pd.read_json(gz, lines=True)

In [None]:
# URLs of the gzipped JSON Lines review files:
# `url` is the Luxury Beauty 5-core subset, `url2` the full Amazon Fashion category.
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Luxury_Beauty_5.json.gz'
url2 = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/AMAZON_FASHION.json.gz'

luxury_df = read_gzipped_json_from_url(url)
amazon_fashion_df = read_gzipped_json_from_url(url2)

# The loader returns None on a failed download; stop here with a clear message
# instead of failing later with "'NoneType' object has no attribute 'reset_index'".
if luxury_df is None or amazon_fashion_df is None:
    raise RuntimeError("Could not download one of the review datasets; see the message printed above.")

luxury_df = luxury_df.reset_index(drop=True)
amazon_fashion_df = amazon_fashion_df.reset_index(drop=True)

In [None]:
# Keep only the star rating and the review text from each table.
fashion_df = amazon_fashion_df.loc[:, ['overall', 'reviewText']]
luxury_df = luxury_df.loc[:, ['overall', 'reviewText']]

In [None]:
# Show the first few rows of each table that is available.
for _frame in (fashion_df, luxury_df):
    if _frame is not None:
        print(_frame.head())

In [None]:
# Summary statistics of both tables before any cleaning.
for _frame in (fashion_df, luxury_df):
    print(_frame.describe())

In [None]:
# Remove every row that has a missing rating or a missing review text,
# then print the summary statistics again for comparison.
fashion_df = fashion_df.dropna(axis=0, how='any')
luxury_df = luxury_df.dropna(axis=0, how='any')
for _frame in (fashion_df, luxury_df):
    print(_frame.describe())

In [None]:
# Number of reviews per star rating in each dataset.
rating_counts_fashion = fashion_df.loc[:, 'overall'].value_counts()
rating_counts_luxury = luxury_df.loc[:, 'overall'].value_counts()
print(rating_counts_fashion, rating_counts_luxury, sep='\n')

Both datasets are very imbalanced. The training and validation sets will be balanced.

### Creating a smaller and more balanced dataset

Start the experiments with 10,000 samples of each rating, i.e. 50,000 reviews in total.

In [None]:
# Cap on the number of reviews kept per star rating.
number_each_rating = 10000

# Take the first `number_each_rating` rows of every rating group, in table order.
# NOTE(review): a rating with fewer rows than the cap keeps all of its rows,
# so check the printed counts to confirm the subset is really balanced.
balanced_df = fashion_df.groupby(by='overall').head(n=number_each_rating)

balanced_counts = balanced_df.loc[:, 'overall'].value_counts()
print(balanced_counts)

Setting the target and splitting the data into training and validation sets

In [None]:
# Target: the star rating of each review.
y = balanced_df['overall']

# Stratified 90/10 split; every resulting piece gets a fresh 0..n-1 index.
df_xtrain, df_xval, df_ytrain, df_yval = (
    piece.reset_index(drop=True)
    for piece in train_test_split(
        balanced_df,
        y,
        test_size=0.1,
        random_state=42,
        stratify=y,
    )
)

In [None]:
# Per-rating counts in the training and validation targets.
print(df_ytrain.value_counts(), df_yval.value_counts(), sep='\n')

In [None]:
# Peek at the first rows of every piece of the split.
for _part in (df_xtrain, df_xval, df_ytrain, df_yval):
    print(_part.head())

In [None]:
# The luxury reviews provide the test targets and test texts.
y_test, x_test = luxury_df['overall'], luxury_df['reviewText']

In [None]:
# Length of every training review, measured with len() on the text.
review_lengths = df_xtrain['reviewText'].apply(len)

min_length = review_lengths.min()
max_length = review_lengths.max()
mean_length = review_lengths.mean()
median_length = review_lengths.median()

# Report the statistics as label/value pairs.
for _label, _value in (
    ('90th Percentile Length:', review_lengths.quantile(q=0.9)),
    ("Minimum Length:", min_length),
    ("Maximum Length:", max_length),
    ("Mean Length:", mean_length),
    ("Median Length:", median_length),
):
    print(_label, _value)

Min and max length



In [None]:
# Index labels of the shortest and the longest training review.
min_length_index = review_lengths.idxmin()
max_length_index = review_lengths.idxmax()

# Look the two texts up by label in the review column.
min_length_review = df_xtrain['reviewText'].loc[min_length_index]
max_length_review = df_xtrain['reviewText'].loc[max_length_index]

print("Review with Minimum Length:\n", min_length_review)
print("\nReview with Maximum Length:\n", max_length_review)

In [None]:
# Text vectorizer settings: vocabulary cap and fixed output length per review.
max_tokens = 2000
sentence_length = 400  # around the length of the 90th percentile
# NOTE(review): the review lengths computed earlier use len() on the raw text
# (characters), while output_sequence_length counts tokens - confirm that 400
# is the intended token budget.
encoder = layers.TextVectorization(
    output_sequence_length=sentence_length,
    output_mode='int',
    max_tokens=max_tokens,
)

# Build the vocabulary from the training reviews.
encoder.adapt(df_xtrain['reviewText'].values)

# First 20 vocabulary entries, shown as the cell output.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

In [None]:
# Encode the first training review and show it next to its raw text.
example_text = df_xtrain['reviewText'].iloc[0]
encoded_example = encoder([example_text])[:3].numpy()
print(example_text)
print(encoded_example)

In [None]:
# Run the encoder on a hand-written sentence and print the resulting tensor.
sample_input = "your sample text s s s input very good"

sample_batch = tf.constant([sample_input])
encoded_sample = encoder(sample_batch)
print(encoded_sample)

Convert y to one hot

In [None]:
# Shift the targets down by one so they start at 0, then one-hot encode
# them into 5 classes for both splits.
df_ytrain_one_hot, df_yval_one_hot = (
    tf.keras.utils.to_categorical(labels - 1, num_classes=5)
    for labels in (df_ytrain, df_yval)
)

In [None]:
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

In [None]:
# Architecture used by the first experiment:
# text encoder -> embedding -> bidirectional LSTM -> dense -> 5-way softmax.
# The closing "])" was missing, which made this cell a SyntaxError.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=16,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])


In [None]:
# Experiment: one bidirectional LSTM layer with 16 units, 16-d embeddings.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='Bidi LSTM(16),1 layer,',
)

# Assemble the network layer by layer.
model = tf.keras.Sequential()
model.add(encoder)
model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=16,
    mask_zero=True,
))
model.add(layers.Bidirectional(layers.LSTM(16)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=10,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

In [None]:
# Experiment: bidirectional LSTM with 256 units, 256-d embeddings, 10 epochs.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='Bidi LSTM(256),1 layer,',
)

model = tf.keras.Sequential([
    encoder,
    layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=256,
        mask_zero=True,
    ),
    layers.Bidirectional(layers.LSTM(256)),
    layers.Dense(256, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=10,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

In [None]:
# Experiment: bidirectional LSTM with 256 units, 256-d embeddings, 30 epochs.
# The run name includes the epoch count so it does not collide with the
# 10-epoch run of the same architecture, and the hyperparameters are also
# logged as structured W&B config instead of living only in the free-text name.
tf.keras.backend.clear_session()
wandb.init(project= 'RNN Experiments 1',
           name = 'Bidi LSTM(256),1 layer, 30 epochs',
           config = {
               'rnn': 'Bidirectional LSTM',
               'rnn_units': 256,
               'embedding_dim': 256,
               'dense_units': 256,
               'epochs': 30,
           })
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=256,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy','AUC','Precision','Recall'])

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    df_xtrain['reviewText'],
    df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks = [WandbCallback()])

In [None]:
# Experiment: bidirectional LSTM with 512 units on 256-d embeddings.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='Bidi LSTM(512), Embedding = 256, T= 2000, Len = 400',
)

# Assemble the network layer by layer.
model = tf.keras.Sequential()
model.add(encoder)
model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=256,
    mask_zero=True,
))
model.add(layers.Bidirectional(layers.LSTM(512)))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

In [None]:
# Experiment: bidirectional LSTM with 512 units on 512-d embeddings.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='Bidi LSTM(512), Embedding = 512, T= 2000, Len = 400',
)

model = tf.keras.Sequential([
    encoder,
    layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=512,
        mask_zero=True,
    ),
    layers.Bidirectional(layers.LSTM(512)),
    layers.Dense(512, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

In [None]:
# Experiment: plain (one-directional) LSTM with 256 units on 128-d embeddings.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='LSTM(256), Embedding = 128, T= 2000, Len = 400',
)

# Assemble the network layer by layer.
model = tf.keras.Sequential()
model.add(encoder)
model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=128,
    mask_zero=True,
))
model.add(layers.LSTM(256))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

In [None]:
# Experiment: bidirectional GRU with 512 units on 256-d embeddings.
# NOTE(review): the "60000" in the run name is not derived from any variable
# in this cell - confirm it matches the data actually used for training.
tf.keras.backend.clear_session()
wandb.init(
    project='RNN Experiments 1',
    name='Bidi GRU(512), Embedding = 256, 60000, T= 2000, Len = 400',
)

model = tf.keras.Sequential([
    encoder,
    layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=256,
        mask_zero=True,
    ),
    layers.Bidirectional(layers.GRU(512)),
    layers.Dense(512, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall'],
)

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    x=df_xtrain['reviewText'],
    y=df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks=[WandbCallback()],
)

# Persist the trained model.
# NOTE(review): generic file name - make sure no other cell writes to the same path.
model.save("model_navn.keras")


In [None]:
# Experiment: bidirectional GRU with 512 units on 128-d embeddings, using a
# smaller vocabulary (500 tokens) and shorter sequences (200 tokens).
tf.keras.backend.clear_session()

# The run name advertises T=500 / Len=200, but the shared `encoder` is built
# with different settings. Build a dedicated encoder here so the model really
# matches its label and the cell reproduces under Restart & Run All.
small_max_tokens = 500
small_sentence_length = 200
small_encoder = tf.keras.layers.TextVectorization(
    max_tokens=small_max_tokens,
    output_mode='int',
    output_sequence_length=small_sentence_length)
small_encoder.adapt(df_xtrain['reviewText'].values)

# The name is derived from the constants above so it cannot drift from the model.
# NOTE(review): the "60000" in the run name is not derived from any variable
# in this cell - confirm it matches the data actually used for training.
wandb.init(project= 'RNN Experiments 1',
           name = f'Bidi GRU(512), Embedding = 128, 60000, T= {small_max_tokens}, Len = {small_sentence_length}')
model = tf.keras.Sequential([
    small_encoder,
    tf.keras.layers.Embedding(
        input_dim=len(small_encoder.get_vocabulary()),
        output_dim=128,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512)),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy','AUC','Precision','Recall'])

# Train on the raw review texts; the callback reports the run to W&B.
history = model.fit(
    df_xtrain['reviewText'],
    df_ytrain_one_hot,
    epochs=30,
    validation_data=(df_xval['reviewText'], df_yval_one_hot),
    callbacks = [WandbCallback()])

# Save under an experiment-specific name so this run does not overwrite
# the model written to "model_navn.keras" by the previous experiment.
model.save("model_gru512_emb128_t500_len200.keras")
