In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

merged_df = pd.read_csv('/kaggle/input/finaldf-csv/merged_df.csv')

In [3]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484007 entries, 0 to 484006
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             484007 non-null  float64
 1   title_x            483887 non-null  object 
 2   text               483874 non-null  object 
 3   helpful_vote       484007 non-null  int64  
 4   verified_purchase  484007 non-null  bool   
 5   title_y            483998 non-null  object 
 6   average_rating     484007 non-null  float64
 7   rating_number      484007 non-null  int64  
 8   price              484007 non-null  float64
 9   details            484007 non-null  object 
 10  x_length           484007 non-null  int64  
 11  y_length           484007 non-null  int64  
 12  de_length          484007 non-null  int64  
 13  review_length      484007 non-null  int64  
dtypes: bool(1), float64(3), int64(6), object(4)
memory usage: 48.5+ MB


In [4]:
# Fill NaN values in text columns with an empty string
text_columns = ['title_x', 'text', 'title_y', 'details']
merged_df[text_columns] = merged_df[text_columns].fillna('')

# Check for remaining missing values and handle them
merged_df.fillna(0, inplace=True)

# Now all columns should have the same number of non-null values
print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484007 entries, 0 to 484006
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             484007 non-null  float64
 1   title_x            484007 non-null  object 
 2   text               484007 non-null  object 
 3   helpful_vote       484007 non-null  int64  
 4   verified_purchase  484007 non-null  bool   
 5   title_y            484007 non-null  object 
 6   average_rating     484007 non-null  float64
 7   rating_number      484007 non-null  int64  
 8   price              484007 non-null  float64
 9   details            484007 non-null  object 
 10  x_length           484007 non-null  int64  
 11  y_length           484007 non-null  int64  
 12  de_length          484007 non-null  int64  
 13  review_length      484007 non-null  int64  
dtypes: bool(1), float64(3), int64(6), object(4)
memory usage: 48.5+ MB
None


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import scipy.sparse
from imblearn.over_sampling import RandomOverSampler

# Fill NaN values in text columns with an empty string
text_columns = ['text', 'title_x', 'title_y', 'details']
merged_df[text_columns] = merged_df[text_columns].fillna('')

# Combine text columns into one for TF-IDF vectorization
merged_df['combined_text'] = merged_df['text'] + ' ' + merged_df['title_x'] + ' ' + merged_df['title_y'] + ' ' + merged_df['details']

# Use a smaller sample of the data for experimentation
sample_df = merged_df.sample(frac=0.1, random_state=42)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=300)  # Reduce max features to save memory
tfidf_matrix = tfidf_vectorizer.fit_transform(sample_df['combined_text'])

# Normalize numerical features
numerical_features = sample_df[['price', 'average_rating', 'rating_number', 'helpful_vote', 'x_length', 'y_length', 'de_length']]
scaler = StandardScaler()
normalized_numerical_features = scaler.fit_transform(numerical_features)

# Convert normalized numerical features to sparse matrix
normalized_numerical_features_sparse = scipy.sparse.csr_matrix(normalized_numerical_features)

# Combine TF-IDF features with normalized numerical features (both sparse matrices)
X_sparse = scipy.sparse.hstack([tfidf_matrix, normalized_numerical_features_sparse])

# Define the target variable
y = sample_df['rating'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y, test_size=0.2, random_state=42)

# Apply RandomOverSampler to handle class imbalance
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)  # Use all cores
model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 1.180754178287367


<h2> Using RNNs for sequential Processing of Text Data and Observing difference over treating text as static 

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate

# Sample data
sample_df = merged_df.sample(frac=0.1, random_state=42)

# Fill NaN values in text columns with an empty string
text_columns = ['text', 'title_x']
sample_df[text_columns] = sample_df[text_columns].fillna('')

# Prepare text data
tokenizer = Tokenizer(num_words=10000)  # Adjust vocabulary size as needed
tokenizer.fit_on_texts(sample_df['text'] + ' ' + sample_df['title_x'])

# Convert text to sequences
text_sequences = tokenizer.texts_to_sequences(sample_df['text'])
title_sequences = tokenizer.texts_to_sequences(sample_df['title_x'])

# Pad sequences
max_seq_length = 100  # Adjust sequence length as needed
text_padded = pad_sequences(text_sequences, maxlen=max_seq_length, padding='post')
title_padded = pad_sequences(title_sequences, maxlen=max_seq_length, padding='post')

# Prepare numerical features
numerical_features = sample_df[['price', 'average_rating', 'rating_number', 'helpful_vote', 'x_length', 'y_length', 'de_length']]
scaler = StandardScaler()
normalized_numerical_features = scaler.fit_transform(numerical_features)

# Define the target variable
y = sample_df['rating'].values

# Split the data into training and testing sets
X_text_train, X_text_test, X_title_train, X_title_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    text_padded, title_padded, normalized_numerical_features, y, test_size=0.2, random_state=42)

2024-06-08 19:08:39.794785: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-08 19:08:39.794885: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-08 19:08:39.919852: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Define the text input
text_input = Input(shape=(max_seq_length,), name='text_input')
title_input = Input(shape=(max_seq_length,), name='title_input')

# Embedding layers
embedding_dim = 50  # Adjust embedding dimensions as needed
text_embedding = Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_seq_length)(text_input)
title_embedding = Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_seq_length)(title_input)

# LSTM layers
text_lstm = LSTM(64)(text_embedding)
title_lstm = LSTM(64)(title_embedding)

# Numerical input
numerical_input = Input(shape=(normalized_numerical_features.shape[1],), name='numerical_input')

# Concatenate all features
concatenated = Concatenate()([text_lstm, title_lstm, numerical_input])

# Dense layers
dense = Dense(64, activation='relu')(concatenated)
output = Dense(1)(dense)

# Define the model
model = Model(inputs=[text_input, title_input, numerical_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

# Summary of the model
model.summary()

# Train the model
model.fit(
    [X_text_train, X_title_train, X_num_train], y_train,
    validation_data=([X_text_test, X_title_test, X_num_test], y_test),
    epochs=10, batch_size=32)  # Adjust epochs and batch size as needed

# Predict on the test set
y_pred = model.predict([X_text_test, X_title_test, X_num_test])

# Calculate and print the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")



Epoch 1/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - loss: 2.6064 - val_loss: 1.9668
Epoch 2/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 1.8712 - val_loss: 1.5120
Epoch 3/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 1.0483 - val_loss: 0.7466
Epoch 4/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.5918 - val_loss: 0.7007
Epoch 5/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.4554 - val_loss: 0.7389
Epoch 6/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.3778 - val_loss: 0.7769
Epoch 7/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.3327 - val_loss: 0.8024
Epoch 8/10
[1m1210/1210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.2718 - val_loss: 0.8189
Epoch 9/

<h2> Hyperparameter Tuned Feed Forward Networks </h2>

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from kerastuner.tuners import RandomSearch
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Sample data
sample_df = merged_df.sample(frac=0.1, random_state=42)

# Fill NaN values in text columns with an empty string
text_columns = ['text', 'title_x', 'title_y', 'details']
sample_df[text_columns] = sample_df[text_columns].fillna('')

# Combine text columns into one for TF-IDF vectorization
sample_df['combined_text'] = sample_df['text'] + ' ' + sample_df['title_x'] + ' ' + sample_df['title_y'] + ' ' + sample_df['details']

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(sample_df['combined_text'])

# Normalize numerical features
numerical_features = sample_df[['price', 'average_rating', 'rating_number', 'helpful_vote', 'x_length', 'y_length', 'de_length']]
scaler = StandardScaler()
normalized_numerical_features = scaler.fit_transform(numerical_features)

# Combine TF-IDF features with normalized numerical features
X = np.hstack([tfidf_matrix.toarray(), normalized_numerical_features])

# Define the target variable
y = sample_df['rating'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model-building function
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                    activation='relu',
                    input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_' + str(i), min_value=0.0, max_value=0.5, step=0.1)))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
    return model

# Initialize the Keras Tuner
tuner = RandomSearch(
    build_model,
    objective='mse',
    max_trials=10,  # Increase this for a more thorough search
    executions_per_trial=2,
    directory='my_dir',
    project_name='hyperparam_tuning_ffnn'
)

# Perform the hyperparameter search
tuner.search(X_train, y_train, epochs=10, validation_split=0.2, verbose=1)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal dropout rate is {best_hps.get('dropout')}.
""")

# Build the best model
model = tuner.hypermodel.build(best_hps)

# Train the model
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, verbose=1)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Trial 10 Complete [00h 01m 06s]
mse: 0.3182298094034195

Best mse So Far: 0.0954609289765358
Total elapsed time: 00h 12m 54s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 32 and the optimal dropout rate is 0.0.

Epoch 1/20
[1m968/968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 2.9942 - mse: 2.9942 - val_loss: 0.8039 - val_mse: 0.8039
Epoch 2/20
[1m968/968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.5956 - mse: 0.5956 - val_loss: 0.7447 - val_mse: 0.7447
Epoch 3/20
[1m968/968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.3845 - mse: 0.3845 - val_loss: 0.7734 - val_mse: 0.7734
Epoch 4/20
[1m968/968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.2866 - mse: 0.2866 - val_loss: 0.7612 - val_mse: 0.7612
Epoch 5/20
[1m968/968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.2085 - mse: 0.2085 - v

- Very Similar Peformance between sequential Model and hyper-paramter tuned feed forward Networks

<h1><b>Product Recommendation System using bert Embeddings </b><h1>

In [5]:
import torch
from transformers import BertTokenizer, BertModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


# Select a small subset of the data
subset_df = merged_df.sample(n=10000, random_state=42)

# Combine text columns into one for BERT embeddings
subset_df['combined_text'] = subset_df['text'] + ' ' + subset_df['title_x'] + ' ' + subset_df['title_y'] + ' ' + subset_df['details']

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
    return embeddings

# Generate BERT embeddings for the text data
bert_embeddings = np.vstack([get_bert_embeddings(text) for text in subset_df['combined_text']])

# Normalize numerical features
numerical_features = subset_df[['price', 'average_rating', 'rating_number', 'helpful_vote', 'x_length', 'y_length', 'de_length', 'review_length']]
scaler = StandardScaler()
normalized_numerical_features = scaler.fit_transform(numerical_features)

# Combine BERT embeddings with normalized numerical features
X = np.hstack([bert_embeddings, normalized_numerical_features])

# Define the target variable
y = subset_df['rating'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the input data to dense format
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Build the regression model
input_shape = X_train.shape[1]
model = Sequential([
    Input(shape=(input_shape,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, batch_size=32, verbose=1)

# Predict ratings for all products
predicted_ratings = model.predict(X).flatten()

# Combine BERT embeddings, normalized numerical features, and predicted ratings
X_with_ratings = np.hstack([bert_embeddings, normalized_numerical_features, predicted_ratings.reshape(-1, 1)])

# Calculate cosine similarity between products
cosine_sim_with_ratings = cosine_similarity(X_with_ratings, X_with_ratings)

# Function to get top K similar products considering predicted ratings
def get_similar_products(product_idx, k=5):
    sim_scores = list(enumerate(cosine_sim_with_ratings[product_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:k+1]  # Exclude the product itself
    similar_products = [(subset_df.iloc[i[0]]['title_y'], i[1]) for i in sim_scores]
    return similar_products

# Example usage
example_product_idx = 0  # Using the first product as an example
similar_products = get_similar_products(example_product_idx, k=5)
print(f"Top 5 similar products to '{subset_df.iloc[example_product_idx]['title_y']}':")
for product, score in similar_products:
    print(f"Product: {product}, Similarity Score: {score}")

2024-06-09 19:36:43.763201: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 19:36:43.763316: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 19:36:44.011892: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/20
[1m106/200[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 1ms/step - loss: 5.9849

I0000 00:00:1717963551.967739     143 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 4.5002 - val_loss: 1.7652
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.6832 - val_loss: 1.6377
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.5253 - val_loss: 1.5495
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.5139 - val_loss: 1.5231
Epoch 5/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.4428 - val_loss: 1.6609
Epoch 6/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.3497 - val_loss: 1.9762
Epoch 7/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.3966 - val_loss: 1.3653
Epoch 8/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.3365 - val_loss: 1.3940
Epoch 9/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━