**Improved Accuracy Test**

In [None]:
!pip install tensorflow



# **NCF Model**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model

# Read the three CSV tables: users, products, and user-product interactions
users, products, interactions = (
    pd.read_csv(csv_path)
    for csv_path in ('users.csv', 'products.csv', 'interactions.csv')
)

# Every observed (user, product) pair is a positive example (Label = 1)
positive_samples = interactions[['User_ID', 'Product_ID']].assign(Label=1)

# Create negative samples
# For every positive row, draw up to NEGATIVES_PER_POSITIVE distinct products
# the user has never interacted with (Label = 0).
#  * The per-user candidate pool is computed once per user instead of
#    re-filtering the whole products table for every positive row.
#  * A seeded generator makes the sampling (and the shuffle below) reproducible.
#  * Users with fewer unseen products than requested get as many as exist
#    instead of raising a "larger sample than population" error.
NEGATIVES_PER_POSITIVE = 3
SAMPLING_SEED = 42
sampling_rng = np.random.default_rng(SAMPLING_SEED)

all_product_ids = products['Product_ID'].unique()
seen_products_by_user = interactions.groupby('User_ID')['Product_ID'].unique()

negative_samples_list = []
for user_id, user_positives in positive_samples.groupby('User_ID', sort=False):
    # Products this user has not interacted with
    candidate_ids = np.setdiff1d(all_product_ids, seen_products_by_user[user_id])
    n_draw = min(NEGATIVES_PER_POSITIVE, len(candidate_ids))
    if n_draw == 0:
        continue
    # One independent draw (without replacement) per positive row of this user
    for _ in range(len(user_positives)):
        for neg_prod in sampling_rng.choice(candidate_ids, size=n_draw, replace=False):
            negative_samples_list.append([user_id, neg_prod, 0])
negative_samples = pd.DataFrame(negative_samples_list, columns=['User_ID', 'Product_ID', 'Label'])

# Combine and shuffle data (seeded so the train/test split is reproducible end to end)
all_samples = (
    pd.concat([positive_samples, negative_samples])
    .sample(frac=1, random_state=SAMPLING_SEED)
    .reset_index(drop=True)
)
train, test = train_test_split(all_samples, test_size=0.2, random_state=42)

# Feature preprocessing

# --- User features ---
# Gender: fixed string -> integer code (values outside the mapping become NaN)
gender_mapping = {'Male': 0, 'Female': 1, 'Other': 2}
users['Gender'] = users['Gender'].map(gender_mapping)

# Location: one integer code per distinct value, numbered in order of first appearance
location_values = users['Location'].unique()
location_mapping = dict(zip(location_values, range(len(location_values))))
users['Location'] = users['Location'].map(location_mapping)

# Age: min-max scaled
users['Age'] = MinMaxScaler().fit_transform(users['Age'].to_numpy().reshape(-1, 1))

# --- Product features ---
# Category: one integer code per distinct value, numbered in order of first appearance
category_values = products['Category'].unique()
category_mapping = dict(zip(category_values, range(len(category_values))))
products['Category'] = products['Category'].map(category_mapping)

# Price and average rating: each min-max scaled with its own scaler
for numeric_col in ('Price', 'Avg_Rating'):
    products[numeric_col] = MinMaxScaler().fit_transform(
        products[numeric_col].to_numpy().reshape(-1, 1)
    )

# Define model parameters
# The user/product embedding tables are indexed directly by the raw IDs and are
# created with NUM_USERS + 1 / NUM_PRODUCTS + 1 rows, so these bounds must be at
# least the largest ID that can be fed to the model. Deriving them from the
# loaded tables avoids out-of-range embedding lookups when the dataset is
# larger than a hard-coded size.
NUM_USERS = int(interactions['User_ID'].max())
# Negative samples are drawn from `products`, so cover both tables.
NUM_PRODUCTS = int(max(interactions['Product_ID'].max(), products['Product_ID'].max()))
EMBEDDING_SIZE = 20           # width of the user and product ID embeddings
EMBEDDING_SIZE_GENDER = 2     # width of the gender embedding
EMBEDDING_SIZE_LOCATION = 5   # width of the location embedding
EMBEDDING_SIZE_CATEGORY = 5   # width of the product-category embedding

# Model inputs
# One scalar Input per feature, created in the order the names are listed.
(
    user_input,
    product_input,
    user_age_input,
    user_gender_input,
    user_location_input,
    product_price_input,
    product_rating_input,
    product_category_input,
) = (Input(shape=(1,)) for _ in range(8))



In [None]:
# Embeddings
def _embed(feature_input, vocab_size, embedding_dim):
    """Create an Embedding of the given size and apply it to `feature_input`."""
    return Embedding(input_dim=vocab_size, output_dim=embedding_dim)(feature_input)


user_embedding = _embed(user_input, NUM_USERS + 1, EMBEDDING_SIZE)
product_embedding = _embed(product_input, NUM_PRODUCTS + 1, EMBEDDING_SIZE)
user_gender_embedding = _embed(user_gender_input, 3, EMBEDDING_SIZE_GENDER)
user_location_embedding = _embed(
    user_location_input, users['Location'].nunique() + 1, EMBEDDING_SIZE_LOCATION
)
product_category_embedding = _embed(
    product_category_input, products['Category'].nunique() + 1, EMBEDDING_SIZE_CATEGORY
)

# Flatten embeddings
# Each embedding output gets its own Flatten layer, applied in the order listed.
(
    user_embedding,
    product_embedding,
    user_gender_embedding,
    user_location_embedding,
    product_category_embedding,
) = (
    Flatten()(embedded)
    for embedded in (
        user_embedding,
        product_embedding,
        user_gender_embedding,
        user_location_embedding,
        product_category_embedding,
    )
)

# Concatenate all features
# Flattened embeddings and raw numeric inputs are joined into one feature vector.
feature_tensors = [
    user_embedding,
    product_embedding,
    user_age_input,
    user_gender_embedding,
    user_location_embedding,
    product_price_input,
    product_rating_input,
    product_category_embedding,
]
all_features = Concatenate()(feature_tensors)

# MLP head: 128 -> dropout -> 64 -> sigmoid output
hidden = Dense(128, activation='relu')(all_features)
hidden = Dropout(0.2)(hidden)
x = Dense(64, activation='relu')(hidden)
output = Dense(1, activation='sigmoid')(x)

In [None]:
# Build and compile model
model_inputs = [
    user_input,
    product_input,
    user_age_input,
    user_gender_input,
    user_location_input,
    product_price_input,
    product_rating_input,
    product_category_input,
]
model = Model(inputs=model_inputs, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): side features are looked up with `ID - 1` as the row label, which
# presumes row i of `users` / `products` describes ID i + 1 - confirm against the CSVs.
train_user_rows = train['User_ID'] - 1
train_product_rows = train['Product_ID'] - 1
train_features = [
    train['User_ID'].values,
    train['Product_ID'].values,
    users.loc[train_user_rows, 'Age'].values,
    users.loc[train_user_rows, 'Gender'].values,
    users.loc[train_user_rows, 'Location'].values,
    products.loc[train_product_rows, 'Price'].values,
    products.loc[train_product_rows, 'Avg_Rating'].values,
    products.loc[train_product_rows, 'Category'].values,
]
model.fit(train_features, train['Label'].values, epochs=5, batch_size=256, validation_split=0.1)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7b4bb898fc70>

In [None]:
# Evaluate
# NOTE(review): as in training, `ID - 1` is used as the row label into
# `users` / `products` - confirm the tables are ordered that way.
test_user_rows = test['User_ID'] - 1
test_product_rows = test['Product_ID'] - 1
test_features = [
    test['User_ID'].values,
    test['Product_ID'].values,
    users.loc[test_user_rows, 'Age'].values,
    users.loc[test_user_rows, 'Gender'].values,
    users.loc[test_user_rows, 'Location'].values,
    products.loc[test_product_rows, 'Price'].values,
    products.loc[test_product_rows, 'Avg_Rating'].values,
    products.loc[test_product_rows, 'Category'].values,
]
test_loss, test_acc = model.evaluate(test_features, test['Label'].values)
print(f"Test Accuracy: {test_acc}")


Test Accuracy: 0.6969000101089478


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,f1_score,precision_score,recall_score,explained_variance_score

# Get predictions for the test set using the trained NCF model
# NOTE(review): `ID - 1` is used as the row label into `users` / `products`,
# as in the training cell - confirm the tables are ordered that way.
eval_user_rows = test['User_ID'] - 1
eval_product_rows = test['Product_ID'] - 1
ncf_predictions = model.predict([
    test['User_ID'].values,
    test['Product_ID'].values,
    users.loc[eval_user_rows, 'Age'].values,
    users.loc[eval_user_rows, 'Gender'].values,
    users.loc[eval_user_rows, 'Location'].values,
    products.loc[eval_product_rows, 'Price'].values,
    products.loc[eval_product_rows, 'Avg_Rating'].values,
    products.loc[eval_product_rows, 'Category'].values,
])

ncf_true = test['Label'].values
# The model emits one sigmoid score per row with shape (n, 1); flatten for the metrics.
ncf_scores = ncf_predictions.ravel()

# Error metrics on the raw scores. The root is taken explicitly because the
# `squared=False` argument is no longer accepted by recent scikit-learn releases.
ncf_mse = mean_squared_error(ncf_true, ncf_scores)
ncf_rmse = ncf_mse ** 0.5
ncf_mae = mean_absolute_error(ncf_true, ncf_scores)


# Compute Precision, Recall, and F1-Score
# Scores above 0.5 are treated as predicted positives.
ncf_pred_labels = (ncf_scores > 0.5).astype(int)
ncf_precision = precision_score(ncf_true, ncf_pred_labels)
ncf_recall = recall_score(ncf_true, ncf_pred_labels)
ncf_f1 = f1_score(ncf_true, ncf_pred_labels)


# Compute Explained Variance Score
ncf_explained_variance = explained_variance_score(ncf_true, ncf_scores)

print(f"NCF - RMSE: {ncf_rmse}, MAE: {ncf_mae},Explained varience score: {ncf_explained_variance}")
print(f"NCF - Precision: {ncf_precision}, Recall: {ncf_recall}, F1: {ncf_f1}")
print()





NCF - RMSE: 0.4648408552039187, MAE: 0.38347412329044384,Explained varience score: -0.16207979566476283



# **Multiple Model Testing**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score, precision_score,recall_score

# def evaluate_model(predictions, y_test):
#     """Evaluate the model and return RMSE, MAE, R^2, and MSE"""
#     rmse = mean_squared_error(y_test, predictions, squared=False)
#     mae = mean_absolute_error(y_test, predictions)

#     r2 = r2_score(y_test, predictions)
#     mse = mean_squared_error(y_test, predictions)
#     return rmse, mae

def evaluate_model(predictions, y_test, threshold=3):
    """Score rating predictions with regression and thresholded classification metrics.

    Args:
        predictions: Iterable of predicted ratings, aligned with ``y_test``.
        y_test: Iterable of true ratings.
        threshold: A rating strictly greater than this value counts as a
            positive when computing precision, recall and F1.

    Returns:
        Tuple ``(rmse, mae, precision, recall, f1, explained_variance)``.
    """
    # Imported locally so the function does not rely on another cell having
    # imported f1_score / explained_variance_score beforehand.
    from sklearn.metrics import (
        explained_variance_score,
        f1_score,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        recall_score,
    )

    # Take the root explicitly: the `squared=False` argument is no longer
    # accepted by recent scikit-learn releases.
    rmse = mean_squared_error(y_test, predictions) ** 0.5
    mae = mean_absolute_error(y_test, predictions)

    # Convert ratings into binary labels based on threshold
    y_true_bin = [1 if i > threshold else 0 for i in y_test]
    predictions_bin = [1 if i > threshold else 0 for i in predictions]

    precision = precision_score(y_true_bin, predictions_bin)
    recall = recall_score(y_true_bin, predictions_bin)
    f1 = f1_score(y_true_bin, predictions_bin)
    explained_variance = explained_variance_score(y_test, predictions)

    return rmse, mae, precision, recall, f1, explained_variance

# This function can now be used to evaluate any model's predictions on the test set.


# Load datasets
users_data = pd.read_csv('users.csv')
products_data = pd.read_csv('products.csv')
interactions_data = pd.read_csv('interactions.csv')

# Merge datasets: one row per interaction, enriched with user and product attributes
merged_data = interactions_data.merge(users_data, on='User_ID').merge(products_data, on='Product_ID')

# Label encode categorical columns (each column gets its own encoder)
label_columns = ['Gender', 'Location', 'Interests', 'Category', 'Product_Name', 'Interaction_Type']
for col in label_columns:
    le = LabelEncoder()
    merged_data[col] = le.fit_transform(merged_data[col])

# Fill missing ratings with the average rating of the product.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `merged_data['Rating']` is chained assignment, which does not update
# `merged_data` when pandas Copy-on-Write is active.
merged_data['Rating'] = merged_data['Rating'].fillna(merged_data['Avg_Rating'])

# Define X (features) and y (target)
X = merged_data.drop(columns=['User_ID', 'Product_ID', 'Rating'])
y = merged_data['Rating']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
# RMSE as the root of the MSE; the `squared=False` argument is no longer
# accepted by recent scikit-learn releases.
lr_rmse = mean_squared_error(y_test, lr_predictions) ** 0.5


# Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
# RMSE as the root of the MSE; the `squared=False` argument is no longer
# accepted by recent scikit-learn releases.
rf_rmse = mean_squared_error(y_test, rf_predictions) ** 0.5


# XGBoost Regressor
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.8
}
xgb_model = xgb.train(params, dtrain, num_boost_round=100)
xgb_predictions = xgb_model.predict(dtest)
# RMSE as the root of the MSE; the `squared=False` argument is no longer
# accepted by recent scikit-learn releases.
xgb_rmse = mean_squared_error(y_test, xgb_predictions) ** 0.5




# Evaluate Linear Regression
(
    lr_rmse,
    lr_mae,
    lr_precision,
    lr_recall,
    lr_f1_score,
    lr_explained_varience,
) = evaluate_model(lr_predictions, y_test)
print(f"Linear Regression - RMSE: {lr_rmse}, MAE: {lr_mae}, Precision: {lr_precision}, Recall: {lr_recall}, F1: {lr_f1_score}, Explained Varience: {lr_explained_varience}")

# Evaluate Random Forest Regressor
(
    rf_rmse,
    rf_mae,
    rf_precision,
    rf_recall,
    rf_f1_score,
    rf_explained_varience,
) = evaluate_model(rf_predictions, y_test)
print(f"Random Forest - RMSE: {rf_rmse}, MAE: {rf_mae}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1_score}, Explained Varience: {rf_explained_varience}")

# Evaluate XGBoost Regressor
(
    xgb_rmse,
    xgb_mae,
    xgb_precision,
    xgb_recall,
    xgb_f1_score,
    xgb_explained_varience,
) = evaluate_model(xgb_predictions, y_test)
print(f"XGBoost - RMSE: {xgb_rmse}, MAE: {xgb_mae}, Precision: {xgb_precision}, Recall: {xgb_recall}, F1: {xgb_f1_score}, Explained Varience: {xgb_explained_varience}")


Linear Regression - RMSE: 0.9847684571436717, MAE: 0.6756059179364098, Precision: 0.7956577266922095, Recall: 0.8604972375690608, F1: 0.8268082282680823, Explained Varience: 0.38302644177626866
Random Forest - RMSE: 0.851680050464725, MAE: 0.427636974761906, Precision: 0.7945083267248215, Recall: 0.8515724606884828, F1: 0.8220512820512821, Explained Varience: 0.5385333752680524
XGBoost - RMSE: 0.821899789140935, MAE: 0.42852632621455194, Precision: 0.8173658943853058, Recall: 0.8321291967700808, F1: 0.8246814783615879, Explained Varience: 0.5702404741395826


# **Using DeepMF and XGBoost**

In [None]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate

# Load datasets
users_data, products_data, interactions_data = (
    pd.read_csv(csv_path)
    for csv_path in ('users.csv', 'products.csv', 'interactions.csv')
)

# Relabel User_ID and Product_ID to start from 0
# The encoders are fitted on the users / products tables; the interactions
# table is then transformed with the same fitted encoders.
user_encoder = LabelEncoder().fit(users_data['User_ID'])
product_encoder = LabelEncoder().fit(products_data['Product_ID'])

users_data['User_ID'] = user_encoder.transform(users_data['User_ID'])
products_data['Product_ID'] = product_encoder.transform(products_data['Product_ID'])
interactions_data['User_ID'] = user_encoder.transform(interactions_data['User_ID'])
interactions_data['Product_ID'] = product_encoder.transform(interactions_data['Product_ID'])

# Merge datasets: attach user attributes, then product attributes, to each interaction
merged_data = interactions_data.merge(users_data, on='User_ID')
merged_data = merged_data.merge(products_data, on='Product_ID')

# Label encode other categorical columns (each column gets its own encoder)
label_columns = ['Gender', 'Location', 'Interests', 'Category', 'Product_Name', 'Interaction_Type']
for col in label_columns:
    le = LabelEncoder()
    merged_data[col] = le.fit_transform(merged_data[col])

# Handle NaN or infinite values in the 'Rating' column
# Infinite ratings are dropped first so they cannot distort the mean used for
# imputation; `.copy()` makes the filtered frame independent so the column
# assignment below does not trigger chained-assignment warnings.
merged_data = merged_data[~np.isinf(merged_data['Rating'])].copy()  # Remove rows with infinite values
# Assign the filled column back: fillna(inplace=True) on `merged_data['Rating']`
# is chained assignment and does not update `merged_data` under pandas Copy-on-Write.
merged_data['Rating'] = merged_data['Rating'].fillna(merged_data['Rating'].mean())  # Replace NaN with the mean rating

# Split the data
# Target is the rating; every other merged column (including the IDs) stays in X
y = merged_data['Rating']
X = merged_data.drop(columns=['Rating'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Model
# For XGBoost, we don't need User_ID and Product_ID
id_columns = ['User_ID', 'Product_ID']
X_train_xgb = X_train.drop(columns=id_columns)
X_test_xgb = X_test.drop(columns=id_columns)

dtrain = xgb.DMatrix(X_train_xgb, label=y_train)
dtest = xgb.DMatrix(X_test_xgb, label=y_test)

# Booster configuration: squared-error regression evaluated with RMSE
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
}
xgb_model = xgb.train(params, dtrain, num_boost_round=100)
xgb_predictions = xgb_model.predict(dtest)

# Deep Matrix Factorization (DeepMF) Model
# User_ID / Product_ID were label-encoded against the full users / products
# tables, so the encoded IDs range over every known user / product - not only
# those that appear in `merged_data`. The embedding tables are sized from the
# encoders so that no encoded ID falls outside the table.
n_users = len(user_encoder.classes_)
n_products = len(product_encoder.classes_)
embedding_size = 10

user_input = Input(shape=[1], name='User')
product_input = Input(shape=[1], name='Product')

user_embedding = Embedding(n_users, embedding_size, name='User-Embedding')(user_input)
product_embedding = Embedding(n_products, embedding_size, name='Product-Embedding')(product_input)

user_vec = Flatten(name='Flatten-Users')(user_embedding)
product_vec = Flatten(name='Flatten-Products')(product_embedding)
# Dot product of the two embeddings
mf_vec = Dot(name='Dot-Product', axes=1)([user_vec, product_vec])

# The raw embeddings and their dot product feed a small MLP with one linear output unit
concat = Concatenate()([user_vec, product_vec, mf_vec])
dense = Dense(128, activation='relu')(concat)
dense = Dense(64, activation='relu')(dense)
output = Dense(1)(dense)

deepmf_model = Model([user_input, product_input], output)
deepmf_model.compile(optimizer='adam', loss='mean_squared_error')
deepmf_model.fit([X_train['User_ID'], X_train['Product_ID']], y_train, epochs=10, batch_size=64)
deepmf_predictions = deepmf_model.predict([X_test['User_ID'], X_test['Product_ID']])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score,recall_score

# RMSE is computed as the root of the MSE; the `squared=False` argument is no
# longer accepted by recent scikit-learn releases.

# Evaluation for XGBoost
xgb_rmse = mean_squared_error(y_test, xgb_predictions) ** 0.5
xgb_mae = mean_absolute_error(y_test, xgb_predictions)

# Evaluation for DeepMF
deepmf_rmse = mean_squared_error(y_test, deepmf_predictions) ** 0.5
deepmf_mae = mean_absolute_error(y_test, deepmf_predictions)

from sklearn.metrics import f1_score, explained_variance_score

# Convert ratings into binary labels based on a threshold of 3
def _to_binary(ratings):
    """Return 1 for every rating strictly above 3, otherwise 0."""
    return [1 if rating > 3 else 0 for rating in ratings]


y_test_bin = _to_binary(y_test)
xgb_predictions_bin = _to_binary(xgb_predictions)
deepmf_predictions_bin = _to_binary(deepmf_predictions)

# Compute Precision, Recall, and F1-Score for XGBoost
xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test_bin, xgb_predictions_bin)
    for metric in (precision_score, recall_score, f1_score)
)
xgb_explained_variance = explained_variance_score(y_test, xgb_predictions)

# Compute Precision, Recall, and F1-Score for DeepMF
deepmf_precision, deepmf_recall, deepmf_f1 = (
    metric(y_test_bin, deepmf_predictions_bin)
    for metric in (precision_score, recall_score, f1_score)
)
deepmf_explained_variance = explained_variance_score(y_test, deepmf_predictions)

# Print the metrics
print(f"XGBoost - RMSE: {xgb_rmse}, MAE: {xgb_mae}, Precision: {xgb_precision}, Recall: {xgb_recall}, F1: {xgb_f1}, Explained Variance: {xgb_explained_variance}")
print(f"DeepMF - RMSE: {deepmf_rmse}, MAE: {deepmf_mae}, Precision: {deepmf_precision}, Recall: {deepmf_recall}, F1: {deepmf_f1}, Explained Variance: {deepmf_explained_variance}")






XGBoost - RMSE: 0.821899789140935, MAE: 0.42852632621455194, Precision: 0.8173658943853058, Recall: 0.8321291967700808, F1: 0.8246814783615879, Explained Variance: 0.5702404741395826
DeepMF - RMSE: 1.4174407929851174, MAE: 1.1808177417622805, Precision: 0.4731465880370682, Recall: 0.47736931576710584, F1: 0.4752485720330019, Explained Variance: -0.2781748519074567
