# **Model Trained with Balanced Data and Rating Scale (1-6)**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# 1. Load Dataset
# All three CSVs are parsed with the same options: ';'-separated, latin-1
# encoded, and rows that cannot be parsed are skipped instead of raising.
read_options = {"sep": ';', "encoding": 'latin-1', "on_bad_lines": 'skip'}
users = pd.read_csv("BX-Users.csv", **read_options)
books = pd.read_csv("BX_Books.csv", **read_options)
ratings = pd.read_csv("BX-Book-Ratings.csv", **read_options)

# 2. Merge ratings with user and book features
# Both merges use pandas' default join, so a rating is kept only when its
# User-ID and its ISBN are found in the users and books tables respectively.
ratings_with_users = pd.merge(ratings, users, on="User-ID")
data = pd.merge(ratings_with_users, books, on="ISBN")

# Drop unnecessary features
data = data.drop(columns=["ISBN", "Publisher"])

# 3. Handle Missing Values
# Numeric columns fall back to their own median ...
for numeric_column in ("Age", "Year-Of-Publication"):
    column_median = data[numeric_column].median()
    data[numeric_column] = data[numeric_column].fillna(column_median)

# ... and text columns fall back to the placeholder "Unknown".
for text_column in ("Location", "Book-Title", "Book-Author"):
    data[text_column] = data[text_column].fillna("Unknown")

# 4. Encode Features
# Map raw user IDs and book titles to dense integer indices; these indices are
# what the embedding layers of the model look up.
le_user = LabelEncoder()
le_book = LabelEncoder()

data["User-ID"] = le_user.fit_transform(data["User-ID"])
data["Book-ID"] = le_book.fit_transform(data["Book-Title"]) + 1  # Ensure IDs start from 1

# Normalize numerical features
# Each feature gets its own scaler. Refitting a single MinMaxScaler on the
# second column would overwrite the min/max learned for the first one, making
# it impossible to transform new Age values (or invert the scaling) later.
age_scaler = MinMaxScaler()
year_scaler = MinMaxScaler()

# MinMaxScaler expects a 2-D (n_samples, 1) array; ravel() turns the result
# back into a 1-D array before it is stored as a column.
data["Age"] = age_scaler.fit_transform(data["Age"].values.reshape(-1, 1)).ravel()
data["Year-Of-Publication"] = year_scaler.fit_transform(
    data["Year-Of-Publication"].values.reshape(-1, 1)
).ravel()

# Backward-compatible alias: `scaler` previously ended up fitted on
# Year-Of-Publication, so it keeps pointing at that scaler.
scaler = year_scaler

# Label ratings as 0 or 1: ratings strictly above 6 become 1, everything else 0.
# NOTE(review): the notebook title mentions a 1-6 rating scale, but no rows are
# filtered by rating here, so every rating <= 6 (including 0, if present) is
# labelled 0 - confirm this is intended.
data["Recommendation"] = (data["Book-Rating"] > 6).astype(int)

# 5. Balance the Dataset
class_counts = data["Recommendation"].value_counts()
print("Class Distribution Before Balancing:")
print(class_counts)

positive_class = data[data["Recommendation"] == 1]
negative_class = data[data["Recommendation"] == 0]

# Downsample whichever class is larger so both end up the same size.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so the majority class must be the one that
# is sampled. When negatives are not fewer than positives this is exactly the
# original behaviour: negatives are sampled down, positives are kept as-is.
if len(negative_class) >= len(positive_class):
    negative_class_downsampled = negative_class.sample(n=len(positive_class), random_state=42)
    positive_class_balanced = positive_class
else:
    # Negatives are the minority here: keep them all (the variable name is
    # retained for compatibility) and sample the positives down instead.
    negative_class_downsampled = negative_class
    positive_class_balanced = positive_class.sample(n=len(negative_class), random_state=42)

# Combine the balanced classes
balanced_data = pd.concat([positive_class_balanced, negative_class_downsampled])

# Shuffle the dataset so the two classes are interleaved, then renumber rows
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print("Class Distribution After Balancing:")
print(balanced_data["Recommendation"].value_counts())

# 6. Split Data (Without Age)
# Columns fed to the model; "Age" is deliberately left out of the feature set.
feature_columns = [
    "User-ID",
    "Location",
    "Book-ID",
    "Book-Title",
    "Book-Author",
    "Year-Of-Publication",
]
X = balanced_data[feature_columns]
y = balanced_data["Recommendation"]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Preprocess Text Features
def _vectorize_text(train_text, test_text):
    """Fit a TextVectorization layer on the training text and encode both splits.

    The vocabulary (capped at 5000 tokens) is learned from ``train_text`` only;
    every string is encoded as a sequence of exactly 10 token ids.

    Returns a ``(vectorizer, train_tokens, test_tokens)`` tuple.
    """
    vectorizer = tf.keras.layers.TextVectorization(max_tokens=5000, output_sequence_length=10)
    vectorizer.adapt(train_text)
    return vectorizer, vectorizer(train_text), vectorizer(test_text)


# Location (the vectorizer keeps its original name `text_vectorizer`)
text_vectorizer, location_train, location_test = _vectorize_text(
    X_train["Location"], X_test["Location"]
)

# Book title
title_vectorizer, title_train, title_test = _vectorize_text(
    X_train["Book-Title"], X_test["Book-Title"]
)

# Book author
author_vectorizer, author_train, author_test = _vectorize_text(
    X_train["Book-Author"], X_test["Book-Author"]
)

# 8. Define Model (Without Age)
layers = tf.keras.layers

# --- Inputs: one scalar per ID / year, ten token ids per text feature ---
user_input = layers.Input(shape=(1,), name="User-ID")
location_input = layers.Input(shape=(10,), name="Location")
book_input = layers.Input(shape=(1,), name="Book-ID")
title_input = layers.Input(shape=(10,), name="Book-Title")
author_input = layers.Input(shape=(10,), name="Book-Author")
year_input = layers.Input(shape=(1,), name="Year-Of-Publication")

# --- ID embeddings (16-d) ---
# Book IDs start at 1, so the book table needs one extra row for index 0.
num_users = data["User-ID"].nunique()
num_books = data["Book-ID"].nunique() + 1
user_embedding = layers.Embedding(input_dim=num_users, output_dim=16)(user_input)
book_embedding = layers.Embedding(input_dim=num_books, output_dim=16)(book_input)

user_flatten = layers.Flatten()(user_embedding)
book_flatten = layers.Flatten()(book_embedding)

# --- Text token embeddings (8-d per token, 5000-token vocabulary) ---
location_embedding = layers.Embedding(input_dim=5000, output_dim=8)(location_input)
title_embedding = layers.Embedding(input_dim=5000, output_dim=8)(title_input)
author_embedding = layers.Embedding(input_dim=5000, output_dim=8)(author_input)

location_flatten = layers.Flatten()(location_embedding)
title_flatten = layers.Flatten()(title_embedding)
author_flatten = layers.Flatten()(author_embedding)

# --- Merge every feature into a single vector; the year is used as-is ---
merged_features = [
    user_flatten,
    location_flatten,
    book_flatten,
    title_flatten,
    author_flatten,
    year_input,
]
concat = layers.Concatenate()(merged_features)

# --- Classification head: two ReLU layers, then a sigmoid probability ---
dense1 = layers.Dense(128, activation="relu")(concat)
dense2 = layers.Dense(64, activation="relu")(dense1)
output = layers.Dense(1, activation="sigmoid")(dense2)

# The order of `inputs` fixes the order in which arrays must be supplied
# to fit / evaluate / predict.
model_inputs = [user_input, location_input, book_input, title_input, author_input, year_input]
model = tf.keras.Model(inputs=model_inputs, outputs=output)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 9. Prepare Data for Model (Without Age)
# One array per model input, in the same order as the `inputs=` list given to
# tf.keras.Model: user, location, book, title, author, year.
# Every DataFrame column is converted with `.values` so the model receives
# plain NumPy arrays throughout, rather than a mix of pandas Series (which
# carry the shuffled row index) and NumPy arrays.
X_train_prepared = [
    X_train["User-ID"].values,
    location_train,
    X_train["Book-ID"].values,
    title_train,
    author_train,
    X_train["Year-Of-Publication"].values,
]

X_test_prepared = [
    X_test["User-ID"].values,
    location_test,
    X_test["Book-ID"].values,
    title_test,
    author_test,
    X_test["Year-Of-Publication"].values,
]

# 10. Train Model
# The recorded run shows validation loss rising from the second epoch onward
# while training loss keeps falling, i.e. the model overfits quickly. Stop as
# soon as validation loss has not improved for two epochs and roll the weights
# back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

# Validation uses the last 10% of the training arrays (`validation_split`)
# instead of the test set, so the test set stays untouched by model selection
# and the metrics computed on it afterwards remain an unbiased estimate.
model.fit(
    X_train_prepared,
    y_train,
    validation_split=0.1,
    epochs=10,  # upper bound; early stopping usually ends training sooner
    batch_size=32,
    callbacks=[early_stopping],
)


Class Distribution Before Balancing:
Recommendation
0    740958
1    290217
Name: count, dtype: int64
Class Distribution After Balancing:
Recommendation
0    290217
1    290217
Name: count, dtype: int64
Epoch 1/10
[1m14511/14511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m738s[0m 51ms/step - accuracy: 0.6619 - loss: 0.5980 - val_accuracy: 0.6996 - val_loss: 0.5631
Epoch 2/10
[1m14511/14511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m738s[0m 51ms/step - accuracy: 0.7774 - loss: 0.4635 - val_accuracy: 0.6745 - val_loss: 0.6148
Epoch 3/10
[1m14511/14511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m725s[0m 50ms/step - accuracy: 0.8262 - loss: 0.3833 - val_accuracy: 0.6698 - val_loss: 0.6414
Epoch 4/10
[1m14511/14511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m783s[0m 53ms/step - accuracy: 0.8529 - loss: 0.3226 - val_accuracy: 0.6724 - val_loss: 0.7077
Epoch 5/10
[1m14511/14511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m763s[0m 50ms/step - accuracy: 0.8727 - loss: 0

<keras.src.callbacks.history.History at 0x7c6314683dc0>

In [None]:

# 11. Evaluate Model
# evaluate() yields the loss followed by the compiled metric (accuracy),
# both measured on the held-out test inputs.
evaluation_results = model.evaluate(X_test_prepared, y_test)
loss, accuracy = evaluation_results
print(f"Test Accuracy: {accuracy:.2f}")

[1m3628/3628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6456 - loss: 1.1444
Test Accuracy: 0.64


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, average_precision_score
import numpy as np

# Make predictions
# The model ends in a single sigmoid unit, so predict() returns one
# probability per row as an (n, 1) column; flatten it to 1-D so it lines up
# with y_test for the metric functions below.
y_pred_probs = model.predict(X_test_prepared).ravel()  # Predicted probabilities
y_pred = (y_pred_probs > 0.5).astype(int)  # Threshold at 0.5 to get binary labels

# Precision: share of predicted positives that are truly positive
precision = precision_score(y_test, y_pred, average='binary')  # Use 'micro', 'macro', or 'weighted' for multi-class
print(f"Precision: {precision:.2f}")

# Recall: share of true positives that were predicted positive
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.2f}")

# F1 Score: harmonic mean of precision and recall
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1 Score: {f1:.2f}")

# Confusion Matrix (rows: true class 0/1, columns: predicted class 0/1)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Average Precision (AP)
# average_precision_score summarises the precision-recall curve of this one
# binary problem from the raw probabilities. It is a single AP value, not a
# Mean Average Precision averaged over users or queries, so it is labelled AP.
average_precision = average_precision_score(y_test, y_pred_probs)
print(f"Average Precision (AP): {average_precision:.2f}")


[1m3628/3628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step
Precision: 0.64
Recall: 0.66
F1 Score: 0.65
Confusion Matrix:
[[36362 21683]
 [19720 38322]]
Mean Average Precision (MAP): 0.67
