Explication : L'ago doit prédire les notes d'un utilisateur pour des livres qu'il n'a pas encore notés. Ce système prend en compte les notes que l'utilisateur a attribuées à ses livres pour faire des prédictions. Les notes qu'un utilisateur a attribuées à ses livres sont utilisées pour faire des prédictions sur les livres qu'il pourrait aimer à l'avenir. De plus, parce que la SVD prend en compte les notes de tous les utilisateurs, les prédictions sont également influencées par les préférences des utilisateurs similaires

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from scipy import stats
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import ast
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import process, fuzz
from IPython.display import clear_output
from data_db import user, mdp



## PREPROCESSING

In [2]:
book_data = pd.read_csv("books_metadata_Amazon.csv", delimiter=',', on_bad_lines='skip')
book_data = book_data[['Title', 'authors', 'categories']] # keep the Title, authors and categories from the columns
book_data.head()

Unnamed: 0,Title,authors,categories
0,Its Only Art If Its Well Hung!,['Julie Strain'],['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion']
3,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction']
4,"Nation Dance: Religion, Identity and Cultural ...",['Edward Long'],


In [3]:
# Function to convert a list into a string
def list_to_string(list):
    return ', '.join(list)

def str_to_list(list_str):
    if isinstance(list_str, str):
        return ast.literal_eval(list_str)
    else:
        return []

# Replace NaN with empty strings
book_data['authors'].fillna('[]', inplace=True)
book_data['categories'].fillna('[]', inplace=True)

# Convert strings that look like lists into actual lists
book_data['authors'] = book_data['authors'].apply(str_to_list)
book_data['categories'] = book_data['categories'].apply(str_to_list)

# Convert the lists into strings
book_data['authors'] = book_data['authors'].apply(list_to_string)
book_data['categories'] = book_data['categories'].apply(list_to_string)

book_data.head()


Unnamed: 0,Title,authors,categories
0,Its Only Art If Its Well Hung!,Julie Strain,Comics & Graphic Novels
1,Dr. Seuss: American Icon,Philip Nel,Biography & Autobiography
2,Wonderful Worship in Smaller Churches,David R. Ray,Religion
3,Whispers of the Wicked Saints,Veronica Haddon,Fiction
4,"Nation Dance: Religion, Identity and Cultural ...",Edward Long,


In [4]:
# consider on user_id, book_id, and ratings
df = pd.read_csv('Books_Amazon.csv')
df = df[['Id','User_id','Title','review/score', 'review/time']]
df.rename(columns={'Id':'ProductId','User_id':'UserId','review/time':'Time','Title':'title','review/score':'Score'},inplace=True)
df.head()

Unnamed: 0,ProductId,UserId,title,Score,Time
0,1882931173,AVCGYZL8FQQTD,Its Only Art If Its Well Hung!,4.0,940636800
1,826414346,A30TK6U7DNS82R,Dr. Seuss: American Icon,5.0,1095724800
2,826414346,A3UH4UZ4RSVO82,Dr. Seuss: American Icon,5.0,1078790400
3,826414346,A2MVUWT453QH61,Dr. Seuss: American Icon,4.0,1090713600
4,826414346,A22X4XUPKF66MR,Dr. Seuss: American Icon,4.0,1107993600


In [5]:
df.shape

(3000000, 5)

In [6]:
# FUSION DES DONNEES
df['title'] = df['title'].str.strip().str.lower()
book_data['Title'] = book_data['Title'].str.strip().str.lower()

# Merge the DataFrames on the titles
df = df.merge(book_data, how='left', left_on='title', right_on='Title')

# Supprimez la colonne des titres en double
df = df.drop(columns=['Title'])


# Data Preprocessing

In [8]:
#lots of fields where user is NaN
df = df.dropna(subset=['UserId'])

In [9]:
print("Size of 'ProductId' column:", len(df['ProductId']))
print("Size of 'UserId' column:", len(df['UserId']))

# Define the threshold values
product_id_threshold = 200 
user_id_threshold = 10

# Count the occurrences of ProductId and UserId
product_id_counts = df['ProductId'].value_counts()
user_id_counts = df['UserId'].value_counts()

# Filter out rows below the threshold
filtered_df = df[(df['ProductId'].isin(product_id_counts[product_id_counts >= product_id_threshold].index)) &
                 (df['UserId'].isin(user_id_counts[user_id_counts >= user_id_threshold].index))]

print("Size of 'ProductId' column:", len(filtered_df['ProductId']))
print("Size of 'UserId' column:", len(filtered_df['UserId']))

Size of 'ProductId' column: 2687258
Size of 'UserId' column: 2687258
Size of 'ProductId' column: 534135
Size of 'UserId' column: 534135


In [10]:
# Get unique UserIds and ProductIds
unique_user_ids = filtered_df['UserId'].unique()
unique_product_ids = filtered_df['ProductId'].unique() #unique ids for books are less

user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
product_id_to_index = {product_id: index for index, product_id in enumerate(unique_product_ids)}

# clean matrix
matrix = np.zeros((len(unique_user_ids), len(unique_product_ids)))

# users as rows, books as columns with their ratings
for _, row in filtered_df.iterrows():
    user_id = row['UserId']
    product_id = row['ProductId']
    score = row['Score']
    
    user_index = user_id_to_index[user_id]
    product_index = product_id_to_index[product_id]
    
    if matrix[user_index][product_index] < score:
        matrix[user_index][product_index] = score
print(matrix.shape)
matrix

(33624, 1809)


array([[5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 5., 0.],
       [0., 0., 0., ..., 0., 5., 0.],
       [0., 0., 0., ..., 0., 5., 0.]])

# Z-Scoring

In [11]:
matrix = stats.zscore(matrix, axis=0)

# Evaluation

Evaluation de la performance des recommandation

In [12]:
def calculate_mse(predicted_matrix, test_matrix):
    num_users = min(predicted_matrix.shape[0], test_matrix.shape[0])
    num_items = min(predicted_matrix.shape[1], test_matrix.shape[1])
    mse = np.mean((predicted_matrix[:num_users, :num_items] - test_matrix[:num_users, :num_items]) ** 2)
    return mse

def calculate_f1_score(recall, precision):
    if recall + precision == 0:
        return 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

def precision_at_k(actual_matrix, predicted_matrix, k, threshold):
    binary_predicted_matrix = predicted_matrix >= threshold
    
    precision = []
    for i in range(len(actual_matrix)):
        actual_indices = np.where(actual_matrix[i] >= threshold)[0]
        predicted_indices = np.argsort(~binary_predicted_matrix[i])[:k]
        common_indices = np.intersect1d(actual_indices, predicted_indices)
        precision.append(len(common_indices) / len(predicted_indices))
    
    return np.mean(precision)

def recall_at_k(true_matrix, pred_matrix, k, threshold):
    pred_matrix_sorted = np.argsort(pred_matrix, axis=1)[:, ::-1][:, :k]
    recall_scores = []
    for i in range(len(true_matrix)):
        true_positives = len(set(pred_matrix_sorted[i]).intersection(set(np.where(true_matrix[i] >= threshold)[0])))
        actual_positives = len(np.where(true_matrix[i] >= threshold)[0])
        if actual_positives > 0:
            recall_scores.append(true_positives / actual_positives)
    recall = np.mean(recall_scores)
    return recall

# SVD

système de recommandation basé sur la factorisation de matrices, en particulier la décomposition en valeurs singulières

In [13]:
def split_train_test(matrix, test_size=0.2, random_state=42):
    train_matrix, test_matrix = train_test_split(matrix, test_size=test_size, random_state=random_state)
    return train_matrix, test_matrix

def calculate_svd(train_matrix, k=600):
    train_sparse = csr_matrix(train_matrix)
    # Perform SVD on the sparse matrix
    U_train, S_train, VT_train = svds(train_sparse, k=k)
    # Reverse the singular values, columns of U_train, and rows of VT_train
    S_train_k = np.diag(S_train[::-1])
    U_train_k = U_train[:, ::-1]
    VT_train_k = VT_train[::-1, :]
    
    return U_train_k, S_train_k, VT_train_k

train_matrix, test_matrix = split_train_test(matrix)

# training set
U_train, S_train, VT_train = calculate_svd(train_matrix)
U_train_pred = np.dot(train_matrix, VT_train.T)
train_pred_matrix = np.dot(U_train_pred, VT_train)

# Make predictions for the test set
U_test_pred = np.dot(test_matrix, VT_train.T)
predicted_matrix = np.dot(U_test_pred, VT_train)

# Calculate MSE 
train_mse = calculate_mse(train_matrix, train_pred_matrix)
test_mse = calculate_mse(test_matrix, predicted_matrix)

print("Train Set Mean Squared Error (MSE):", train_mse)
print("Test Set Mean Squared Error (MSE):", test_mse)

Train Set Mean Squared Error (MSE): 0.10621363095250724
Test Set Mean Squared Error (MSE): 0.16099663293673255


In [14]:
# Calculate Precision at k for the test set
precision = precision_at_k(test_matrix, predicted_matrix, k=10, threshold=3)

# Calculate Recall at k for the test set
recall = recall_at_k(test_matrix, predicted_matrix, k=10, threshold=3)

# Calculate F1 score
f1_score = calculate_f1_score(recall, precision)
print("RMSE (training): ", np.sqrt(train_mse) )
print("RMSE (test): ", np.sqrt(test_mse))
print("Precision @ 10: ", precision)
print("Recall @ 10:", recall)
print("F1 Score:", f1_score)

RMSE (training):  0.32590432791312735
RMSE (test):  0.4012438571950137
Precision @ 10:  0.6477769516728623
Recall @ 10: 0.8558497803434887
F1 Score: 0.7374167404663634


# Output MySQL USER DATA

In [15]:
def get_user_data(user_email):
    # Connect to the database
    engine = create_engine('mysql+pymysql://'+user+':'+mdp+'@localhost:3306/db_master_project') # Change the password accordingly !!!!

    # Load user's read and liked books from the database
    query = f"SELECT * FROM user WHERE email = '{user_email}'"
    user_data = pd.read_sql_query(query, engine)
    
    return user_data
user_data = get_user_data('john.doe@example.com')

In [16]:
def get_user_books(user_email):
    """ Récupère les livres d'un utilisateur à partir de la base de données """
    engine = create_engine('mysql+pymysql://'+user+':'+mdp+'@localhost:3306/db_master_project')
    user_books_query = "SELECT * FROM book WHERE owner = '%s';" % user_email
    user_books = pd.read_sql_query(user_books_query, engine)
    #user_books = user_books[user_books['rating'].between(0, 5)]
    # Prendre en compte la casse et les espaces supplémentaires
    user_books['title'] = user_books['title'].str.strip().str.lower()
    return user_books

user_books = get_user_books('john.doe@example.com')

In [17]:
def recommend_books_for_new_user(user_books, n_recommendations):
    user_matrix = np.zeros(len(unique_product_ids))

    for _, book in user_books.iterrows():
        title = book['title']
        rating = book['rating']
        closest_match = process.extractOne(title, list(product_id_to_index.keys()))
        if closest_match[1] < 80:  # Ignore the match if the similarity score is less than 80
            continue
        closest_title = closest_match[0]
        product_id_values = df[df['title'] == closest_title]['ProductId'].values
        if product_id_values.size == 0:  # The book does not exist in df
            continue
        product_id = product_id_values[0]
        if product_id in product_id_to_index:
            product_index = product_id_to_index[product_id]
            user_matrix[product_index] = rating
    user_embedding = np.dot(user_matrix, VT_train.T)
    similarity_scores = VT_train.T.dot(user_embedding)
    sorted_indices = similarity_scores.argsort()[::-1]
    top_relevant_indices = sorted_indices[:n_recommendations]
    recommended_product_ids = [list(product_id_to_index.keys())[list(product_id_to_index.values()).index(idx)] for idx in top_relevant_indices]
    recommended_books = df.loc[df['ProductId'].isin(recommended_product_ids), 'title'].unique().tolist()
    return recommended_books


In [18]:
# RESULTATS DES RECOMMANDATIONS
user_books = get_user_books('john.doe@example.com')
recommended_books = recommend_books_for_new_user(user_books,10)
print("Recommended Books:")
for book in recommended_books:
    print(book)


Recommended Books:
naked in death
like water for chocolate: a novel in monthly installments, withrecipes, romances, and home remedies
forever amber (the greates historical novels)
a christmas carol (classic fiction)
the reader
journey to the center of the earth
the story of ferdinand
the screwtape letters
the selfish gene
jane eyre


In [21]:
recommended_books

['naked in death',
 'like water for chocolate: a novel in monthly installments, withrecipes, romances, and home remedies',
 'forever amber (the greates historical novels)',
 'a christmas carol (classic fiction)',
 'the reader',
 'journey to the center of the earth',
 'the story of ferdinand',
 'the screwtape letters',
 'the selfish gene',
 'jane eyre']

N'hésitez pas à contacter à toute heure de la journée ou de la nuit Logan en cas de problème: logan.le.lay@efrei.net

## TEST SUR UTILISATEUR DEJA PRÉSENT

In [19]:
def fetch_relevant_items_for_user(user_id, relevant_items=5):
    # Get the index of the user
    user_index = user_id_to_index[user_id]
    user_embedding = U_train[user_index, :]
    
    similarity_scores = VT_train.T.dot(user_embedding)

    sorted_indices = similarity_scores.argsort()[::-1]
    top_relevant_indices = sorted_indices[:relevant_items]
    
    relevant_items = [list(product_id_to_index.keys())[list(product_id_to_index.values()).index(idx)] for idx in top_relevant_indices]
    relevant_titles = df.loc[df['ProductId'].isin(relevant_items), 'title'].tolist()
    
    # Remove any duplicate titles
    unique_relevant_titles = list(set(relevant_titles))
    
    # Get the final set of relevant items without duplicate titles
    final_relevant_items = []
    for title in unique_relevant_titles:
        final_relevant_items.append(title)
    
    return final_relevant_items

In [20]:
"""
user_id = ["A30C4HNZBZYDI2", "A3OWUSU9RG4NMF", "A2BIFGERNRDLBB"] 
top_n = 5

for id in user_id:
    relevant_items = fetch_relevant_items_for_user(id, top_n)
    print(f"User: {id}")
    print("Relevant Items:")
    for i, item in enumerate(relevant_items):
        print(f"{i+1}. {item}")
    print()
"""

User: A30C4HNZBZYDI2
Relevant Items:
1. the lord of the rings trilogy: three volumes in slipcase
2. the lord of the rings three volume boxed set (the fellowship of the ring, the return of the king, the two towers)
3. the lord of the rings (3 volume set)
4. the lord of the rings trilogy (the fellowship of the ring, the two towers, the return of the king, i, ii, iii)
5. the lord of the rings - boxed set

User: A3OWUSU9RG4NMF
Relevant Items:
1. the new drawing on the right side of the brain
2. bridge to terabithia
3. mythology
4. his dark materials trilogy

User: A2BIFGERNRDLBB
Relevant Items:
1. bringing down the house: the inside story of six m.i.t. students who took vegas for millions
2. slaughter-house five
3. slaughterhouse-five: or, the children's crusade : a duty-dance with death
4. slaughterhouse-five : or the children's crusade
5. slaughterhouse-five

