# Dataset description:

### 1. Three data files contain anonymous ratings data from 73,421 users.

### 2.   Data files are in .zip format; when unzipped, they are in Excel (.xls) format.
### 3.   Ratings are real values ranging from -10.00 to +10.00 (the value "99" corresponds to "null" = "not rated").
### 4. One row per user
### 5. The first column gives the number of jokes rated by that user. The next 100 columns give the ratings for jokes 01 - 100.
### 6. The sub-matrix including only columns {5, 7, 8, 13, 15, 16, 17, 18, 19, 20} is dense. Almost all users have rated those jokes (see discussion of "universal queries" in the above paper).









# Étape 1 : Chargement et préparation des données

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np

In [None]:
# Load the raw ratings table. header=None makes pandas assign integer
# column labels (0, 1, 2, ...) instead of reading them from the first row.
RATINGS_CSV_PATH = "/content/all_ratings.csv"
ratings = pd.read_csv(RATINGS_CSV_PATH, header=None)
ratings.head()

  ratings = pd.read_csv("/content/all_ratings.csv", header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,359,360,361,362,363,364,365,366,367,368
0,74.0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,99.147,99.148,99.149,99.15,99.151,99.152,99.153,99.154,99.155,99.156
1,100.0,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,,,,,,,,,,
2,49.0,,,,,9.03,9.27,9.03,9.27,,...,,,,,,,,,,
3,48.0,,8.35,,,1.8,8.16,-2.82,6.21,,...,,,,,,,,,,
4,91.0,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,,,,,,,,,,


In [None]:
# Load the joke catalogue; column names come from the CSV's first row.
JOKES_CSV_PATH = "/content/all_jokes.csv"
jokes = pd.read_csv(JOKES_CSV_PATH)
jokes.head()

Unnamed: 0,joke_id,text,dataset,removed,gauge_set
0,1,"A man visits the doctor. The doctor says ""I ha...",1,True,False
1,2,This couple had an excellent relationship goin...,1,True,False
2,3,Q. What's 200 feet long and has 4 teeth? A. Th...,1,True,False
3,4,Q. What's the difference between a man and a t...,1,True,False
4,5,Q.\tWhat's O. J. Simpson's Internet address? A...,1,True,False


# Étape 2 : Nettoyage

In [None]:
# Keep a single row per joke_id, then retain only the identifier and the
# joke text (the other columns of the jokes CSV are dropped).
jokes = (
    jokes
    .drop_duplicates(subset='joke_id')
    .loc[:, ['joke_id', 'text']]
)

# Étape 3 : Extraction de la matrice dense

In [None]:
# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN.
ratings = ratings.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Column positions of the dense sub-matrix (the jokes listed as rated by
# almost all users in the dataset description).
dense_columns = [5, 7, 8, 13, 15, 16, 17, 18, 19, 20]

# Select the dense columns by position.
dense_ratings = ratings.iloc[:, dense_columns]

# Valid ratings lie in [-10, 10]; the dataset uses 99 as a "not rated"
# sentinel. Mask anything outside the valid range as missing BEFORE scaling,
# otherwise a single sentinel would become the column maximum and compress
# every real rating towards 0. Existing NaN values stay NaN.
dense_ratings = dense_ratings.where(dense_ratings.abs() <= 10)

# Min-max scale each column independently to [0, 1] (NaN is ignored by min/max).
dense_ratings = dense_ratings.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

In [None]:
# Preview the first five rows of the scaled dense ratings.
dense_ratings.head(n=5)

Unnamed: 0,5,7,8,13,15,16,17,18,19,20
0,0.122976,0.005112,0.716388,0.143375,,,0.130435,0.007085,,
1,0.383097,0.47137,0.233891,0.743789,0.730982,0.464803,0.756211,0.46913,0.679139,0.452381
2,0.960526,0.970348,0.97514,1.0,0.180856,0.158385,0.108178,0.960526,0.972834,0.982402
3,0.594636,0.364519,0.819888,0.841615,0.137028,0.170807,0.575569,0.169534,0.323424,0.406832
4,0.572368,0.868609,0.738711,0.311594,0.383879,0.01501,0.655797,0.434717,0.641722,0.748447


In [None]:
# Replace missing ratings with 0 so the table contains no NaN.
# NOTE(review): after min-max scaling, 0 is also the value of the lowest real
# rating in each column, so "not rated" and "lowest rating" become
# indistinguishable from here on.
dense_ratings = dense_ratings.fillna(value=0)
dense_ratings.head(n=5)

Unnamed: 0,5,7,8,13,15,16,17,18,19,20
0,0.122976,0.005112,0.716388,0.143375,0.0,0.0,0.130435,0.007085,0.0,0.0
1,0.383097,0.47137,0.233891,0.743789,0.730982,0.464803,0.756211,0.46913,0.679139,0.452381
2,0.960526,0.970348,0.97514,1.0,0.180856,0.158385,0.108178,0.960526,0.972834,0.982402
3,0.594636,0.364519,0.819888,0.841615,0.137028,0.170807,0.575569,0.169534,0.323424,0.406832
4,0.572368,0.868609,0.738711,0.311594,0.383879,0.01501,0.655797,0.434717,0.641722,0.748447


# Étape 2 : Division des données
### Nous allons diviser les données en ensembles d'entraînement et de test. Les valeurs manquantes (NaN) seront remplacées par 0, mais le modèle doit gérer ces cas correctement.

In [None]:
# Convert the scaled ratings frame to a NumPy array
# (one row per ratings row, one column per dense joke column).
user_joke_matrix = dense_ratings.to_numpy()
print("Matrice utilisateur-blague :", user_joke_matrix, sep="\n")

Matrice utilisateur-blague :
[[0.12297571 0.00511247 0.71638762 ... 0.00708502 0.         0.        ]
 [0.38309717 0.47137014 0.23389143 ... 0.46912955 0.6791389  0.45238095]
 [0.96052632 0.97034765 0.97513952 ... 0.96052632 0.97283444 0.98240166]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Hold out a fixed fraction of the rows for validation; the seed makes the
# split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    user_joke_matrix,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Report the shape of each split.
print(f"Taille de l'ensemble d'entraînement : {train_data.shape}")
print(f"Taille de l'ensemble de test : {test_data.shape}")

# Report the container type of each split.
print(f"Type de train_data : {type(train_data)}")
print(f"Type de test_data : {type(test_data)}")

Taille de l'ensemble d'entraînement : (108816, 10)
Taille de l'ensemble de test : (27205, 10)
Type de train_data : <class 'numpy.ndarray'>
Type de test_data : <class 'numpy.ndarray'>


# Étape 3 : Modélisation avec un Autoencodeur
### Nous utiliserons un autoencodeur simple pour modéliser les évaluations des utilisateurs.

In [None]:
# Layer sizes: one input unit per column of the user-joke matrix.
input_dim = user_joke_matrix.shape[1]
# NOTE(review): the bottleneck is wider than the input when only the 10 dense
# columns are used, so the network can learn a near-identity mapping —
# confirm this is intended.
encoding_dim = 50

# Autoencoder architecture: input -> ReLU encoding -> sigmoid reconstruction
# (sigmoid matches the [0, 1] range of the scaled ratings).
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

# Full autoencoder: reconstructs its input.
autoencoder = models.Model(inputs=input_layer, outputs=decoded)

# Encoder-only sub-model. It reuses the same layer objects as the autoencoder,
# so it reflects whatever weights the autoencoder has been trained to. Without
# this definition the later `encoder.predict(...)` cell raises NameError on a
# fresh kernel.
encoder = models.Model(inputs=input_layer, outputs=encoded)

# Compile the autoencoder for reconstruction with mean squared error.
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Print the layer-by-layer summary of the autoencoder model.
autoencoder.summary()

# Étape 4 : Entraînement

In [None]:
# Train the autoencoder to reconstruct its own input (x == y); the held-out
# split is passed as validation data and reconstructed the same way.
autoencoder.fit(
    x=train_data,
    y=train_data,
    validation_data=(test_data, test_data),
    epochs=50,
    batch_size=256,
    shuffle=True,
)

Epoch 1/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1231 - val_loss: 0.0054
Epoch 2/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0038 - val_loss: 0.0012
Epoch 3/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 9.1053e-04 - val_loss: 5.6358e-04
Epoch 4/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 5.2919e-04 - val_loss: 4.6652e-04
Epoch 5/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.4811e-04 - val_loss: 3.9873e-04
Epoch 6/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3.7926e-04 - val_loss: 3.5642e-04
Epoch 7/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3.4424e-04 - val_loss: 3.2403e-04
Epoch 8/50
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3.1376e-04 - val_loss: 2.9

<keras.src.callbacks.history.History at 0x7b29ad41e530>

# Étape 5 : Générer des recommandations
### Nous allons prédire les évaluations et recommander des blagues non notées.

In [None]:
# Run every row of the user-joke matrix through the encoder to obtain the
# encoded representation of each user.
encoded_users = encoder.predict(x=user_joke_matrix)

[1m4251/4251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommander_blagues(user_id, user_joke_matrix, encoded_users, top_n=5):
    """
    Rank the jokes a user has not rated by a similarity-weighted average score.

    Each unrated joke column is scored as the average of ALL users' values in
    that column, weighted by the cosine similarity between the target user's
    encoded vector and every user's encoded vector.

    :param user_id: Row index of the target user in ``user_joke_matrix`` and
        ``encoded_users``.
    :param user_joke_matrix: 2-D NumPy array of ratings (rows = users,
        columns = jokes). A value of exactly 0 is treated as "not rated".
    :param encoded_users: 2-D array of encoded user vectors, row-aligned with
        ``user_joke_matrix``.
    :param top_n: Maximum number of jokes to return (``None`` returns all
        candidates; values <= 0 return an empty list).
    :return: List of at most ``top_n`` COLUMN INDICES of ``user_joke_matrix``,
        best score first. These are positions in the matrix, not joke
        identifiers; the caller must map them to real joke IDs. The list is
        shorter than ``top_n`` when the user has fewer unrated columns.
    """
    # Columns the target user has not rated (0 is the "not rated" marker).
    user_ratings = user_joke_matrix[user_id]
    non_rated_jokes = np.where(user_ratings == 0)[0]
    if non_rated_jokes.size == 0 or (top_n is not None and top_n <= 0):
        return []

    # Similarity between the target user and every user; computed once because
    # it does not depend on the joke being scored.
    user_encoded = encoded_users[user_id].reshape(1, -1)
    similar_users = cosine_similarity(user_encoded, encoded_users).ravel()

    # Weighted average per candidate column. When the weights sum to zero
    # (e.g. an all-zero encoding) the average is undefined; score every
    # candidate 0 instead of producing NaN/inf, which would make the ranking
    # arbitrary.
    total_weight = np.sum(similar_users)
    if np.isclose(total_weight, 0.0):
        predicted_scores = np.zeros(non_rated_jokes.size)
    else:
        candidate_ratings = user_joke_matrix[:, non_rated_jokes]
        predicted_scores = (similar_users @ candidate_ratings) / total_weight

    # Descending by score; a stable sort keeps ties in column order.
    ranking = np.argsort(-predicted_scores, kind="stable")
    return non_rated_jokes[ranking[:top_n]].tolist()

In [None]:
# Example: recommend up to 5 jokes for the user at row 0.
user_id = 0
# recommander_blagues returns column positions of user_joke_matrix (0..9),
# not joke identifiers. Translate each position to the real joke ID through
# dense_columns so the IDs match the 'joke_id' column of the jokes table.
recommended_positions_0 = recommander_blagues(user_id, user_joke_matrix, encoded_users, top_n=5)
recommended_jokes_0 = [dense_columns[position] for position in recommended_positions_0]

print(f"Blagues recommandées pour l'utilisateur {user_id} : {recommended_jokes_0}")

Blagues recommandées pour l'utilisateur 0 : [8, 9, 4, 5]


In [None]:
def afficher_blagues_recommandees(recommended_jokes, jokes_df):
    """
    Print the text of each recommended joke, in recommendation order.

    :param recommended_jokes: Iterable of joke IDs, best recommendation first.
    :param jokes_df: DataFrame with at least the columns 'joke_id' and 'text'.
    :return: None. IDs that do not appear in ``jokes_df`` are skipped silently.
    """
    # Iterate over the recommendations (not over the DataFrame) so the printed
    # order follows the ranking. dict.fromkeys drops repeated IDs while keeping
    # first-seen order.
    for joke_id in dict.fromkeys(recommended_jokes):
        matching_rows = jokes_df[jokes_df['joke_id'] == joke_id]
        for _, row in matching_rows.iterrows():
            print(f"Blague ID {row['joke_id']} : {row['text']}")

In [None]:
# Example of the expected input: a list of joke IDs
# recommended_jokes = [1, 5, 12, 23, 34]

# Print the text of the jokes recommended for user 0
afficher_blagues_recommandees(recommended_jokes_0, jokes)

Blague ID 4 : Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it.
Blague ID 5 : Q.	What's O. J. Simpson's Internet address? A.	Slash, slash, backslash, slash, slash, escape.
Blague ID 8 : Q. Did you hear about the dyslexic devil worshiper? A. He sold his soul to Santa.
Blague ID 9 : A country guy goes into a city bar that has a dress code, and the maitre
d' 
demands he wear a tie. Discouraged, the guy goes to his car to sulk when 
inspiration strikes: He's got jumper cables in the trunk! So he wraps
them around his neck, sort of like a string tie (a bulky string tie to be
sure) and returns to the bar. The maitre d' is reluctant, but says to the
guy, "Okay, you're a pretty resourceful fellow, you can come in... but
just don't start anything"!


In [None]:
# Example: recommend up to 5 jokes for the user at row 124.
user_id = 124

# recommander_blagues returns column positions of user_joke_matrix (0..9),
# not joke identifiers. Translate each position to the real joke ID through
# dense_columns before looking the jokes up in the jokes table.
recommended_positions_124 = recommander_blagues(user_id, user_joke_matrix, encoded_users, top_n=5)
recommended_jokes_124 = [dense_columns[position] for position in recommended_positions_124]

print(f"Blagues recommandées pour l'utilisateur {user_id} : {recommended_jokes_124}")
afficher_blagues_recommandees(recommended_jokes_124, jokes)

Blagues recommandées pour l'utilisateur 124 : [1, 9, 4]
Blague ID 1 : A man visits the doctor. The doctor says "I have bad news for you.You have
cancer and Alzheimer's disease". The man replies "Well,thank God I don't have cancer!"
Blague ID 4 : Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it.
Blague ID 9 : A country guy goes into a city bar that has a dress code, and the maitre
d' 
demands he wear a tie. Discouraged, the guy goes to his car to sulk when 
inspiration strikes: He's got jumper cables in the trunk! So he wraps
them around his neck, sort of like a string tie (a bulky string tie to be
sure) and returns to the bar. The maitre d' is reluctant, but says to the
guy, "Okay, you're a pretty resourceful fellow, you can come in... but
just don't start anything"!
