In [7]:
import tensorflow as tf
import pandas as pd
import numpy as np

# Dataset description:

### 1. 3 Data files contain anonymous ratings data from 73,421 users.

### 2.   Data files are in .zip format, when unzipped, they are in Excel (.xls) format
### 3.   Ratings are real values ranging from -10.00 to +10.00 (the value "99" corresponds to "null" = "not rated").
### 4. One row per user
### 5. The first column gives the number of jokes rated by that user. The next 100 columns give the ratings for jokes 01 - 100.
### 6. The sub-matrix including only columns {5, 7, 8, 13, 15, 16, 17, 18, 19, 20} is dense. Almost all users have rated those jokes (see discussion of "universal queries" in the above paper).









# Étape 1 : Chargement et préparation des données

In [9]:
# Charger le fichier Excel Jester
file_path = "/content/jester-data-1.xls"  # Remplacez par le chemin réel
jester_data = pd.read_excel(file_path, header=None)

# Afficher les dimensions et un aperçu des données
print("Dimensions initiales du dataset :", jester_data.shape)
print(jester_data.head())

# Filtrer les colonnes pertinentes (5, 7, 8, 13, 15-20) et conserver uniquement les évaluations
columns_dense = [4, 6, 7, 12, 14, 15, 16, 17, 18, 19]  # Index de colonnes 0-based Les colonnes 5, 7, 8, 13, 15-20 ont été identifiées comme étant denses, ce qui signifie que presque tous les utilisateurs ont donné des évaluations dans ces colonnes.
jester_data_dense = jester_data.iloc[:, columns_dense]

# Remplacer les évaluations non notées (99) par NaN pour traitement
jester_data_dense.replace(99, np.nan, inplace=True)

# Normaliser les évaluations entre 0 et 1 pour le modèle
jester_data_dense = (jester_data_dense + 10) / 20

# Afficher un aperçu des données transformées
print("Aperçu des données après filtrage et normalisation :")
jester_data_dense.head()

Dimensions initiales du dataset : (24983, 101)
   0      1      2      3      4     5     6     7     8      9    ...    91   \
0   74  -7.82   8.79  -9.66  -8.16 -7.52 -8.50 -9.85  4.17  -8.98  ...   2.82   
1  100   4.08  -0.29   6.36   4.37 -2.38 -9.66 -0.73 -5.34   8.88  ...   2.82   
2   49  99.00  99.00  99.00  99.00  9.03  9.27  9.03  9.27  99.00  ...  99.00   
3   48  99.00   8.35  99.00  99.00  1.80  8.16 -2.82  6.21  99.00  ...  99.00   
4   91   8.50   4.61  -4.17  -5.39  1.36  1.60  7.04  4.61  -0.44  ...   5.19   

     92     93     94     95     96     97     98     99     100  
0  99.00  99.00  99.00  99.00  99.00  -5.63  99.00  99.00  99.00  
1  -4.95  -0.29   7.86  -0.19  -2.14   3.06   0.34  -4.32   1.07  
2  99.00  99.00   9.08  99.00  99.00  99.00  99.00  99.00  99.00  
3  99.00  99.00   0.53  99.00  99.00  99.00  99.00  99.00  99.00  
4   5.58   4.27   5.19   5.73   1.55   3.11   6.55   1.80   1.60  

[5 rows x 101 columns]
Aperçu des données après filtrage et nor

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jester_data_dense.replace(99, np.nan, inplace=True)


Unnamed: 0,4,6,7,12,14,15,16,17,18,19
0,0.092,0.075,0.0075,0.1625,0.9225,0.141,0.124,0.1285,0.0095,0.0075
1,0.7185,0.017,0.4635,0.932,0.8715,0.728,0.4515,0.733,0.466,0.665
2,,0.9635,0.9515,0.8785,0.8085,0.182,0.1555,0.107,0.9515,0.9515
3,,0.908,0.359,0.83,0.9055,0.1385,0.1675,0.5585,0.17,0.318
4,0.2305,0.58,0.852,0.842,0.8615,0.3835,0.017,0.636,0.432,0.6285


# Étape 2 : Division des données
### Nous allons diviser les données en ensembles d'entraînement et de test. Les valeurs manquantes (NaN) seront remplacées par 0, mais le modèle doit gérer ces cas correctement.

In [10]:
# Diviser les données en ensembles d'entraînement et de test
from sklearn.model_selection import train_test_split

# Remplacer les NaN par 0 pour créer des matrices d'entrée
jester_data_dense_filled = jester_data_dense.fillna(0)

# Diviser les données en 80% pour entraînement et 20% pour test
train_data, test_data = train_test_split(jester_data_dense_filled, test_size=0.2, random_state=42)

# Conversion en matrices numpy
train_data_matrix = train_data.values
test_data_matrix = test_data.values

# Afficher les dimensions des ensembles
print("Dimensions de la matrice d'entraînement :", train_data_matrix.shape)
print("Dimensions de la matrice de test :", test_data_matrix.shape)

Dimensions de la matrice d'entraînement : (19986, 10)
Dimensions de la matrice de test : (4997, 10)


# Étape 3 : Modélisation avec un Autoencodeur
### Nous utiliserons un autoencodeur simple pour modéliser les évaluations des utilisateurs.

In [11]:
# Définir les dimensions de l'autoencodeur
input_dim = train_data_matrix.shape[1]  # Nombre de blagues sélectionnées
encoding_dim = 8  # Taille de la couche encodée (paramètre ajustable)

# Construire l'architecture de l'autoencodeur
input_layer = tf.keras.layers.Input(shape=(input_dim,))
encoded = tf.keras.layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = tf.keras.layers.Dense(input_dim, activation='sigmoid')(encoded)

# Compiler le modèle
autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [12]:
# Entraîner l'autoencodeur
autoencoder.fit(train_data_matrix, train_data_matrix,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(test_data_matrix, test_data_matrix))

Epoch 1/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0861 - val_loss: 0.0767
Epoch 2/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0745 - val_loss: 0.0705
Epoch 3/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0686 - val_loss: 0.0646
Epoch 4/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0623 - val_loss: 0.0584
Epoch 5/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0566 - val_loss: 0.0530
Epoch 6/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0512 - val_loss: 0.0484
Epoch 7/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0465 - val_loss: 0.0412
Epoch 8/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0397 - val_loss: 0.0361
Epoch 9/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7e18b06ad150>

# Étape 4 : Générer des recommandations
### Nous allons prédire les évaluations et recommander des blagues non notées.

In [13]:
# Prédire les évaluations des utilisateurs
predicted_ratings = autoencoder.predict(test_data_matrix)

# Fonction pour recommander des blagues
def recommend_jokes(user_index, predicted_ratings, original_ratings, num_recommendations=5):
    user_ratings = predicted_ratings[user_index]
    unseen_jokes = np.where(original_ratings[user_index] == 0)[0]
    recommended_indices = unseen_jokes[np.argsort(user_ratings[unseen_jokes])[::-1]]
    return recommended_indices[:num_recommendations]

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [34]:
# Exemple : Recommander 5 blagues pour l'utilisateur 0
recommendations = recommend_jokes(0, predicted_ratings, test_data_matrix, num_recommendations=5)
print("Blagues recommandées pour l'utilisateur 0 :", recommendations)

Blagues recommandées pour l'utilisateur 0 : [1 0]


In [35]:
import zipfile

# Spécifier le fichier ZIP et le dossier de destination
zip_file = '/content/jester_dataset_1_joke_texts.zip'
destination = '/content/jester_dataset_1_joke_texts'

# Ouvrir le fichier ZIP et extraire son contenu
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(destination)
    print("Fichier décompressé avec succès!")

Fichier décompressé avec succès!


In [36]:
import os

# Chemin vers le dossier contenant les fichiers HTML des blagues
jokes_folder = "/content/jester_dataset_1_joke_texts/jokes"

# Charger les blagues dans une liste
jokes_texts = []
for i in range(1, 101):  # Les fichiers vont de init1.html à init100.html
    file_path = os.path.join(jokes_folder, f"init{i}.html")
    with open(file_path, "r", encoding="utf-8") as f:
        jokes_texts.append(f.read().strip())

In [37]:
# Afficher les textes des recommandations
print("Textes des blagues recommandées :")
for joke_id in recommendations:
    print(f"Blague {joke_id + 1}:")
    print(jokes_texts[joke_id])
    print("-" * 50)

Textes des blagues recommandées :
Blague 2:
<HTML>
<HEAD>
    <TITLE>Joke 2 of 25</TITLE>
</HEAD>

<BODY TEXT="black" BGCOLOR="#fddf84">

<TABLE WIDTH="620" CELLSPACING="0" CELLPADDING="0">

<TD Width="130"> </TD>
<TD WIDTH="470">
<BR><CENTER><FONT color="red" size="+2">
</Font></center>
<UL> <Font Size="+1"> <BR>
<!--begin of joke -->
This couple had an excellent relationship going until one day he came home
from work to find his girlfriend packing. He asked her why she was leaving him
and she told him that she had heard awful things about him. 
<P>
"What could they possibly have said to make you move out?" 
<P>
"They told me that you were a pedophile." 
<P>
He replied, "That's an awfully big word for a ten year old." 
<!--end of joke -->
<CENTER><TABLE COLS=3 WIDTH="100%">


</TABLE></CENTER>





</TD></TR></TABLE>
</BODY>
</HTML>
--------------------------------------------------
Blague 1:
<HTML>
<HEAD>
<TITLE>Joke 1 of 25</TITLE>
</HEAD>

<BODY TEXT="black" BGCOLOR="#fddf84">

<ce

In [54]:
from bs4 import BeautifulSoup, Comment
import os

def extract_jokes(jokes_folder):
    jokes_texts = []

    for i in range(1, 101):
        file_path = os.path.join(jokes_folder, f"init{i}.html")

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                html_content = f.read().strip()

            # Parser le contenu HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Trouver les commentaires qui délimitent la blague
            comments = soup.find_all(string=lambda text: isinstance(text, Comment))
            joke_text = ""

            start_found = False
            for idx, comment in enumerate(comments):
                if "begin of joke" in comment:
                    start_found = True
                    start_comment = comment
                    end_comment = None

                    # Chercher le commentaire de fin correspondant
                    for potential_end in comments[idx+1:]:
                        if "end of joke" in potential_end:
                            end_comment = potential_end
                            break

                    if start_found and end_comment:
                        # Extraire tout le texte entre les commentaires
                        current = start_comment.next_element
                        while current and current != end_comment:
                            if isinstance(current, str) and current.strip():
                                joke_text += current.strip() + " "
                            current = current.next_element

                    break

            jokes_texts.append(joke_text.strip())

        except Exception as e:
            print(f"Erreur lors du traitement du fichier {i}: {str(e)}")
            jokes_texts.append("")

    return jokes_texts

In [55]:
# Utilisation de la fonction
jokes_folder = "/content/jester_dataset_1_joke_texts/jokes"
jokes_texts = extract_jokes(jokes_folder)

# Affichage des blagues
def print_jokes(jokes_texts, recommendations):
    print("Textes des blagues recommandées :")
    for joke_id in recommendations:
        joke = jokes_texts[joke_id]
        if joke:
            print(f"Blague {joke_id + 1}:")
            print(joke)
            print("-" * 50)
        else:
            print(f"Blague {joke_id + 1}: Aucun texte extrait.")

In [56]:
print_jokes(jokes_texts, recommendations)

Textes des blagues recommandées :
Blague 2:
This couple had an excellent relationship going until one day he came home
from work to find his girlfriend packing. He asked her why she was leaving him
and she told him that she had heard awful things about him. "What could they possibly have said to make you move out?" "They told me that you were a pedophile." He replied, "That's an awfully big word for a ten year old."
--------------------------------------------------
Blague 1:
A man visits the doctor. The doctor says "I have bad news for you.You have
cancer and Alzheimer's disease". The man replies "Well,thank God I don't have cancer!"
--------------------------------------------------


In [85]:
recommendations = recommend_jokes(99, predicted_ratings, test_data_matrix, num_recommendations=5)
print("Blagues recommandées pour l'utilisateur  :", recommendations)
print_jokes(jokes_texts, recommendations)

Blagues recommandées pour l'utilisateur  : []
Textes des blagues recommandées :


In [88]:
recommendations = recommend_jokes(203, predicted_ratings, test_data_matrix, num_recommendations=5)
print("Blagues recommandées pour l'utilisateur  :", recommendations)
print_jokes(jokes_texts, recommendations)

Blagues recommandées pour l'utilisateur  : []
Textes des blagues recommandées :
