# Projet DataMining

## Description du projet
L'objectif de ce projet est de recommander des images de pokémons en fonction des préférences de l'utilisateur. 

## Installation des librairies


In [None]:
!pip install -r requirements.txt

## Installation des bases de données

In [None]:
import os
import kaggle
import shutil

In [None]:

# Authenticate against the Kaggle API, then download and unzip each dataset
# into its target directory.
kaggle.api.authenticate()
for dataset_slug, target_dir in (
    ('kvpratama/pokemon-images-dataset', '.'),
    ('abcsds/pokemon', './data_csv'),
    ('avi1023/color-names', './data_csv'),
):
    kaggle.api.dataset_download_files(
        dataset_slug,
        path=target_dir,
        unzip=True,
        quiet=False,
        force=False,
    )

In [None]:
# Delete the ./pokemon directory tree.
# ignore_errors=True keeps this cell re-runnable: a second run (directory
# already gone) no longer raises FileNotFoundError.
shutil.rmtree('./pokemon', ignore_errors=True)

# Création de la base de données

## I. Création de la base de données de pokémons

In [None]:
from PIL import Image
import os
import json
from datetime import datetime
import numpy as np

In [None]:
# Directory that contains the image files.
img_dir = "./pokemon_jpg/pokemon_jpg"

# Empty containers: metadata of every image, and colour data.
all_metadata, color_data = {}, {}

In [None]:
# Walk through every file of the image directory and record its metadata in
# all_metadata, keyed by the bare file name (e.g. "1.jpg").
for img_filename in os.listdir(img_dir):
    # Only .jpg / .png files are images we care about.
    if not img_filename.endswith((".jpg", ".png")):
        continue

    # Keep only files named after a number: 1.jpg, 2.jpg, 3.jpg, ...
    image_stem = img_filename.split(".")[0]
    if not image_stem.isdigit():
        continue

    # Build the full path to the image file.
    img_path = os.path.join(img_dir, img_filename)

    # Open the image just long enough to read its format and size.
    with Image.open(img_path) as img:
        img_format = img.format
        img_size = img.size

    # Width strictly greater than height -> landscape; square images count as portrait.
    img_orientation = "landscape" if img_size[0] > img_size[1] else "portrait"

    # NOTE: os.path.getctime is the creation time on Windows but the last
    # metadata-change time on Unix.
    creation_date = datetime.fromtimestamp(os.path.getctime(img_path)).strftime('%d/%m/%Y')

    # Metadata record of this image. The id comes from the file name itself
    # (as returned by os.listdir) instead of splitting img.filename on "\\",
    # which only worked with Windows path separators and raised
    # ValueError (int("")) everywhere else.
    metadata = {
        "id": int(image_stem),
        "format": img_format,
        "size": img_size,
        "orientation": img_orientation,
        "creation_date": creation_date,
        "tags": ""
    }

    # Store the record under the bare file name.
    all_metadata[img_filename] = metadata

In [None]:
# Serialise the collected metadata (4-space indentation) and write it to database.json.
database_json = json.dumps(all_metadata, indent=4)
with open('database.json', "w") as f:
    f.write(database_json)

## II. Création de la base de données de couleurs

In [None]:
import pandas as pd
import json

In [None]:
# Load the colour reference CSV.
df = pd.read_csv("./data_csv/color_names.csv", sep=',', header=0)

# Keep only the colour name and its 24-bit hex code, renaming the latter to "Hex".
# rename() returns a new frame, so the raw frame `df` is never mutated and
# pandas has no reason to emit a SettingWithCopyWarning.
df_selected = (
    df.loc[:, ["Name", "Hex (24 bit)"]]
    .rename(columns={"Hex (24 bit)": "Hex"})
)

# Drop the "#" prefix (plain-text replacement, not a regular expression).
df_selected["Hex"] = df_selected["Hex"].str.replace("#", "", regex=False)

# Convert to a list of records and write it once, pretty-printed
# (the original wrote the file, read it back and wrote it again).
# `data` still holds the parsed records afterwards, as before.
data = json.loads(df_selected.to_json(orient='records'))
with open('color_names.json', 'w') as f:
    json.dump(data, f, indent=4)

## III. Ajout des couleurs aux pokémons

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Compute the dominant colour of every numerically named image of the
# directory and store it in color_data, keyed by the file stem ("1", "2", ...).
color_data = {}
img_dir = r"./pokemon_jpg/pokemon_jpg"
for img_filename in os.listdir(img_dir):
    # Only .jpg / .png files are considered.
    if not img_filename.endswith((".jpg", ".png")):
        continue

    # Skip files that are not named like 1.jpg, 2.jpg, 3.jpg, ...
    image_stem = img_filename.split(".")[0]
    if not image_stem.isdigit():
        continue

    # Build the full path to the image file.
    img_path = os.path.join(img_dir, img_filename)

    with Image.open(img_path) as img:
        # Force three channels: RGBA, palette or greyscale images would
        # otherwise break (or silently scramble) the reshape to (-1, 3) below.
        pixel_matrix = np.array(img.convert("RGB"))

    # One row per pixel, columns are R, G, B.
    pixel_data = pixel_matrix.reshape((-1, 3))

    # Cluster the pixels and keep the centre of the most populated cluster.
    kmeans = MiniBatchKMeans(n_clusters=4, random_state=0).fit(pixel_data)
    # bincount with minlength keeps position i == cluster i even when a
    # cluster received no pixel (np.unique would drop it and shift the index).
    cluster_sizes = np.bincount(kmeans.labels_, minlength=kmeans.n_clusters)
    main_color = kmeans.cluster_centers_[np.argmax(cluster_sizes)]
    hex_value = "{0:02X}{1:02X}{2:02X}".format(int(main_color[0]), int(main_color[1]), int(main_color[2]))

    # Colour record of this image; "nom couleur" holds the hex code at this stage.
    color = {
        "id": int(image_stem),
        "couleur dominante": main_color.tolist(),
        "nom couleur": hex_value
    }

    # Register the record under the file stem.
    color_data[image_stem] = color


In [None]:
# Serialise the colour records (4-space indentation) and write them to color_data.json.
color_data_json = json.dumps(color_data, indent=4)
with open('color_data.json', 'w') as f:
    f.write(color_data_json)

In [None]:
# Replace the hex code stored in "nom couleur" by the name of the closest
# reference colour from color_names.json.
#
# The previous implementation sliced the hex strings with [1:2] / [2:4] / [4:]
# (dropping the first digit of the red channel) and let ANY colour sharing a
# single channel value overwrite an earlier, better match. The closest colour
# in RGB space is used instead; an exact match has distance 0 and always wins.


def _hex_to_rgb(hex_value):
    """Convert a six-digit hex colour string (optional leading '#') to an (R, G, B) tuple of ints."""
    digits = hex_value.lstrip('#')
    return tuple(int(digits[start:start + 2], 16) for start in (0, 2, 4))


def _closest_color_name(rgb, named_colors):
    """Return the name of the entry of `named_colors` nearest to `rgb`.

    `named_colors` is an iterable of (name, (r, g, b)) pairs. Distance is the
    squared Euclidean distance in RGB space; the first entry wins ties.
    Returns None when `named_colors` is empty.
    """
    best_name, best_distance = None, None
    for name, reference in named_colors:
        distance = sum((a - b) ** 2 for a, b in zip(rgb, reference))
        if best_distance is None or distance < best_distance:
            best_name, best_distance = name, distance
    return best_name


with open('color_names.json', 'r') as f:
    color_names = json.load(f)

with open('color_data.json', 'r') as f:
    color_data = json.load(f)

# Parse the reference palette once instead of once per pokémon.
named_colors = [(color['Name'], _hex_to_rgb(color['Hex'])) for color in color_names]

for pokemon in color_data:
    # Start from the stored RGB triple rather than from "nom couleur": the
    # cell can then be re-run after that field has already become a name.
    # int() truncation matches how the hex code was derived.
    dominant_rgb = tuple(int(channel) for channel in color_data[pokemon]['couleur dominante'])
    closest_name = _closest_color_name(dominant_rgb, named_colors)
    if closest_name is not None:
        color_data[pokemon]['nom couleur'] = closest_name

with open('color_data.json', 'w') as f:
    json.dump(color_data, f, indent=4)

In [None]:
# Copy each pokémon's colour name from color_data.json into database.json.
with open('database.json', 'r') as f:
    pokemon_data = json.load(f)

with open('color_data.json', 'r') as f:
    color_data = json.load(f)

# Index the colour names by pokémon id: one dictionary lookup per pokémon
# instead of scanning every colour record for every pokémon.
# (If an id appeared twice, the later record wins, as it did with the nested loops.)
color_name_by_id = {
    color_entry['id']: color_entry['nom couleur']
    for color_entry in color_data.values()
}

# Attach the colour when a record with the same id exists; otherwise leave the entry untouched.
for pokemon_entry in pokemon_data.values():
    pokemon_id = pokemon_entry['id']
    if pokemon_id in color_name_by_id:
        pokemon_entry['couleur dominante'] = color_name_by_id[pokemon_id]

with open('database.json', 'w') as f:
    json.dump(pokemon_data, f, indent=4)


## Ajout de tags

In [None]:
import pandas as pd
import json
import csv

In [None]:
# Read the pokémon table and keep only the columns that will be used as tags.
tag_columns = ['#', 'Name', 'Type 1', 'Type 2', 'Generation', 'Legendary']
df = pd.read_csv(
    r"./data_csv/Pokemon.csv",
    sep=',',
    header=0,
)
df_selected = df.loc[:, tag_columns]

In [None]:
# Rename the "#" column to "id". rename() returns a new frame, which avoids
# mutating a slice of `df` in place (and the SettingWithCopyWarning that can come with it).
df_selected = df_selected.rename(columns={'#': 'id'})

# Convert to a list of records and write tags.json once, pretty-printed
# (the original wrote the file, read it back and wrote it again).
data_t = json.loads(df_selected.to_json(orient='records'))
with open('tags.json', 'w') as f:
    json.dump(data_t, f, indent=4)

# Load the image database that the tags will be attached to.
with open('database.json', 'r') as f:
    data_d = json.load(f)

In [None]:
# Attach the matching tag record to every entry of data_d, then save database.json.

# Index the tag records by id: one lookup per entry instead of a full scan.
# A later record with the same id overrides an earlier one, exactly as the
# original nested loop did.
# NOTE(review): if the CSV lists several rows per id, confirm that "last row wins" is the intended choice.
tags_by_id = {record["id"]: record for record in data_t}

for entry in data_d.values():
    record = tags_by_id.get(entry["id"])
    if record is None:
        # No tag row for this id: keep the existing value. The original
        # crashed here, calling .pop() on the "" placeholder string.
        continue
    # Store a copy without the redundant "id" field, so the records of
    # data_t are left intact.
    entry["tags"] = {field: value for field, value in record.items() if field != "id"}

# Save the enriched database.
with open('database.json', 'w') as f:
    json.dump(data_d, f, indent=4)

## Création des utilisateurs

In [None]:
import json
from random import randint, choice
import os

In [None]:
# Load the image records and split them into a training and a test set.
def load(filename, split_id=598):
    """Read a JSON database and split its entries by id.

    Args:
        filename: path of a JSON file mapping a key to a record that has an
            integer "id" field.
        split_id: records with id < split_id go to the training set, the
            others to the test set. Defaults to 598, the threshold that was
            previously hard-coded (described in the original as an 85% / 15% split).

    Returns:
        A (train, test) tuple of dictionaries holding the same keys and
        record objects as the file.
    """
    with open(filename, "r") as f:
        data = json.load(f)

    train = {}
    test = {}
    for key, record in data.items():
        target = train if record["id"] < split_id else test
        target[key] = record
    return train, test

# Filter the images according to one user's preferences.
def filter_images(images, color, legendary):
    """Return the tags of the images matching a colour and a legendary flag.

    Args:
        images: dictionary of image records; each record has a
            "couleur dominante" value and a "tags" dictionary containing at
            least "Name" and "Legendary".
        color: dominant colour the record must have.
        legendary: boolean the record's "Legendary" tag must be equal to.

    Returns:
        A list with one NEW dictionary per matching image: a copy of its tags
        plus "color" (the requested colour) and "first_letter" (first
        character of "Name").

    The input records are never modified. The previous version wrote "color"
    and "first_letter" straight into every image's shared tags dictionary, so
    each call overwrote the "color" of results returned by earlier calls.
    """
    filtered_images = []
    for image in images.values():
        if image["couleur dominante"] != color:
            continue
        # Same truth table as the original `not (a ^ b)` on booleans.
        if bool(image["tags"]["Legendary"]) != bool(legendary):
            continue
        tags = dict(image["tags"])  # copy: do not touch the caller's data
        tags["color"] = color
        tags["first_letter"] = tags["Name"][0]
        filtered_images.append(tags)
    return filtered_images

# Generate one user's preferences and return the images that match them.
def get_user_preferences(images,color_t):
    """Draw a random colour and a random legendary flag, then filter `images`.

    The colour is drawn from `color_t` first, the legendary flag second; the
    result is whatever filter_images returns for that pair.
    """
    color_index = randint(0, len(color_t) - 1)
    legendary_options = [True, False]
    legendary_index = randint(0, len(legendary_options) - 1)
    return filter_images(images, color_t[color_index], legendary_options[legendary_index])

In [None]:
# Collect the distinct "couleur dominante" values of database.json, in first-seen order.
with open("database.json", "r") as f:
    data = json.load(f)

# dict.fromkeys keeps the first occurrence of each value and preserves insertion order.
color_t = list(dict.fromkeys(record["couleur dominante"] for record in data.values()))

In [None]:
# Simulate the users: each one gets randomly drawn preferences and a random
# "favorite" / "notfavorite" verdict for every image matching them.
img, test = load("database.json")

favorite_t = ["favorite", "notfavorite"]
all_user = {}

for i in range(100):  # 100 users
    data = get_user_preferences(img, color_t)
    if not data:
        # No image matches this user's preferences: the user is skipped.
        continue

    # One random verdict per matching image.
    result = [favorite_t[randint(0, len(favorite_t) - 1)] for _ in data]
    all_user[i] = {'data': data, 'result': result}

In [None]:
# Save the simulated users to user.json and the test records to test.json.
for output_path, payload in (("user.json", all_user), ("test.json", test)):
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=4)

## Entrainement du modèle

## A. Decision Tree

In [None]:
from sklearn import tree
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import graphviz
import pydotplus
from IPython.display import Image, display
import json
import pickle
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Read the user data from user.json.
with open('user.json', "r") as f:
    user = json.loads(f.read())

In [None]:
# Decision-tree classifier.
dtc = tree.DecisionTreeClassifier()

# Eight independent label encoders, le1 ... le8.
le1, le2, le3, le4, le5, le6, le7, le8 = (LabelEncoder() for _ in range(8))

### __A.1 Création du modèle__

In [None]:
# Build the training frames and fit the decision tree.
#
# The previous loop refitted the same tree (and the same encoders) on every
# user in turn; fit() starts from scratch each time, so only the LAST user's
# rows ended up in the model. All users are pooled and the tree is fitted once.

# "Legendary" is the key stored in user.json; the former "legendary" spelling
# matched no key and produced a column full of NaN.
feature_columns = ["Name", "Type 1", "Type 2", "Generation", "Legendary", "color", "first_letter"]

# Rows and verdicts are gathered in the same user order, so they stay aligned.
all_rows = [row for profile in user.values() for row in profile["data"]]
all_results = [verdict for profile in user.values() for verdict in profile["result"]]

dataframe = pd.DataFrame(all_rows, columns=feature_columns)
resultframe = pd.DataFrame(all_results, columns=["favorite"])

# A missing second type becomes an explicit category, so the encoder only sees strings.
dataframe["Type 2"] = dataframe["Type 2"].fillna("None")

# Turn every feature into integer codes (one encoder per column), then the target.
for column, encoder in zip(feature_columns, (le1, le2, le3, le4, le5, le6, le7)):
    dataframe[column] = encoder.fit_transform(dataframe[column])
resultframe["favorite"] = le8.fit_transform(resultframe["favorite"])

# Fit the decision tree on the pooled data (1-D target).
dtc = dtc.fit(dataframe.values, resultframe["favorite"].values)

### __A.2 Visualisation du modèle__

In [None]:
# Export the fitted tree to Graphviz and display it as a PNG.
dot_data = tree.export_graphviz(
    dtc,
    out_file=None,
    feature_names=list(dataframe.columns),
    filled=True,
    rounded=True,
    # export_graphviz expects the class names in ascending order of the
    # encoded classes; decoding dtc.classes_ guarantees that order, whereas
    # .unique() returned them in order of first appearance.
    class_names=[str(label) for label in le8.inverse_transform(dtc.classes_)],
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Render through pydotplus and show the image inline.
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
img = Image(pydot_graph.create_png())
display(img)

### __A.3 Evaluation du modèle__

In [None]:
# Accuracy measured on the rows held in `dataframe` (no separate held-out set is used here).
# The tree was fitted on a plain array (dataframe.values), so predict on an
# array too: this avoids scikit-learn's "X has feature names" mismatch warning.
print("Accuracy:",metrics.accuracy_score(resultframe, dtc.predict(dataframe.values)))

In [None]:
# Predictions (plain array, matching how the tree was fitted).
y_pred = dtc.predict(dataframe.values)

# Confusion matrix, one row/column per encoded class, normalised per true class.
encoded_labels = list(range(len(le8.classes_)))
cm = confusion_matrix(resultframe["favorite"], y_pred, labels=encoded_labels, normalize='true')

# LabelEncoder sorts its classes, so code 0 is "favorite" and code 1 is
# "notfavorite"; the former hard-coded ['not favorite', 'favorite'] labels
# were therefore swapped. Derive the labels from the encoder instead.
pretty_names = {'favorite': 'favorite', 'notfavorite': 'not favorite'}
display_names = [pretty_names.get(name, name) for name in le8.classes_]

# Display the confusion matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_names)
disp.plot(cmap='Blues')
plt.show()

## B. Neural Network

### __B.1 Création du modèle__

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model
import json
import matplotlib.pyplot as plt

In [None]:
# Read the user data from user.json.
with open('user.json', 'r') as f:
    user_data = json.loads(f.read())

In [None]:
# Eight independent label encoders, le1 ... le8.
le1, le2, le3, le4, le5, le6, le7, le8 = (LabelEncoder() for _ in range(8))

In [None]:
# Build the data frames, split them and train the neural network.
#
# The previous loop rebuilt and retrained a brand-new network for every user,
# keeping only the one trained on the LAST user's rows. All users are pooled
# and a single network is trained.

# "Legendary" is the key stored in user.json; the former "legendary" spelling
# matched no key and produced a column full of NaN.
feature_columns = ["Name", "Type 1", "Type 2", "Generation", "Legendary", "color", "first_letter"]

# Rows and verdicts are gathered in the same user order, so they stay aligned.
all_rows = [row for profile in user_data.values() for row in profile["data"]]
all_results = [verdict for profile in user_data.values() for verdict in profile["result"]]

dataframe = pd.DataFrame(all_rows, columns=feature_columns)
resultframe = pd.DataFrame(all_results, columns=["favorite"])

# A missing second type becomes an explicit category, so the encoder only sees strings.
dataframe["Type 2"] = dataframe["Type 2"].fillna("None")

# Generate numerical labels (one encoder per feature column, then the target).
for column, encoder in zip(feature_columns, (le1, le2, le3, le4, le5, le6, le7)):
    dataframe[column] = encoder.fit_transform(dataframe[column])
resultframe["favorite"] = le8.fit_transform(resultframe["favorite"])

# 80% / 20% split; the labels are selected through the feature index so both
# halves are guaranteed to stay aligned.
train_dataset = dataframe.sample(frac=0.8, random_state=0)
test_dataset = dataframe.drop(train_dataset.index)
train_labels = resultframe.loc[train_dataset.index]
test_labels = resultframe.drop(train_dataset.index)

# Define the model: three hidden ReLU layers and one sigmoid output unit.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(len(feature_columns),)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, validating on the held-out split after each epoch.
history = model.fit(train_dataset, train_labels, validation_data=(test_dataset, test_labels), epochs=5)

### __B.2 Visualisation du modèle__

In [None]:
# Draw the neural network (with shapes and layer names) into model.png.
plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
    show_layer_names=True,
)

### __B.3 Evaluation du modèle__

In [None]:
# Evaluate on the held-out split, so the figures printed as "Test" really are
# test metrics (they were previously computed on the training split).
test_loss, test_acc = model.evaluate(test_dataset, test_labels)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

In [None]:
# Confusion matrix of the neural network, computed on the training split.
# Sequential.predict_classes no longer exists in recent Keras releases:
# threshold the sigmoid output at 0.5 to obtain the predicted class.
y_pred = (model.predict(train_dataset) > 0.5).astype("int32").ravel()

# One row/column per encoded class, normalised per true class.
encoded_labels = list(range(len(le8.classes_)))
cm = confusion_matrix(np.ravel(train_labels), y_pred, labels=encoded_labels, normalize='true')

# LabelEncoder sorts its classes, so code 0 is "favorite" and code 1 is
# "notfavorite"; the former hard-coded ['not favorite', 'favorite'] labels
# were therefore swapped. Derive the labels from the encoder instead.
pretty_names = {'favorite': 'favorite', 'notfavorite': 'not favorite'}
display_names = [pretty_names.get(name, name) for name in le8.classes_]

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_names)
disp.plot(cmap='Blues')
plt.show()

## C. Random Forest

### __C.1 Création du modèle__

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import json
from sklearn.datasets import make_classification
from sklearn.tree import plot_tree

In [None]:
# Read the user data from user.json.
with open('user.json', 'r') as f:
    user_data = json.loads(f.read())

# Eight independent label encoders, le1 ... le8.
le1, le2, le3, le4, le5, le6, le7, le8 = (LabelEncoder() for _ in range(8))

In [None]:
# Build the data frames, split them and train the random forest.
#
# The previous loop created and fitted a new forest for every user, keeping
# only the one fitted on the LAST user's rows. All users are pooled and a
# single forest is fitted.

# "Legendary" is the key stored in user.json; the former "legendary" spelling
# matched no key and produced a column full of NaN.
feature_columns = ["Name", "Type 1", "Type 2", "Generation", "Legendary", "color", "first_letter"]

# Rows and verdicts are gathered in the same user order, so they stay aligned.
all_rows = [row for profile in user_data.values() for row in profile["data"]]
all_results = [verdict for profile in user_data.values() for verdict in profile["result"]]

dataframe = pd.DataFrame(all_rows, columns=feature_columns)
resultframe = pd.DataFrame(all_results, columns=["favorite"])

# A missing second type becomes an explicit category, so the encoder only sees strings.
dataframe["Type 2"] = dataframe["Type 2"].fillna("None")

# Generate numerical labels (one encoder per feature column, then the target).
for column, encoder in zip(feature_columns, (le1, le2, le3, le4, le5, le6, le7)):
    dataframe[column] = encoder.fit_transform(dataframe[column])
resultframe["favorite"] = le8.fit_transform(resultframe["favorite"])

# 80% / 20% split; the labels are selected through the feature index so both
# halves are guaranteed to stay aligned.
train_data = dataframe.sample(frac=0.8, random_state=0)
test_data = dataframe.drop(train_data.index)
train_labels = resultframe.loc[train_data.index]
test_labels = resultframe.drop(train_data.index)

# Define the model.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the model; the target is flattened to 1-D, which is what the
# classifier expects (a column-vector target triggers a DataConversionWarning).
model.fit(train_data, np.ravel(train_labels))

### __C.2 Visualisation du modèle__

In [None]:
# Draw the first tree of the forest on a canvas large enough to be readable,
# with real column names on the split nodes; plt.show() also keeps the list of
# annotation objects returned by plot_tree out of the cell output.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    model.estimators_[0],
    feature_names=list(train_data.columns),
    filled=True,
    rounded=True,
    ax=ax,
)
plt.show()

### __C.3 Evaluation du modèle__

In [None]:
# Accuracy on the training split. `y_pred` is kept as-is because the
# confusion-matrix cell reuses it together with train_labels.
y_pred = model.predict(train_data)
print("Accuracy:", accuracy_score(train_labels, y_pred))

# Accuracy on the held-out split, which was created but never used; this is
# the figure that says something about unseen data.
if len(test_data):
    print("Held-out accuracy:", accuracy_score(test_labels, model.predict(test_data)))

In [None]:
# Confusion matrix for train_labels against y_pred,
# one row/column per encoded class, normalised per true class.
encoded_labels = list(range(len(le8.classes_)))
cm = confusion_matrix(np.ravel(train_labels), y_pred, labels=encoded_labels, normalize='true')

# LabelEncoder sorts its classes, so code 0 is "favorite" and code 1 is
# "notfavorite"; the former hard-coded ['not favorite', 'favorite'] labels
# were therefore swapped. Derive the labels from the encoder instead.
pretty_names = {'favorite': 'favorite', 'notfavorite': 'not favorite'}
display_names = [pretty_names.get(name, name) for name in le8.classes_]

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_names)
disp.plot(cmap='Blues')
plt.show()

## Système de recommandation

## Visualisation des données

## Tests