# Project CHRETIEN Tristan | DURAND Victor

## Installation

- Install kaggle with pip3: `pip3 install -r requirements.txt`
- Go to https://kaggle.com
- Create an account
- Generate an API key from your account tab
- Download the `kaggle.json` file and store it at `/home/${USER}/.kaggle/kaggle.json`


In [None]:
!pip3 install -r requirements.txt

## Download dataset

In [2]:
# Fetch the Pokemon images/types dataset from Kaggle and unzip it locally.
import kaggle

# Folder that receives the unzipped archive.
directory = "./"

kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'vishalsubbiah/pokemon-images-and-types',
    path=directory,
    unzip=True,
)



## Generate files metadata

In [6]:
# Maximum number of image files processed by the metadata-extraction loop.
number_of_image_to_extract = 200

In [7]:
import os
from PIL import Image
from PIL import ImageColor
from PIL.ExifTags import TAGS
from matplotlib import widgets
import pandas as pd
import json
import numpy as np
import math
from sklearn.cluster import KMeans
import webcolors
import progressbar


def main_colors(imgfile, n_colors=2):
    """Return the dominant colours of an image as ``#rrggbb`` strings.

    The pixel data is clustered with K-means; each cluster centre is turned
    into a hex string using its first three channels, rounded up.

    Args:
        imgfile: Object exposing ``getdata()`` (e.g. a PIL image).
        n_colors: Number of clusters, i.e. colours, to extract. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        A list of ``n_colors`` hex strings. An empty list is returned when
        the pixel data is not two-dimensional (one row per pixel, one column
        per channel), so callers always receive a list.
    """
    pixels = np.array(imgfile.getdata(), np.uint8)
    if pixels.ndim != 2:
        # Previously '' was returned here; an empty list keeps the return
        # type consistent while staying falsy, empty and iterable.
        return []

    clusters = KMeans(n_clusters=n_colors)
    clusters.fit(pixels)

    colors = []
    for center in clusters.cluster_centers_:
        red, green, blue = (math.ceil(center[channel]) for channel in range(3))
        colors.append(f"#{red:02x}{green:02x}{blue:02x}")
    return colors

def get_closest_color(rgb_triplet):
    """Name the CSS 2.1 colour nearest to ``rgb_triplet``.

    Nearness is the squared Euclidean distance between the RGB components.

    Args:
        rgb_triplet: Indexable holding red, green and blue at positions
            0, 1 and 2.

    Returns:
        The name from ``webcolors.CSS21_HEX_TO_NAMES`` with the smallest
        distance to ``rgb_triplet``.
    """
    # Keyed by distance: colours at the same distance overwrite each other,
    # so the one iterated last is the one kept for that distance.
    name_by_distance = {}
    for hex_code, colour_name in webcolors.CSS21_HEX_TO_NAMES.items():
        candidate = webcolors.hex_to_rgb(hex_code)
        distance = sum(
            (candidate[channel] - rgb_triplet[channel]) ** 2
            for channel in range(3)
        )
        name_by_distance[distance] = colour_name
    smallest = min(name_by_distance)
    return name_by_distance[smallest]

# Pokemon names and types. The header row is skipped and header=None is
# used, so the columns are addressed by position (0, 1, 2) below.
# Missing values are handled per field when each record is built; the former
# `df.replace(np.nan, "")` call discarded its result and has been removed.
df = pd.read_csv('images/pokemon.csv', sep=',', header=None, skiprows=1)
json_data = []

# List the folder once and keep only the files that will be processed, so
# the progress bar maximum matches the number of loop iterations.
image_dir = "images/images/"
selected_files = os.listdir(image_dir)[:number_of_image_to_extract]
total = len(selected_files)

bar = progressbar.ProgressBar(
    widgets=[
        'Extraction : ',
        ' ',
        progressbar.Percentage(),
        progressbar.Bar(marker='#', left='[', right=']'),
        ' ',
    ],
    maxval=total,
)
bar.start()

# Ids are 1-based and follow the processing order.
for image_id, filename in enumerate(selected_files, start=1):
    f = image_dir + filename
    # resize() returns a new image, so the source file can be closed at once.
    with Image.open(f) as source_image:
        image = source_image.resize((120, 120))

    # CSV row whose first column equals the file name without its extension.
    metadata = df.loc[df[0] == filename.split(".")[0]]

    main_colors_value = main_colors(image)
    closest_name_list = [
        get_closest_color(ImageColor.getcolor(hex_color, "RGB"))
        for hex_color in main_colors_value
    ]

    json_metadata = {
        "id": image_id,
        "properties": {
            "name": metadata[0].replace(np.nan, "None").values[0],
            "type1": metadata[1].replace(np.nan, "None").values[0],
            "type2": metadata[2].replace(np.nan, "None").values[0],
        },
        "size": image.size,
        "colors": main_colors_value,
        "closest_colors": closest_name_list,
        "tags": [],
        "path": f,
    }
    json_data.append(json_metadata)
    bar.update(image_id)
bar.finish()



KeyboardInterrupt: 

### Write metadata to json file

In [4]:
import os

# open() does not create missing parent folders, so make sure the output
# directory exists before writing the metadata file.
os.makedirs("images/metadata", exist_ok=True)
with open("images/metadata/metadata.json", 'w') as outfile:
    json.dump(json_data, outfile)

### Data visualization

In [5]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import squarify
from collections import Counter

import matplotlib.image as mpimg

# Load the metadata records written by the extraction step.
with open('images/metadata/metadata.json', 'r') as f:
    data = json.load(f)
dataFrame = pd.DataFrame(data)

# Flatten the nested "properties" dict into "properties.<key>" columns.
# No `meta` argument is passed: it only applies together with `record_path`.
df = pd.json_normalize(data)

## Count by type
# Group by the column name itself (not a one-element list) so the group keys
# are plain strings that can be used directly as bar labels.
grouped_df = df.groupby('properties.type1')['properties.type1']
print(grouped_df.describe())
type_counts = grouped_df.count()
x = list(type_counts.index)
y = list(type_counts.values)

plt.bar(x, y)
plt.show()

### Colors bar
# Occurrences of every closest colour name across all images.
count_by_colors = Counter(
    color for el in data for color in el["closest_colors"]
)
color_names = list(count_by_colors)
color_counts = list(count_by_colors.values())
positions = range(len(count_by_colors))

# Each bar is drawn in the colour it counts.
plt.bar(positions, color_counts, align='center', color=color_names)
plt.xticks(positions, color_names)
plt.show()


df = pd.DataFrame({'presence': color_counts, 'color': color_names})

# plot it
squarify.plot(sizes=df['presence'], label=df['color'], alpha=.8, color=color_names)
plt.axis('off')
plt.show()

# Show ten randomly picked images in a 2 x 5 grid.
_, axs = plt.subplots(2, 5, figsize=(8, 8))
axs = axs.flatten()
for ax in axs:
    image = dataFrame['path'].sample().iloc[0]
    img = mpimg.imread(image)
    ax.imshow(img)
plt.show()




                 count unique       top freq
properties.type1                            
Bug                 23      1       Bug   23
Dark                 4      1      Dark    4
Dragon               5      1    Dragon    5
Electric            11      1  Electric   11
Fairy                5      1     Fairy    5
Fighting             7      1  Fighting    7
Fire                13      1      Fire   13
Ghost                5      1     Ghost    5
Grass               19      1     Grass   19
Ground               9      1    Ground    9
Ice                  9      1       Ice    9
Normal              25      1    Normal   25
Poison              12      1    Poison   12
Psychic             10      1   Psychic   10
Rock                10      1      Rock   10
Steel                9      1     Steel    9
Water               24      1     Water   24


### Generate random users preferences

Number of users to generate preferences for:


In [4]:
# How many synthetic users get randomly generated preferences.
number_of_users = 10

In [5]:
import random
import json
import numpy as np

# Load the image metadata; favourites and dislikes are 0-based positions in it.
with open("images/metadata/metadata.json", 'r') as metadata_file:
    images_infos = json.load(metadata_file)

users_preferences = []
# Number of favourites and of dislikes drawn for every user.
picks_per_user = 8
# Start at 0: the values index `images_infos` directly, so starting at 1
# would make the first image impossible to pick.
all_indexes = range(len(images_infos))

for user_id in range(number_of_users):
    favorites_index = random.sample(all_indexes, picks_per_user)
    favorites_lookup = set(favorites_index)
    # sample() (not choices()) so an image is never disliked twice.
    dislike_index = random.sample(
        [i for i in all_indexes if i not in favorites_lookup],
        picks_per_user,
    )

    # Distinct closest-colour names over all favourite images.
    favorites_colors = {
        color
        for index in favorites_index
        for color in images_infos[index]["closest_colors"]
    }

    # Distinct (type1, type2) pairs of the liked / disliked images.
    favorites_types = {
        (images_infos[index]["properties"]["type1"], images_infos[index]["properties"]["type2"])
        for index in favorites_index
    }
    disliked_types = {
        (images_infos[index]["properties"]["type1"], images_infos[index]["properties"]["type2"])
        for index in dislike_index
    }

    user_metadata = {
        "id": user_id + 1,
        "favorites": favorites_index,
        "dislikes": dislike_index,
        "favorites_types": list(favorites_types),
        "disliked_types": list(disliked_types),
        "colors": sorted(favorites_colors),
    }
    users_preferences.append(user_metadata)

with open("images/metadata/users_preferences.json", 'w') as outfile:
    json.dump(users_preferences, outfile)

### Image recommendation based on user preferences

In [None]:
from sklearn import tree
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.image as mpimg

# Read the image metadata, then close the file before any processing.
with open('images/metadata/metadata.json') as meta:
    meta_data = json.load(meta)

# First / second closest colour name and hex colour per image; None when an
# image has fewer than two entries.
colors_closet1 = [val['closest_colors'][0] if len(val['closest_colors']) > 1 else None for val in meta_data]
colors_closet2 = [val['closest_colors'][1] if len(val['closest_colors']) > 1 else None for val in meta_data]

colors1 = [val['colors'][0] if len(val['colors']) > 1 else None for val in meta_data]
colors2 = [val['colors'][1] if len(val['colors']) > 1 else None for val in meta_data]

# df_test keeps every flattened column (including 'id' and 'path') and is
# used later to look images up by position.
df_test = pd.json_normalize(meta_data)

# df_images is the feature table. drop() returns a new frame, so adding
# columns below does not write into a slice of another DataFrame.
df_images = df_test.drop(
    columns=['size', 'id', 'properties.name', 'tags', 'path', 'colors', 'closest_colors'],
    errors='ignore',
)
df_images['colors1'] = colors1
df_images['colors2'] = colors2
df_images['colors_closet1'] = colors_closet1
df_images['colors_closet2'] = colors_closet2

with open('images/metadata/users_preferences.json') as user_metadata:
    user_data = json.load(user_metadata)
df_u = pd.json_normalize(user_data)

# Encode the categorical columns as integers. Every fit_transform call
# refits its encoder, so the integer codes are specific to each column.
le1 = LabelEncoder()
df_images['colors1'] = le1.fit_transform(df_images['colors1'])
df_images['colors2'] = le1.fit_transform(df_images['colors2'])

le2 = LabelEncoder()
df_images['colors_closet1'] = le2.fit_transform(df_images['colors_closet1'])
df_images['colors_closet2'] = le2.fit_transform(df_images['colors_closet2'])

le3 = LabelEncoder()
df_images['properties.type2'] = le3.fit_transform(df_images['properties.type2'])
df_images['properties.type1'] = le3.fit_transform(df_images['properties.type1'])


"""Separate data for train set and test set"""
# Keep the historical cut at row 650 when the data set is large enough;
# otherwise fall back to an 80/20 split so the test set is never empty
# (e.g. when only 200 images were extracted).
split_index = 650 if len(df_images) > 650 else int(len(df_images) * 0.8)
train = df_images[:split_index]
test = df_images[split_index:]

dtc = tree.DecisionTreeClassifier()

fitted_models = []

def randomDf():
    """Return 8 rows drawn one at a time from the module-level ``train`` frame.

    Each row comes from its own ``train.sample()`` call, so the same row can
    appear more than once. The rows are concatenated in a single call rather
    than appended one by one to an empty DataFrame.

    Returns:
        A DataFrame of 8 rows with a fresh 0..7 index.
    """
    sampled_rows = [train.sample() for _ in range(8)]
    return pd.concat(sampled_rows, ignore_index=True, axis=0)

label_likes = LabelEncoder()
users_preferences = {}
for index, row in df_u.iterrows():
    # NOTE(review): the features are 8 random training rows, unrelated to the
    # user's favourites that serve as labels - confirm this is intended.
    sampled_features = randomDf()
    encoded_favorites = label_likes.fit_transform(row['favorites'])
    fits = dtc.fit(sampled_features, encoded_favorites)
    prediction = fits.predict(test)
    # predict() already returns a 1-D array, which is what inverse_transform
    # expects; the decoded values are used as row positions in df_test.
    users_preferences[index] = df_test.iloc[label_likes.inverse_transform(prediction)]


users_ids = []
def users_images(n_users=8):
    """Append one favourite and one recommended image position per user.

    For each of the first ``n_users`` users, two row positions of ``df_test``
    are appended to the module-level ``users_ids`` list: a randomly chosen
    favourite, then a randomly chosen recommendation.

    Args:
        n_users: Number of users to process. Defaults to 8.
    """
    for index in range(n_users):
        random_favorite = np.random.choice(df_u.iloc[index]['favorites'], size=1)
        favorite = random_favorite.tolist()[0]
        # Use the row label rather than the 'id' column: ids are 1-based
        # while the consumers index df_test by 0-based position. The label
        # equals the position as long as df_test keeps its default index.
        recommanded = int(users_preferences[index].sample().index[0])
        users_ids.append(favorite)
        users_ids.append(recommanded)

users_images()

# One row per user, two images per row.
fig, axs = plt.subplots(8, 2, figsize=(8, 8))
fig.suptitle("Each row is a user and left image is favorite and right is a recommanded")
axs = axs.flatten()


def plot_users(plot):
    """Draw the images listed in ``users_ids`` on the given axes, then show the figure.

    Args:
        plot: Iterable of matplotlib axes; the n-th axis displays the image
            found at row position ``users_ids[n]`` of ``df_test``.
    """
    for slot, axis in enumerate(plot):
        image_position = users_ids[slot]
        axis.set_title(f"Image number : {image_position}")
        image_path = df_test.iloc[image_position]['path']
        axis.imshow(mpimg.imread(image_path))
    plt.show()


plot_users(axs)

# Print, for the first 8 users, one random recommendation followed by the
# user's stored favourite types, colours and favourite image positions.
for user_number in range(8):
    print("User number :", user_number, " with a random recommanded image : \n")
    print(users_preferences[user_number].sample())
    print('\nFavorites types, color and image for this user : \n')
    for field in ('favorites_types', 'colors', 'favorites'):
        print(df_u.iloc[user_number][field])
    print('\n\n')