# Recommender system
# 1 Get user preferences from the user profile file
# 2 Get the items that match the user preferences (SQL query to the database, 10 items) and adapt the query if there is no result


===========
- Retrieve user preferences (load them from the JSON file and extract the preferred items)
- Query all metadata from the database.
- Rank images based on the user preferences (Clustering, similarity, collaborative filtering)
- Recommend the top 10 images to the user

Rank images :
- Convert data to a matrix or a dataframe
- Apply clustering algorithm (K-means, DBSCAN, Hierarchical clustering)

In [None]:
import sqlite3
import os
import pandas as pd

In [None]:
# Set the base folder path for the project.
# The path is relative, so it is resolved against the directory the notebook is run from.
output_path = "../output"
# NOTE(review): presumably the folder holding the image files — confirm against the code that uses it.
images_path = os.path.join(output_path, "images")
# Folder containing the SQLite metadata database
metadata_path = os.path.join(output_path, "metadata")

# Full path of the database file opened by get_users()
database_path = os.path.join(metadata_path, "metadata.db")

In [None]:
def get_users():
    """
    Load every row of the ``users`` table, keyed by its first column.

    :return: A dictionary mapping the first column of each row to a tuple
        holding the remaining columns of that row, in table order
    """
    conn = sqlite3.connect(database_path)
    try:
        rows = conn.execute("SELECT * FROM users").fetchall()
    finally:
        # Always release the database handle, even when the query fails
        conn.close()

    # NOTE(review): rows are expected to look like
    # (pseudo, fav_color, fav_orientation, fav_height, fav_width, fav_tags) — confirm against the table schema
    return {row[0]: row[1:] for row in rows}

In [None]:
# Display every user stored in the database (notebook cell output)
get_users()

In [None]:
def get_user_preferences(user):
    """
    Look up the stored preferences of a single user.

    :param user: Key of the user in the mapping returned by get_users()
    :return: The value stored for that user, i.e. the remaining columns of the user's row
    :raises KeyError: If the user is not present in the mapping
    """
    return get_users()[user]

In [None]:
# Display the stored preferences of the user 'Yannis' (notebook cell output)
get_user_preferences('Yannis')

In [None]:
def get_metadata_from_sqlite_db(db_name='metadata.db'):
    """
    Get the metadata from the sqlite database

    :param db_name: The name of the database file inside ``metadata_path``
    :return: A dictionary mapping each filename to a dictionary of its
        metadata (key -> value). Values are strings, except ``tags`` which is
        parsed from its Python-literal text form (e.g. a list).
    :raises ValueError: If a ``tags`` value is not a valid Python literal
    :raises SyntaxError: If a ``tags`` value cannot be parsed at all
    """
    # Local import: only the parsing of the 'tags' value needs it
    import ast

    # Open a connection to the database
    conn = sqlite3.connect(os.path.join(metadata_path, db_name))
    try:
        # Fetch one row per (filename, key, value) instead of concatenating the
        # pairs in SQL and splitting them again: values containing the
        # separator can no longer corrupt the result. The casts keep every
        # key and value a string, and pairs with a NULL key or value are skipped.
        rows = conn.execute(
            "SELECT filename, CAST(key AS TEXT), CAST(value AS TEXT) "
            "FROM metadata "
            "WHERE key IS NOT NULL AND value IS NOT NULL "
            "ORDER BY filename"
        ).fetchall()
    finally:
        # Release the connection even when the query fails
        conn.close()

    # Group the key-value pairs by filename
    metadata_dict = {}
    for filename, key, value in rows:
        # if the key is tags, convert the string to a list;
        # literal_eval only accepts literals, so database content is never executed as code
        if key == 'tags':
            value = ast.literal_eval(value)
        metadata_dict.setdefault(filename, {})[key] = value

    return metadata_dict

In [None]:
# Build a dataframe with one row per file. The dict of dicts produces one
# column per filename, so it is transposed, and the index is then exposed as a
# regular 'filename' column.
metadata = get_metadata_from_sqlite_db()
df_metadata = (
    pd.DataFrame(metadata)
    .transpose()
    .rename_axis('filename')
    .reset_index()
)
df_metadata.head()

In [None]:
# Build a one-row dataframe describing the selected user
user_name = 'Yannis'
user = get_user_preferences(user_name)
# Wrap the preferences in a list so they form a single row, and index that row
# by the user's name so the 'pseudo' column holds the name rather than a
# positional index.
df_user = pd.DataFrame([user], index=pd.Index([user_name], name='pseudo')).reset_index()
# rename the positional columns : fav_color, fav_orientation, fav_height, fav_width, fav_tags
df_user = df_user.rename(columns={0: 'fav_color', 1: 'fav_orientation', 2: 'fav_height', 3: 'fav_width', 4: 'fav_tags'})
# format fav_tags to remove (), '' and spaces. regex=False treats every pattern
# as a literal string: '(' and ')' are not valid regular expressions, and the
# default of the regex parameter differs between pandas versions.
for char in ("(", ")", "'", " "):
    df_user['fav_tags'] = df_user['fav_tags'].str.replace(char, '', regex=False)
df_user.head()

In [None]:
# Turn the user's fav_tags string (first row of df_user) into a list of tags:
# delete brackets, parentheses and quotes in a single pass, then split on commas.
tags = df_user['fav_tags'][0].translate(str.maketrans('', '', "[()]'")).split(',')
tags

In [None]:
!pip install nltk gensim

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
# Download the NLTK data packages requested by name below
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Build the WordNet information-content data from the Genesis corpus.
# NOTE(review): the positional arguments look like weight_senses_equally=False
# and smoothing=0.0 — confirm against the NLTK documentation.
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

In [None]:
# Select the 'tags' column of the metadata dataframe (one entry per file).
# No cleaning happens here; this rebinds `tags`, replacing the user's tag list built earlier.
tags = df_metadata['tags']

In [None]:
# get the list of all distinct tags, in order of first appearance
all_tags = []
seen_tags = set()  # constant-time membership test; assumes tags are hashable (e.g. strings)
for tag in tags:
    try:
        tag_items = iter(tag)
    except TypeError:
        # Entries that are not iterable (e.g. a missing value for a file
        # without tags) carry no tags: skip them instead of hiding every error
        continue
    for t in tag_items:
        if t not in seen_tags:
            seen_tags.add(t)
            all_tags.append(t)
all_tags

In [None]:
# Wrap the tag list in a single-column dataframe; that column is labelled 0
# by default, so rename it to 'tags'.
df_all_tags = pd.DataFrame(all_tags).rename(columns={0: 'tags'})
df_all_tags.head()

In [None]:
!pip install spacy

In [None]:
import spacy
# download the model
!python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

In [None]:
def word_to_vector(word):
    """
    Convert a word to its vector representation

    :param word: The text to run through the loaded spaCy pipeline ``nlp``
    :return: The ``vector`` attribute of the resulting document
    """
    doc = nlp(word)
    return doc.vector

# Apply the function to all words in the dataframe and keep each tag's vector in a new 'vector' column
df_all_tags['vector'] = df_all_tags['tags'].apply(word_to_vector)
df_all_tags.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(a, b):
    """
    Compute the cosine similarity between two vectors

    :param a: First vector (an object with a ``reshape`` method, e.g. a numpy array)
    :param b: Second vector
    :return: The similarity of the two vectors as a single scalar
    """
    # Present each vector as a one-row matrix before the pairwise computation
    row_a = a.reshape(1, -1)
    row_b = b.reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)
    # The result holds exactly one entry: the similarity of row_a and row_b
    return pairwise[0][0]

In [None]:
# Example usage: similarity between the vectors of two words
vector1 = word_to_vector("apple")
vector2 = word_to_vector("orange")
similarity = cosine_sim(vector1, vector2)
print(similarity)  # Output of one run: 0.613587 (NOTE(review): presumably depends on the loaded spaCy model)

In [None]:
def word_to_list_similarity(word, word_list):
    """
    Compute the similarity between a word and a list of words

    :param word: The reference word
    :param word_list: The words to compare against; must provide ``apply`` (e.g. a pandas Series)
    :return: The result of mapping each word of ``word_list`` to its cosine similarity with ``word``
    """
    reference = word_to_vector(word)
    candidate_vectors = word_list.apply(word_to_vector)
    return candidate_vectors.apply(lambda vec: cosine_sim(reference, vec))

In [None]:
# Split the user's fav_tags string (first row of df_user) on commas to get the list of favourite tags
user_tags = df_user['fav_tags'][0].split(',')
user_tags

In [None]:
# Vectorize the known tags once: these vectors do not depend on the user tag,
# so recomputing them for every user tag would only repeat the same work.
known_tag_vectors = df_all_tags['tags'].apply(word_to_vector)

# Add one similarity column per user tag
for tag in user_tags:
    tag_vector = word_to_vector(tag)
    similarities = known_tag_vectors.apply(lambda vec: cosine_sim(tag_vector, vec))
    df_all_tags[tag] = similarities

# Sort the rows in descending order: the first user tag is the primary key,
# the following tags only break ties
df_all_tags = df_all_tags.sort_values(by=user_tags, ascending=False)
df_all_tags.head()

In [None]:

# Get the similarity between the hard-coded word "water" and all words in the dataframe,
# and store it in a 'similarity' column
similarities = word_to_list_similarity("water", df_all_tags['tags'])
df_all_tags['similarity'] = similarities
df_all_tags.head()

In [None]:
# sort the dataframe by the 'similarity' column, highest values first
df_all_tags = df_all_tags.sort_values(by=['similarity'], ascending=False)
df_all_tags.head()