# Recommender system
# 1 Get user preferences from the user profile file
# 2 Get all items that match the user preferences (SQL query to the database) (10 items) and adapt if there is no result


===========
- Retrive user preferences (load it from json file and extract prefered items)
- Query all metadata from the database.
- Rank images based on the user preferences (Clustering, similarity, collaborative filtering)
- Recommend the top 10 images to the user

Rank images :
- Convert data to a matrix or a dataframe
- Apply clustering algorithm (K-means, DBSCAN, Hierarchical clustering)

In [3]:
import sqlite3
import os
import pandas as pd

In [4]:
# Set the base folder path for the project
output_path = "../output"
images_path = os.path.join(output_path, "images")
metadata_path = os.path.join(output_path, "metadata")

database_path = os.path.join(metadata_path, "metadata.db")

In [5]:
def get_users():
    # Connect to the database and get all users from the database
    conn = sqlite3.connect(database_path)
    c = conn.cursor()
    c.execute("SELECT * FROM users")
    # Dictionary of users with keys (pseudo, fav_color, fav_orientation, fav_height, fav_width)
    users = c.fetchall()

    users = {user[0]: user[1:] for user in users}

    return users

In [6]:
def get_user_preferences(username):
    # Get the user preferences from the user profile file
    # Return a dictionary with keys (fav_color, fav_orientation, fav_height, fav_width)
    users = get_users()
    return users[username]

In [7]:
def get_metadata_from_sqlite_db(db_name='metadata.db'):
    """
    Get the metadata from the sqlite database

    :param db_name: The name of the database
    :return: A dictionary with the metadata
    """
    # Open a connection to the database
    conn = sqlite3.connect(os.path.join(metadata_path, db_name))
    c = conn.cursor()

    # Retrieve all key-value pairs concatenated by filename
    c.execute("SELECT filename, GROUP_CONCAT(key || ':' || value, '; ') AS metadata FROM metadata GROUP BY filename")
    rows = c.fetchall()

    # Store the concatenated key-value pairs in a dictionary
    metadata_dict = {}
    for row in rows:
        filename, metadata_str = row
        metadata_list = metadata_str.split('; ')
        metadata_dict[filename] = {}
        for metadata_item in metadata_list:
            key, value = metadata_item.split(':', 1)
            # if the key is tags, convert the string to a list
            if key == 'tags':
                metadata_dict[filename][key] = eval(value)
            else:
                metadata_dict[filename][key] = value

    conn.close()

    return metadata_dict

# Get metadata

In [101]:
metadata = get_metadata_from_sqlite_db()
df_metadata = pd.DataFrame(metadata)
# reverse the columns and rows
df_metadata = df_metadata.transpose()
# add name for the first column
df_metadata = df_metadata.rename_axis('filename').reset_index()
df_metadata.head()

Unnamed: 0,filename,Make,DateTimeOriginal,File Name,tags,dominant_color,Artist,GPSInfo,Width,Height,...,LightSource,Flash,FocalLength,ExifImageWidth,ExifImageHeight,ExposureTime,FNumber,ExposureProgram,ISOSpeedRatings,LensModel
0,image_0.jpg,Canon,2014:09:24 20:09:26,image_0.jpg,[person],"[(""#15170e"", 0.3704), (""#4a6423"", 0.1965), (""#...",,,,,...,,,,,,,,,,
1,image_1.jpg,Panasonic,2013:12:07 12:17:20,image_1.jpg,[],"[(""#274210"", 0.2402), (""#e0e2e3"", 0.3357), (""#...",Ugmonk,,,,...,,,,,,,,,,
2,image_10.jpg,SONY,2014:12:03 17:04:16,image_10.jpg,[bird],"[(""#d9d9d9"", 0.3352), (""#474747"", 0.0161), (""#...",,"{0: b'\x02\x03\x00\x00', 1: 'S', 2: (38.0, 28....",6000.0,4000.0,...,,,,,,,,,,
3,image_100.jpg,SONY,2018:07:14 21:07:22,image_100.jpg,[],"[(""#243a45"", 0.3002), (""#e4d3de"", 0.1578), (""#...",,,,,...,,,,,,,,,,
4,image_101.jpg,NIKON CORPORATION,2018:08:06 23:53:48,image_101.jpg,[],"[(""#100e1d"", 0.4205), (""#2e3248"", 0.192), (""#1...",,,,,...,,,,,,,,,,


# Get user preferences

In [67]:
user = get_user_preferences('Yannis')
df_user = pd.DataFrame(user)
df_user = df_user.transpose()
df_user = df_user.rename_axis('user').reset_index()
# rename the columns : ,ID,fav_color,fav_orientation, fav_height, fav_width, fav_tags
df_user = df_user.rename(
    columns={'user': 'pseudo', 0: 'fav_color', 1: 'fav_orientation', 2: 'fav_height', 3: 'fav_width', 4: 'fav_tags'})
# format fav_tags to remove () and ''
df_user['fav_tags'] = df_user['fav_tags'].str.replace('(', '', regex=False)
df_user['fav_tags'] = df_user['fav_tags'].str.replace(')', '', regex=False)
df_user['fav_tags'] = df_user['fav_tags'].str.replace("'", '', regex=False)
df_user['fav_tags'] = df_user['fav_tags'].str.replace(" ", '', regex=False)
# add the name of the column for the user name (pseudo)
df_user.head()

Unnamed: 0,pseudo,fav_color,fav_orientation,fav_height,fav_width,fav_tags
0,0,#28a46a,Landscape,1900,100,"bird,person,surfboard,person"


# Find the nearest tags to the user preferences

In [10]:
!pip install nltk gensim



In [11]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
import pandas as pd

nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/yannisteissier/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yannisteissier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yannisteissier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yannisteissier/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
# get tags from the dataframe and clean it to get a list of tags
tags = df_metadata['tags']

In [13]:
# get the list of all tags
all_tags = []
for tag in tags:
    try:
        for t in tag:
            if t not in all_tags:
                all_tags.append(t)
    except:
        pass
all_tags

['person',
 'bird',
 'surfboard',
 'backpack',
 'fire hydrant',
 'traffic light',
 'dog',
 'vase',
 'potted plant',
 'boat',
 'horse',
 'bottle',
 'wine glass',
 'car',
 'cat',
 'truck',
 'cup',
 'dining table',
 'bear',
 'frisbee',
 'carrot',
 'kite',
 'bed',
 'giraffe',
 'orange',
 'oven',
 'clock',
 'sheep',
 'umbrella',
 'cow',
 'zebra',
 'snowboard',
 'train',
 'mouse',
 'cell phone',
 'cake',
 'tv',
 'pizza',
 'skateboard',
 'handbag',
 'toilet',
 'chair',
 'broccoli',
 'banana',
 'book',
 'bench',
 'donut',
 'refrigerator',
 'sports ball',
 'apple',
 'spoon',
 'bowl',
 'airplane',
 'couch',
 'elephant',
 'suitcase',
 'tie',
 'parking meter']

In [14]:
# convert the list of tags to a dataframe
df_all_tags = pd.DataFrame(all_tags)
# rename the column
df_all_tags = df_all_tags.rename(columns={0: 'tags'})
df_all_tags.head()

Unnamed: 0,tags
0,person
1,bird
2,surfboard
3,backpack
4,fire hydrant


In [15]:
!pip install spacy



In [16]:
import spacy
# download the model
!python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [17]:
# Define a function to convert a word to its vector representation
def word_to_vector(word):
    return nlp(word).vector


# Apply the function to all words in the dataframe
df_all_tags['vector'] = df_all_tags['tags'].apply(word_to_vector)
df_all_tags.head()

Unnamed: 0,tags,vector
0,person,"[-1.0079, -0.025288, -3.5855, -1.278, 2.7287, ..."
1,bird,"[4.8752, -1.9177, -1.3281, -5.278, 2.2977, -0...."
2,surfboard,"[-2.3111, 6.0281, 0.40919, -0.054451, -1.5307,..."
3,backpack,"[-0.88119, 3.1579, -3.6337, 0.77035, -0.19718,..."
4,fire hydrant,"[-2.10155, 0.19204998, -5.1029, 1.7569599, 3.2..."


In [18]:
from sklearn.metrics.pairwise import cosine_similarity


# Define a function to compute the cosine similarity between two vectors
def cosine_sim(a, b):
    return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

In [19]:
# Example usage
vector1 = word_to_vector("apple")
vector2 = word_to_vector("orange")
similarity = cosine_sim(vector1, vector2)
print(similarity)  # Output: 0.613587

0.6135187


In [20]:
# Define a function to compute the similarity between a word and a list of words
def word_to_list_similarity(word, word_list):
    word_vector = word_to_vector(word)
    word_list_vectors = word_list.apply(word_to_vector)
    similarities = word_list_vectors.apply(lambda x: cosine_sim(word_vector, x))
    return similarities

In [21]:
# for each fav_tags in user_df get the similarity between the word and all words in the dataframe and update the dataframe
user_tags = df_user['fav_tags'][0]
user_tags = user_tags.split(',')
user_tags

['bird', 'person', 'surfboard', 'person']

In [22]:
for tag in user_tags:
    similarities = word_to_list_similarity(tag, df_all_tags['tags'])
    df_all_tags[tag] = similarities

# sort each similarity column in descending order
df_all_tags = df_all_tags.sort_values(by=user_tags, ascending=False)
df_all_tags.head()

Unnamed: 0,tags,vector,bird,person,surfboard
1,bird,"[4.8752, -1.9177, -1.3281, -5.278, 2.2977, -0....",1.0,0.190218,0.213882
30,zebra,"[0.032863, 1.8007, -1.3854, -3.5269, -0.24236,...",0.567889,0.095144,0.192323
23,giraffe,"[-0.84077, 2.6076, -1.6748, -3.81, 0.60447, -0...",0.563594,0.194894,0.236857
54,elephant,"[-0.84077, 2.6076, -1.6748, -3.81, 0.60447, -0...",0.563594,0.194894,0.236857
14,cat,"[3.7032, 4.1982, -5.0002, -11.322, 0.031702, -...",0.536937,0.205691,0.142983


In [23]:

# Get the similarity between the words in user_preferences and all words in the dataframe and update the dataframe
similarities = word_to_list_similarity("water", df_all_tags['tags'])
df_all_tags['similarity'] = similarities
df_all_tags.head()

Unnamed: 0,tags,vector,bird,person,surfboard,similarity
1,bird,"[4.8752, -1.9177, -1.3281, -5.278, 2.2977, -0....",1.0,0.190218,0.213882,0.23175
30,zebra,"[0.032863, 1.8007, -1.3854, -3.5269, -0.24236,...",0.567889,0.095144,0.192323,0.102425
23,giraffe,"[-0.84077, 2.6076, -1.6748, -3.81, 0.60447, -0...",0.563594,0.194894,0.236857,0.160907
54,elephant,"[-0.84077, 2.6076, -1.6748, -3.81, 0.60447, -0...",0.563594,0.194894,0.236857,0.160907
14,cat,"[3.7032, 4.1982, -5.0002, -11.322, 0.031702, -...",0.536937,0.205691,0.142983,0.09277


In [24]:
# sort the dataframe by similarity
df_all_tags = df_all_tags.sort_values(by=['similarity'], ascending=False)
df_all_tags.head()

Unnamed: 0,tags,vector,bird,person,surfboard,similarity
4,fire hydrant,"[-2.10155, 0.19204998, -5.1029, 1.7569599, 3.2...",0.253747,0.18281,0.321466,0.827036
8,potted plant,"[-1.6188099, -2.70715, -5.14225, 3.5575, 4.762...",0.263577,0.247293,0.089527,0.503158
5,traffic light,"[-0.895235, 2.24475, -4.0668, 4.0631, -0.25559...",0.200752,0.33585,0.249778,0.495233
47,refrigerator,"[0.62057, 0.5426, -1.8448, 2.2689, 1.4919, -1....",0.120834,0.156021,0.248682,0.475892
11,bottle,"[-1.0871, -0.41328, -1.353, 2.3119, -0.10389, ...",0.245645,0.19674,0.291589,0.457597


In [94]:
!pip install tensorflow-macos tensorflow-metal keras
import sklearn

print(sklearn.__version__)
!pip install -U scikit-learn

1.2.1
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-macosx_12_0_arm64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.1
    Uninstalling scikit-learn-1.2.1:
      Successfully uninstalled scikit-learn-1.2.1
Successfully installed scikit-learn-1.2.2


In [95]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import ast

In [106]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import ast

metadata = get_metadata_from_sqlite_db()
df_metadata = pd.DataFrame(metadata)
# reverse the columns and rows
df_metadata = df_metadata.transpose()
# add name for the first column
df_metadata = df_metadata.rename_axis('filename').reset_index()

# convert the dataframe to a csv file
df_metadata.to_csv('metadata.csv', index=False)

# Load the dataset
data = pd.read_csv('metadata.csv')

# Remove unnecessary columns
data = data[['filename', 'Make', 'tags', 'dominant_color', 'Width', 'Height']]

# check if there is nan in dominant_color
if data['dominant_color'].isna().sum() > 0:
    # remove the rows with nan in dominant_color
    data = data.dropna(subset=['dominant_color'])

# Split the dominant_color column into three columns
data[['color1', 'color2', 'color3', 'color4']] = pd.DataFrame(
    data['dominant_color'].apply(lambda x: [c[0] for c in ast.literal_eval(x)]).tolist(), index=data.index)

# Convert the tags column to a list of strings
data['tags'] = data['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# One-hot encode the Make column
encoder = OneHotEncoder(sparse=False)
make_encoded = encoder.fit_transform(data[['Make']])
data = pd.concat([data.drop('Make', axis=1),
                  pd.DataFrame(make_encoded, columns=encoder.get_feature_names_out(['Make']), index=data.index)],
                 axis=1)

# Fill missing values in dominant_color column
most_common_color = data['dominant_color'].mode()[0]
data['dominant_color'].fillna(most_common_color, inplace=True)

# Scale selected columns
scaler = StandardScaler()
scaled_columns = ['Width', 'Height']
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])



In [107]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import cv2

# Define CNN architecture
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu')
])

# Compile model
model.compile(optimizer=Adam(lr=0.001), loss='mse')

# Train model on image dataset
model.fit(data, epochs=10)


# Get feature vector for an input image
def get_feature_vector(image_path):
    img = cv2.imread(image_path)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)
    img = img / 255.0
    features = model.predict(img)
    return features.flatten()


# Compute similarity between two images
def get_similarity(image1_path, image2_path):
    features1 = get_feature_vector(image1_path)
    features2 = get_feature_vector(image2_path)
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity


# Make recommendations for an input image
def get_recommendations(image_path, dataset, k=5):
    similarities = []
    for index, row in dataset.iterrows():
        similarity = get_similarity(image_path, row['filename'])
        similarities.append((row['filename'], similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    recommendations = similarities[1:k + 1]
    return recommendations




ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

# content-based filtering

In [214]:
def hex_to_rgb(color):
    try:
        # remove the # from the color
        color = color[1:]
        # convert the color to rgb values
        rgb = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
        return rgb
    except:
        return 0, 0, 0


def get_clean_dataset():
    metadata = get_metadata_from_sqlite_db()
    df_metadata = pd.DataFrame(metadata)
    # reverse the columns and rows
    df_metadata = df_metadata.transpose()
    # add name for the first column
    df_metadata = df_metadata.rename_axis('filename').reset_index()
    # remove the rows with nan in dominant_color
    df_metadata = df_metadata.dropna(subset=['dominant_color'])
    # split dominant color into 4 columns and remove the dominant_color column
    df_metadata[['color1', 'color2', 'color3', 'color4']] = pd.DataFrame(
        df_metadata['dominant_color'].apply(lambda x: [c[0] for c in ast.literal_eval(x)]).tolist(),
        index=df_metadata.index)
    # convert colors to rgb values
    df_metadata['color1'] = df_metadata['color1'].apply(lambda x: hex_to_rgb(x))
    df_metadata['color2'] = df_metadata['color2'].apply(lambda x: hex_to_rgb(x))
    df_metadata['color3'] = df_metadata['color3'].apply(lambda x: hex_to_rgb(x))
    df_metadata['color4'] = df_metadata['color4'].apply(lambda x: hex_to_rgb(x))
    df_metadata = df_metadata.drop('dominant_color', axis=1)
    # convert the tags column to a list of strings
    df_metadata = df_metadata.fillna(0)
    # remove all columns except filename, tags, color1, color2, color3, color4, Make, Width, Height
    df_metadata = df_metadata[['filename', 'tags', 'color1', 'color2', 'color3', 'color4', 'Make', 'Width', 'Height']]
    # replace all 0 values with empty strings
    df_metadata['Make'] = df_metadata['Make'].replace(0, '')

    return df_metadata

In [215]:
get_clean_dataset().head()

Unnamed: 0,filename,tags,color1,color2,color3,color4,Make,Width,Height
0,image_0.jpg,[person],"(21, 23, 14)","(74, 100, 35)","(123, 150, 82)","(45, 54, 24)",Canon,0,0
1,image_1.jpg,[],"(39, 66, 16)","(224, 226, 227)","(101, 127, 79)","(172, 181, 167)",Panasonic,0,0
2,image_10.jpg,[bird],"(217, 217, 217)","(71, 71, 71)","(190, 190, 190)","(206, 206, 206)",SONY,6000,4000
3,image_100.jpg,[],"(36, 58, 69)","(228, 211, 222)","(8, 31, 17)","(76, 104, 141)",SONY,0,0
4,image_101.jpg,[],"(16, 14, 29)","(46, 50, 72)","(29, 30, 47)","(92, 109, 143)",NIKON CORPORATION,0,0


In [216]:
def get_clean_preferences():
    preferences = get_user_preferences('Yannis')
    df_preferences = pd.DataFrame(preferences)
    # reverse the columns and rows
    df_preferences = df_preferences.transpose()
    # 0, is the color, 1 is the orientation, 2 is the height, 3 is the width, 4 is the tags, name the columns
    df_preferences.columns = ['color', 'orientation', 'height', 'width', 'tags']
    # remove the rows with nan in dominant_color
    df_preferences = df_preferences.dropna(subset=['color'])
    # split dominant color into 4 columns and remove the dominant_color column
    df_preferences = df_preferences.drop('orientation', axis=1)
    # convert the tags column to a list of strings
    df_preferences['tags'] = df_preferences['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    # Replace all NaN values with empty strings with the fillna() method
    df_preferences = df_preferences.fillna(0)
    # remove all columns except filename, tags, color1, color2, color3, color4, Make, Width, Height
    df_preferences = df_preferences[['color', 'height', 'width', 'tags']]
    # convert colors to rgb values
    df_preferences['color'] = df_preferences['color'].apply(lambda x: hex_to_rgb(x))
    # replace all 0 values with empty strings
    df_preferences['color'] = df_preferences['color'].replace(0, '')

    return df_preferences

In [217]:
get_clean_preferences().head()

Unnamed: 0,color,height,width,tags
0,"(40, 164, 106)",1900,100,"(bird, person, surfboard, person)"


In [218]:
dataset = get_clean_dataset()
preferences = get_clean_preferences()
# Save dataset as csv file
dataset.to_csv('dataset.csv', index=False)
# Save preferences as csv file
preferences.to_csv('preferences.csv', index=False)

In [337]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def recommend_colors(rgb, dataset_path):
    # Load the dataset into a Pandas DataFrame
    data = pd.read_csv(dataset_path)

    # Extract the individual r, g, and b values from the color columns and create new columns for them
    data[['r1', 'g1', 'b1']] = pd.DataFrame(data['color1'].apply(lambda x: eval(x.strip('()'))).tolist())
    data[['r2', 'g2', 'b2']] = pd.DataFrame(data['color2'].apply(lambda x: eval(x.strip('()'))).tolist())
    data[['r3', 'g3', 'b3']] = pd.DataFrame(data['color3'].apply(lambda x: eval(x.strip('()'))).tolist())
    data[['r4', 'g4', 'b4']] = pd.DataFrame(data['color4'].apply(lambda x: eval(x.strip('()'))).tolist())

    # Normalize the r, g, and b columns to be between 0 and 1
    data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']] = data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']] / 255

    # Normalize the input RGB color to be between 0 and 1
    r, g, b = rgb
    r_norm, g_norm, b_norm = r / 255, g / 255, b / 255

    # Compute the cosine similarity between the input color and all the colors in the dataset
    data['cosine_sim'] = cosine_similarity([[r_norm, g_norm, b_norm, r_norm, g_norm, b_norm, r_norm, g_norm, b_norm, r_norm, g_norm, b_norm]], data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']])[0]

    # Sort the dataset by cosine similarity in descending order and return the top 10 closest matches
    closest_matches = data.sort_values('cosine_sim', ascending=False).head(10)[['filename', 'color1', 'color2', 'color3', 'color4', 'cosine_sim']]

    return closest_matches

rgb = (94, 166, 199)
dataset_path = "dataset.csv"
closest_matches = recommend_colors(rgb, dataset_path)
print(closest_matches)

          filename           color1           color2           color3  \
940  image_946.jpg   (100, 92, 100)   (54, 125, 170)  (101, 147, 170)   
297  image_366.jpg    (40, 98, 135)  (140, 109, 141)  (101, 119, 155)   
171  image_252.jpg  (148, 204, 210)    (9, 183, 198)  (203, 220, 223)   
147  image_230.jpg  (164, 173, 195)  (201, 205, 216)  (141, 152, 177)   
636  image_671.jpg  (189, 184, 229)  (115, 112, 182)  (160, 175, 241)   
503  image_551.jpg  (218, 231, 224)   (98, 200, 184)  (229, 245, 244)   
368   image_43.jpg   (70, 131, 150)  (164, 195, 196)  (138, 138, 122)   
279   image_35.jpg  (137, 164, 166)  (225, 202, 199)  (152, 179, 186)   
882  image_894.jpg  (191, 232, 245)  (201, 170, 162)  (240, 231, 233)   
386  image_446.jpg  (215, 214, 226)  (180, 193, 199)  (244, 239, 241)   

              color4  cosine_sim  
940   (72, 110, 136)    0.978719  
297   (65, 117, 153)    0.976374  
171   (82, 195, 205)    0.970197  
147  (183, 190, 207)    0.969345  
636  (157, 133, 196) 

In [299]:
cosine_sim

array([[1.        , 0.79017373, 0.73772845, ..., 0.70405334, 0.59775274,
        0.5979558 ],
       [0.79017373, 1.        , 0.72152618, ..., 0.7231484 , 0.9082249 ,
        0.92287648],
       [0.73772845, 0.72152618, 1.        , ..., 0.95905934, 0.81252094,
        0.78990573],
       ...,
       [0.70405334, 0.7231484 , 0.95905934, ..., 1.        , 0.81775104,
        0.80504648],
       [0.59775274, 0.9082249 , 0.81252094, ..., 0.81775104, 1.        ,
        0.98435904],
       [0.5979558 , 0.92287648, 0.78990573, ..., 0.80504648, 0.98435904,
        1.        ]])

# CNN version

In [357]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(dataset_path)

# Extract the individual r, g, and b values from the color columns and create new columns for them
data[['r1', 'g1', 'b1']] = pd.DataFrame(data['color1'].apply(lambda x: eval(x.strip('()'))).tolist())
data[['r2', 'g2', 'b2']] = pd.DataFrame(data['color2'].apply(lambda x: eval(x.strip('()'))).tolist())
data[['r3', 'g3', 'b3']] = pd.DataFrame(data['color3'].apply(lambda x: eval(x.strip('()'))).tolist())
data[['r4', 'g4', 'b4']] = pd.DataFrame(data['color4'].apply(lambda x: eval(x.strip('()'))).tolist())

# Normalize the r, g, and b columns to be between 0 and 1
data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']] = data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']] / 255

# Normalize the input RGB color to be between 0 and 1
r, g, b = rgb
r_norm, g_norm, b_norm = r / 255, g / 255, b / 255

# Compute the cosine similarity between the input color and all the colors in the dataset
data['cosine_sim'] = cosine_similarity([[r_norm, g_norm, b_norm, r_norm, g_norm, b_norm, r_norm, g_norm, b_norm, r_norm, g_norm, b_norm]], data[['r1', 'g1', 'b1', 'r2', 'g2', 'b2', 'r3', 'g3', 'b3', 'r4', 'g4', 'b4']])[0]

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(3,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['r1', 'g1', 'b1']], data[['cosine_sim']], test_size=0.2)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Normalize the input RGB color to be between 0 and 1
rgb = (94, 166, 199)
r_norm, g_norm, b_norm = rgb[0] / 255, rgb[1] / 255, rgb[2] / 255

# Predict the cosine similarity between the input color and all the colors in the dataset
cosine_sims = model.predict(data[['r1', 'g1', 'b1']])

# Add the cosine similarity values to the dataset
data['cosine_sim'] = cosine_sims

# Sort the dataset by cosine similarity in descending order and return the top 10 closest matches
closest_matches = data.sort_values('cosine_sim', ascending=False).head(10)[['filename', 'color1', 'color2', 'color3', 'color4', 'cosine_sim']]

print(closest_matches)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
          filename          color1           color2           color3  \
989  image_990.jpg   (238, 125, 6)      (23, 10, 4)   (254, 189, 62)   
880  image_892.jpg  (237, 121, 19)     (48, 15, 12)  (239, 193, 170)   
179   image_26.jpg   (166, 31, 38)  (173, 160, 152)  (116, 108, 103)   
312   image_38.jpg   (138, 13, 10)  (227, 175, 176)     (96, 84, 50)   
655  image_689.jpg   (192, 77, 14)     (24, 12, 11)   (233, 154, 82)   
355  image_418.jpg   (153, 32, 23)   (221, 129, 35)       (47, 7, 8)   
534   image_58.jpg   (142, 10, 50)  (203, 128, 139)     (56, 21, 30)   
962  image_966.jpg   (249, 212, 3)      (89, 36, 4)    (197, 115, 5)   
32   image_127.jpg  (218, 122, 79)    (159, 60, 25)  (247, 179, 146)   
761  image_784.jpg   (195, 93, 66)     (32, 17, 11)     (98, 44, 40)   

              color4  cosine_sim  
989     (146, 58, 2)    0.776133  
880    (148, 38, 18)    0.