In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a simple dataset 

The dataset has only 4 rows to keep the example simple and easy to follow. Each model has 3 features (Colour, Shape and FaceShape), but we will only work with the colour and shape features.

In [2]:
# Toy catalogue: one row per model, one column per attribute.
df_shop = pd.DataFrame({
    'Name': ['Model 1', 'Model 2', 'Model 3', 'Model 4'],
    'Colour': ['red', 'blue', 'black', 'red'],
    'Shape': ['round', 'round', 'cat', 'oval'],
    'FaceShape': ['oblong', 'round', 'diamond', 'triangle'],
})
df_shop

Unnamed: 0,Name,Colour,Shape,FaceShape
0,Model 1,red,round,oblong
1,Model 2,blue,round,round
2,Model 3,black,cat,diamond
3,Model 4,red,oval,triangle


In [3]:
# Pull the two features we work with out of the catalogue as separate Series.
df_colour, df_shape = df_shop['Colour'], df_shop['Shape']
print(df_colour, df_shape, sep='\n')

0      red
1     blue
2    black
3      red
Name: Colour, dtype: object
0    round
1    round
2      cat
3     oval
Name: Shape, dtype: object


In [4]:
# TF-IDF weights a term by how often it occurs in a document, discounted by how
# many documents contain that term.
# TfidfVectorizer builds the TF-IDF matrix in a couple of lines.
# stop_words='english' drops common English words such as 'the' and 'a';
# our feature values do not contain any, so nothing is removed here.
tfid = TfidfVectorizer(stop_words='english')
# The same vectorizer object is refitted for each column (colour first, then shape),
# so afterwards `tfid` holds the vocabulary of the shape column only.
df_colour_matrix, df_shape_matrix = [
    tfid.fit_transform(column) for column in (df_colour, df_shape)
]

In [5]:
# Cosine similarity gives a numeric score for how alike two models are,
# computed for every model against every other model, one matrix per feature.
# A score of 1 means the two models have exactly the same feature value.
# E.g. row 0 is Model 1 (red, round): its colour row is 1 at indices 0 and 3
# because Model 1 and Model 4 are both red.
cosine_similarities_colour = cosine_similarity(df_colour_matrix)
cosine_similarities_shape = cosine_similarity(df_shape_matrix)
print(cosine_similarities_colour, cosine_similarities_shape, sep='\n\n', end='\n\n')

# linear_kernel is a plain dot product. It produces the same matrices here (compare
# the printed output) because TfidfVectorizer L2-normalises each row by default,
# and it skips the extra normalisation step, so we use it from now on.
cosine_similarities_colour_lk = linear_kernel(df_colour_matrix, df_colour_matrix)
cosine_similarities_shape_lk = linear_kernel(df_shape_matrix, df_shape_matrix)
print(cosine_similarities_colour_lk, cosine_similarities_shape_lk, sep='\n\n')

[[1. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 1.]]

[[1. 1. 0. 0.]
 [1. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

[[1. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 1.]]

[[1. 1. 0. 0.]
 [1. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [6]:
# Drop the cosine_similarity results; only the linear_kernel matrices are used below.
del cosine_similarities_colour, cosine_similarities_shape

In [7]:
def _rank_models_by_similarity(similarity_matrix, feature_values, model_names):
    '''Build a lookup from feature value to models ranked by similarity.

    similarity_matrix: square array where entry [i][j] is the similarity between
        row i and row j of the catalogue.
    feature_values: pandas Series with the feature value (e.g. colour) of each row.
    model_names: pandas Series with the model name of each row.

    Returns a dictionary {feature value: [(similarity score, model name), ...]} with
    each list ordered from the highest to the lowest score. Rows that share a feature
    value overwrite each other, so the entry of the last such row is kept.
    '''
    ranked_by_feature = {}
    for row_pos, row_scores in enumerate(similarity_matrix):
        # argsort returns the indices in ascending score order, so reverse them
        # with [::-1] to get the most similar models first.
        ranked_positions = row_scores.argsort()[::-1]
        # argsort yields positions, not index labels, so use .iloc for the names;
        # this keeps working when the DataFrame index is not 0..n-1.
        ranked_by_feature[feature_values.iloc[row_pos]] = [
            (row_scores[col_pos], model_names.iloc[col_pos])
            for col_pos in ranked_positions
        ]
    return ranked_by_feature


similarities_colour = _rank_models_by_similarity(
    cosine_similarities_colour_lk, df_shop['Colour'], df_shop['Name'])

#Do the same for the shape
similarities_shape = _rank_models_by_similarity(
    cosine_similarities_shape_lk, df_shop['Shape'], df_shop['Name'])

In [8]:
# Each dictionary maps a feature value to a list of
# (similarity score, model name) tuples, most similar first.
print(similarities_colour, similarities_shape, sep='\n\n')

{'red': [(1.0, 'Model 4'), (1.0, 'Model 1'), (0.0, 'Model 3'), (0.0, 'Model 2')], 'blue': [(1.0, 'Model 2'), (0.0, 'Model 4'), (0.0, 'Model 3'), (0.0, 'Model 1')], 'black': [(1.0, 'Model 3'), (0.0, 'Model 4'), (0.0, 'Model 2'), (0.0, 'Model 1')]}

{'round': [(1.0, 'Model 2'), (1.0, 'Model 1'), (0.0, 'Model 4'), (0.0, 'Model 3')], 'cat': [(1.0, 'Model 3'), (0.0, 'Model 4'), (0.0, 'Model 2'), (0.0, 'Model 1')], 'oval': [(1.0, 'Model 4'), (0.0, 'Model 3'), (0.0, 'Model 2'), (0.0, 'Model 1')]}


In [14]:
class ContentBasedRecommender:
    '''Recommend models by blending colour and shape similarity scores.

    colour_dic and shape_dic are the dictionaries built in the previous cells: each
    maps a feature value (e.g. 'red' or 'round') to a list of
    (similarity score, model name) tuples ordered from most to least similar.
    '''
    def __init__(self, colour_dic,shape_dic):
        self.colour_dic = colour_dic
        self.shape_dic = shape_dic

    def combine_models(self,recom_model_colour_list,recom_model_shape_list, colour_weight=0.5):
        '''Merge the colour ranking and the shape ranking into one ranking.

        Only models that appear in both lists are kept. The combined score is
        colour_score * colour_weight + shape_score * (1 - colour_weight); the default
        of 0.5 gives both features equal importance.

        Returns a list of [combined score, model name] lists sorted in descending
        order (ties are broken by model name, also descending).
        '''
        shape_weight = 1 - colour_weight
        # Index the shape scores by model name once instead of rescanning the
        # whole shape list for every colour entry.
        shape_scores_by_model = {}
        for shape_entry in recom_model_shape_list:
            shape_scores_by_model.setdefault(shape_entry[1], []).append(shape_entry[0])

        combined_reco_model_list = [
            [colour_entry[0] * colour_weight + shape_score * shape_weight, colour_entry[1]]
            for colour_entry in recom_model_colour_list
            for shape_score in shape_scores_by_model.get(colour_entry[1], ())
        ]
        #return sorted cause after doing manipulation it might not be sorted
        return sorted(combined_reco_model_list, reverse=True)

    def print_message(self,model_list,feature_list,number_model):
        '''Print the top recommendations, one numbered entry per model.

        If fewer than number_model models are available, only the available ones are
        printed and the header reports that smaller count.
        '''
        # Never index past the end of model_list (and ignore negative requests).
        shown = max(0, min(number_model, len(model_list)))
        print(f'The top {shown} recommended models for {feature_list[0]} and {feature_list[1]} are:')
        for rank, entry in enumerate(model_list[:shown], start=1):
            print(f"Number {rank}:")
            print(f"{entry[1]} with {round(entry[0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation, shop_num):
        '''Print the models that best match a requested colour and shape.

        recommendation: dictionary with the keys
            'feature'      - comma separated colour and shape, e.g. 'red,round'
                             (either order; spaces around the values are ignored)
            'number_model' - how many models to print
        shop_num: how many entries of each ranked list are considered before the two
            lists are combined. A value smaller than the catalogue size can drop
            models, because only models present in both truncated lists are kept.

        Prints a message and returns None when the request cannot be served.
        '''
        # Get feature to find recommendations for
        feature_list = [part.strip() for part in recommendation['feature'].split(',')]
        # Get number of model to recommend
        number_model = recommendation['number_model']
        if len(feature_list) < 2:
            print("Please provide one colour and one shape separated by a comma, e.g. 'red,round'")
            return
        #check if the feature is in the table/ is valid
        for candidate in feature_list:
            if candidate not in self.colour_dic and candidate not in self.shape_dic:
                print(f"This feature {candidate} doesn't exist in the shop, please try another feature")
                return
        # Work out which of the first two values is the colour and which is the
        # shape, so that 'round,red' works as well as 'red,round'.
        first, second = feature_list[0], feature_list[1]
        if first in self.colour_dic and second in self.shape_dic:
            colour, shape = first, second
        elif second in self.colour_dic and first in self.shape_dic:
            colour, shape = second, first
        else:
            print(f"Please provide one colour and one shape, got {first} and {second}")
            return
        # Get the number of model most similars from matrix similarities aka the dictionary
        recom_model_colour_list = self.colour_dic[colour][:shop_num]
        recom_model_shape_list = self.shape_dic[shape][:shop_num]
        model_list = self.combine_models(recom_model_colour_list, recom_model_shape_list)
        self.print_message(model_list, [colour, shape], number_model)

In [16]:
# The user's request: e.g. past data shows this user likes red, round models,
# and four recommendations are wanted.
recommendation = dict(feature='red,round', number_model=4)

# Build the recommender from the two similarity dictionaries and run it,
# considering every model in the shop.
recommedations_combined = ContentBasedRecommender(
    colour_dic=similarities_colour,
    shape_dic=similarities_shape,
)
recommedations_combined.recommend(recommendation, shop_num=df_shop.shape[0])



The top 4 recommended models for red and round are:
Number 1:
Model 1 with 1.0 similarity score
--------------------
Number 2:
Model 4 with 0.5 similarity score
--------------------
Number 3:
Model 2 with 0.5 similarity score
--------------------
Number 4:
Model 3 with 0.0 similarity score
--------------------


In [17]:
# Sanity check of the 'red,round' recommendation printed above.
# The triple-quoted string below is only a note (an unused expression); the
# catalogue is displayed afterwards so each score can be checked by eye.
''' The ans is correct for the 4 recommended Model for red and round:
model 1 with 1.0 --> as model 1 is both red and round
model 4 with 0.5 --> Colour is red but model is not round
model 2 with 0.5 --> The shape is round but colour is not red
model 3 with 0   --> 0 cause there isn't any similarity at all
'''
df_shop

Unnamed: 0,Name,Colour,Shape,FaceShape
0,Model 1,red,round,oblong
1,Model 2,blue,round,round
2,Model 3,black,cat,diamond
3,Model 4,red,oval,triangle


# All code in one cell

In [18]:
# Basic Libraries
import numpy as np
import pandas as pd
from typing import List, Dict
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Toy catalogue: one row per model, one column per attribute.
df_shop = pd.DataFrame({
    'Name': ['Model 1', 'Model 2', 'Model 3', 'Model 4'],
    'Colour': ['red', 'blue', 'black', 'red'],
    'Shape': ['round', 'round', 'cat', 'oval'],
    'FaceShape': ['oblong', 'round', 'diamond', 'triangle'],
})

# The two features used for the recommendation, each as its own Series.
df_colour, df_shape = df_shop['Colour'], df_shop['Shape']

# TF-IDF weights a term by how often it occurs in a document, discounted by how
# many documents contain that term.
# TfidfVectorizer builds the TF-IDF matrix in a couple of lines;
# stop_words='english' drops common English words such as 'the' and 'a'.
tfid = TfidfVectorizer(stop_words='english')
# The same vectorizer object is refitted for each column (colour first, then shape).
df_colour_matrix, df_shape_matrix = [
    tfid.fit_transform(column) for column in (df_colour, df_shape)
]


# linear_kernel is a plain dot product; on these TF-IDF matrices it is used in
# place of the cosine similarity function because it is faster.
cosine_similarities_colour_lk, cosine_similarities_shape_lk = [
    linear_kernel(matrix, matrix) for matrix in (df_colour_matrix, df_shape_matrix)
]

def _rank_models_by_similarity(similarity_matrix, feature_values, model_names):
    '''Build a lookup from feature value to models ranked by similarity.

    similarity_matrix: square array where entry [i][j] is the similarity between
        row i and row j of the catalogue.
    feature_values: pandas Series with the feature value (e.g. colour) of each row.
    model_names: pandas Series with the model name of each row.

    Returns a dictionary {feature value: [(similarity score, model name), ...]} with
    each list ordered from the highest to the lowest score. Rows that share a feature
    value overwrite each other, so the entry of the last such row is kept.
    '''
    ranked_by_feature = {}
    for row_pos, row_scores in enumerate(similarity_matrix):
        # argsort returns the indices in ascending score order, so reverse them
        # with [::-1] to get the most similar models first.
        ranked_positions = row_scores.argsort()[::-1]
        # argsort yields positions, not index labels, so use .iloc for the names;
        # this keeps working when the DataFrame index is not 0..n-1.
        ranked_by_feature[feature_values.iloc[row_pos]] = [
            (row_scores[col_pos], model_names.iloc[col_pos])
            for col_pos in ranked_positions
        ]
    return ranked_by_feature


# Colour value -> models ranked by colour similarity.
similarities_colour = _rank_models_by_similarity(
    cosine_similarities_colour_lk, df_shop['Colour'], df_shop['Name'])

# Shape value -> models ranked by shape similarity.
similarities_shape = _rank_models_by_similarity(
    cosine_similarities_shape_lk, df_shop['Shape'], df_shop['Name'])
    
    
    
class ContentBasedRecommender:
    '''Recommend models by blending colour and shape similarity scores.

    colour_dic and shape_dic each map a feature value (e.g. 'red' or 'round') to a
    list of (similarity score, model name) tuples ordered from most to least similar.
    '''
    def __init__(self, colour_dic,shape_dic):
        self.colour_dic = colour_dic
        self.shape_dic = shape_dic

    def combine_models(self,recom_model_colour_list,recom_model_shape_list, colour_weight=0.5):
        '''Merge the colour ranking and the shape ranking into one ranking.

        Only models that appear in both lists are kept. The combined score is
        colour_score * colour_weight + shape_score * (1 - colour_weight); the default
        of 0.5 gives both features equal importance.

        Returns a list of [combined score, model name] lists sorted in descending
        order (ties are broken by model name, also descending).
        '''
        shape_weight = 1 - colour_weight
        # Index the shape scores by model name once instead of rescanning the
        # whole shape list for every colour entry.
        shape_scores_by_model = {}
        for shape_entry in recom_model_shape_list:
            shape_scores_by_model.setdefault(shape_entry[1], []).append(shape_entry[0])

        combined_reco_model_list = [
            [colour_entry[0] * colour_weight + shape_score * shape_weight, colour_entry[1]]
            for colour_entry in recom_model_colour_list
            for shape_score in shape_scores_by_model.get(colour_entry[1], ())
        ]
        #return sorted cause after doing manipulation it might not be sorted
        return sorted(combined_reco_model_list, reverse=True)

    def print_message(self,model_list,feature_list,number_model):
        '''Print the top recommendations, one numbered entry per model.

        If fewer than number_model models are available, only the available ones are
        printed and the header reports that smaller count.
        '''
        # Never index past the end of model_list (and ignore negative requests).
        shown = max(0, min(number_model, len(model_list)))
        print(f'The top {shown} recommended model for {feature_list[0]} and {feature_list[1]} are:')
        for rank, entry in enumerate(model_list[:shown], start=1):
            print(f"Number {rank}:")
            print(f"{entry[1]} with {round(entry[0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation, shop_num):
        '''Print the models that best match a requested colour and shape.

        recommendation: dictionary with the keys
            'feature'      - comma separated colour and shape, e.g. 'red,round'
                             (either order; spaces around the values are ignored)
            'number_model' - how many models to print
        shop_num: how many entries of each ranked list are considered before the two
            lists are combined. A value smaller than the catalogue size can drop
            models, because only models present in both truncated lists are kept.

        Prints a message and returns None when the request cannot be served.
        '''
        # Get feature to find recommendations for
        feature_list = [part.strip() for part in recommendation['feature'].split(',')]
        # Get number of model to recommend
        number_model = recommendation['number_model']
        if len(feature_list) < 2:
            print("Please provide one colour and one shape separated by a comma, e.g. 'red,round'")
            return
        #check if the feature is in the table/ is valid
        for candidate in feature_list:
            if candidate not in self.colour_dic and candidate not in self.shape_dic:
                print(f"This feature {candidate} doesn't exist in the shop, please try another feature")
                return
        # Work out which of the first two values is the colour and which is the
        # shape, so that 'round,red' works as well as 'red,round'.
        first, second = feature_list[0], feature_list[1]
        if first in self.colour_dic and second in self.shape_dic:
            colour, shape = first, second
        elif second in self.colour_dic and first in self.shape_dic:
            colour, shape = second, first
        else:
            print(f"Please provide one colour and one shape, got {first} and {second}")
            return
        # Get the number of model most similars from matrix similarities
        recom_model_colour_list = self.colour_dic[colour][:shop_num]
        recom_model_shape_list = self.shape_dic[shape][:shop_num]
        model_list = self.combine_models(recom_model_colour_list, recom_model_shape_list)
        self.print_message(model_list, [colour, shape], number_model)
        
        
# Enter the features / past data you want recommendations for here.
recommendation = dict(feature='black,round', number_model=4)

# Build the recommender from the two similarity dictionaries and run it,
# considering every model in the shop.
recommedations_combined = ContentBasedRecommender(
    colour_dic=similarities_colour,
    shape_dic=similarities_shape,
)
recommedations_combined.recommend(recommendation, shop_num=df_shop.shape[0])

The top 4 recommended model for black and round are:
Number 1:
Model 3 with 0.5 similarity score
--------------------
Number 2:
Model 2 with 0.5 similarity score
--------------------
Number 3:
Model 1 with 0.5 similarity score
--------------------
Number 4:
Model 4 with 0.0 similarity score
--------------------


In [19]:
# Display the catalogue so the 'black,round' recommendation above can be checked by eye.
df_shop

Unnamed: 0,Name,Colour,Shape,FaceShape
0,Model 1,red,round,oblong
1,Model 2,blue,round,round
2,Model 3,black,cat,diamond
3,Model 4,red,oval,triangle
