**LOADING REQUIRED PACKAGES/LIBRARIES**

In [8]:
# remember to use the below line to install "surprise" module in the anaconda prompt
#conda install -c conda-forge scikit-surprise
import pandas as pd
import numpy as np
# from surprise import KNNBasic
# from surprise import Dataset
# from surprise import Reader
import matplotlib.pyplot as plt
import datetime
import pandas as pd

# **I. Data cleaning and exploration**

## **Load data**

In [9]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
# NOTE(review): the column listing displayed by this cell mixes per-review fields
# (rating_x, review_text, ...) with per-product fields (brand_id, ingredients, ...),
# so this looks like review data already joined with product data -- confirm.
data = pd.read_csv("data/full_data_cleaned.csv")
data.columns  # displayed so readers can see which columns the later cells rely on

Index(['author_id', 'rating_x', 'is_recommended', 'helpfulness',
       'total_feedback_count', 'total_neg_feedback_count',
       'total_pos_feedback_count', 'submission_time', 'review_text',
       'review_title', 'skin_tone', 'eye_color', 'skin_type', 'hair_color',
       'product_id', 'product_name_x', 'brand_name_x', 'price_usd_x',
       'brand_id', 'ingredients', 'limited_edition', 'out_of_stock',
       'sephora_exclusive', 'highlights', 'primary_category', 'num_rating',
       'niche_product'],
      dtype='object')

In [10]:
# Per-product summary: mean review rating plus the niche flag
# (the first value seen for each product_id is kept for the flag).
data1 = (
    data.groupby('product_id')
    .agg(
        rating_x=('rating_x', 'mean'),
        niche_product=('niche_product', 'first'),
    )
    .reset_index()
)
data1.head(2)

Unnamed: 0,product_id,rating_x,niche_product
0,P114902,4.458924,0
1,P12045,4.468599,0


In [11]:
#function to filter data based on user input
def filter_data(price, skin_tone, skin_type, eye_color, hair_color, df=None):
    """Return the rows that match every criterion that is not None.

    Parameters
    ----------
    price : number or None
        Upper bound (inclusive) applied to the 'price_usd_x' column.
    skin_tone, skin_type, eye_color, hair_color : object or None
        Value that the column of the same name must equal.
    df : pandas.DataFrame or None, optional
        Frame to filter. When omitted, the notebook-level `data` frame is used,
        which is what the function always did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. When every criterion is None the input frame itself
        is returned (no copy is made).
    """
    filtered_data = data if df is None else df

    # Price is the only range filter; everything else is an exact match.
    if price is not None:
        filtered_data = filtered_data[filtered_data['price_usd_x'] <= price]

    equality_filters = {
        'skin_tone': skin_tone,
        'skin_type': skin_type,
        'eye_color': eye_color,
        'hair_color': hair_color,
    }
    for column, wanted in equality_filters.items():
        # `is not None` (identity) rather than `!= None`, so that falsy values
        # and objects with custom __ne__ are still treated as real criteria.
        if wanted is not None:
            filtered_data = filtered_data[filtered_data[column] == wanted]
    return filtered_data
# Every criterion is None here, so no filter is applied and `data` is rebound to the
# same, unfiltered frame. Change these arguments to restrict the rows that the cells
# below read from `data`.
data = filter_data(price = None, skin_tone= None, skin_type = None, eye_color = None, hair_color = None)

In [12]:
# Build one row of characteristics per product, keeping the first value
# seen for each product_id in every listed column.
product_characteristics = data.groupby('product_id').agg(
    dict.fromkeys(
        [
            'brand_id', 'limited_edition', 'sephora_exclusive',
            'primary_category', 'ingredients', 'highlights', 'niche_product',
        ],
        'first',
    )
)

# One-hot encode the brand and category identifiers.
product_characteristics = pd.get_dummies(
    product_characteristics,
    columns=['brand_id', 'primary_category'],
    prefix=['brand_id', 'category'],
)

# Cast everything except the two free-text columns to integer.
var = [
    name
    for name in product_characteristics.columns
    if name not in ('ingredients', 'highlights')
]
product_characteristics[var] = product_characteristics[var].astype(int)
#create function to trim the variable
def trim_func(df, col):
    """Normalise the text in ``df[col]`` and return ``df``.

    The column is lower-cased, digits are removed, every non-word character is
    replaced by a space, runs of whitespace are collapsed to a single space and
    leading/trailing whitespace is stripped. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten in place.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` replaced by its cleaned version.
    """
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would leave digits and
    # punctuation untouched.
    df[col] = (
        df[col]
        .str.lower()                           # convert to lower case
        .str.replace(r'\d+', '', regex=True)   # remove digits
        .str.replace(r'\W', ' ', regex=True)   # remove special characters
        .str.replace(r'\s+', ' ', regex=True)  # remove extra spaces
        .str.strip()                           # remove leading and trailing spaces
    )
    return df

# Clean both free-text columns before they are vectorised in the lines that follow.
product_characteristics = trim_func(product_characteristics, 'highlights') #trimming the highlights column
product_characteristics = trim_func(product_characteristics, 'ingredients') #trimming the ingredients column

#function to vectorize the text data
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize_text(df, col, min_df=100, max_df=0.7):
    """Replace a text column with its TF-IDF term columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is not modified.
    col : str
        Name of the text column to vectorise.
    min_df : int or float, optional
        Passed to ``TfidfVectorizer(min_df=...)``. Defaults to 100, the value
        this function previously hard-coded.
    max_df : int or float, optional
        Passed to ``TfidfVectorizer(max_df=...)``. Defaults to 0.7, the value
        this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame with one column per vocabulary term appended (named by the
        term, indexed like ``df``) and the column(s) named ``col`` dropped.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    # TfidfVectorizer raises on NaN documents, so missing text is treated as empty.
    documents = df[col].fillna('')
    tfidf_matrix = vectorizer.fit_transform(documents)
    vectorized_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )
    # Concatenate first and drop afterwards (same order as before), returning a
    # new frame instead of using inplace=True.
    combined = pd.concat([df, vectorized_df], axis=1)
    return combined.drop(columns=col)
product_characteristics = vectorize_text(product_characteristics, 'ingredients') #vectorize the ingredients column
product_characteristics = vectorize_text(product_characteristics, 'highlights') #vectorize the highlights column


product_characteristics.head(2)

Unnamed: 0_level_0,limited_edition,sephora_exclusive,niche_product,brand_id_1063,brand_id_1073,brand_id_1132,brand_id_1254,brand_id_1741,brand_id_2000,brand_id_2082,...,sles,sls,spf,spots,sulfates,texture,uneven,vegan,vitamin,without
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P114902,0,0,0,0,0,0,0,0,0,0,...,0.33523,0.33523,0.0,0.0,0.33523,0.0,0.0,0.0,0.0,0.233399
P12045,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Content-Based Recommendation System

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# Join the engineered product features onto every review row. The three flag columns
# are removed from the review side first; product_characteristics carries columns
# with the same names.
df = data.drop(
    columns=['limited_edition', 'niche_product', 'sephora_exclusive']
).merge(
    product_characteristics,
    left_on='product_id',
    right_index=True,
)


In [14]:
#build recommendation system based on products users have used
def product_characteristics_recommender(product_lst, n_recommendation, df1 = product_characteristics, df2 = df):
    """Recommend products whose features are closest to the ones a user already uses.

    The user's profile is the column-wise mean of the feature rows of their
    products. Every other product in ``df1`` is scored by cosine similarity
    against that profile, and the ``n_recommendation`` most similar ones are kept.

    Parameters
    ----------
    product_lst : sequence
        Product ids the user has used. Ids that are not in ``df1.index`` are
        ignored, provided at least one id is known.
    n_recommendation : int
        Number of most-similar products to keep.
    df1 : pandas.DataFrame
        Numeric product feature matrix. Its index must be named 'product_id',
        because the merge with the notebook-level ``data1`` relies on that name.
    df2 : pandas.DataFrame
        Not used by this function; kept so existing calls keep working.

    Both defaults are bound when this cell is executed, so re-run the cell after
    rebuilding ``product_characteristics`` or ``df``.

    Returns
    -------
    tuple
        ``(pred, product_ids)`` where ``pred`` is the similarity-weighted mean of
        the recommended products' average ratings (NaN when the similarities sum
        to zero) and ``product_ids`` is an array of the recommended ids.

    Raises
    ------
    ValueError
        If ``product_lst`` is empty.
    KeyError
        If none of the ids in ``product_lst`` is present in ``df1``.
    """
    if len(product_lst) == 0:
        raise ValueError("product_lst must contain at least one product id")

    # Keep only ids that exist in the feature matrix: an unknown id would add an
    # all-NaN row to the profile and make the drop() below raise.
    known_products = [pid for pid in product_lst if pid in df1.index]
    if not known_products:
        raise KeyError(
            f"None of the given products are present in the feature table: {list(product_lst)}"
        )

    users_products = df1.loc[known_products] #characteristics of the products used by the user
    users_prof = users_products.mean() #average score of the products used by the user
    non_user_products = df1.drop(known_products, axis = 0) #subset of products not used by the user

    #calculate the similarity between the user's profile and all other products
    user_prof_similarity = cosine_similarity(users_prof.values.reshape(1, -1), non_user_products)
    #convert the similarity to a dataframe, most similar first
    user_prof_similarity_df = pd.DataFrame(
        user_prof_similarity.T,
        index = non_user_products.index,
        columns = ['similarity'],
    ).sort_values(by = 'similarity', ascending = False)

    #keep the top n recommendations and add their average rating
    top_n = user_prof_similarity_df.head(n_recommendation)
    top_n = pd.merge(top_n, data1, on='product_id')
    top_n['weighted_sum'] = top_n['rating_x']*top_n['similarity'] #calculate the weighted sum

    #calculate the predicted rating; guard the division so a zero total
    #similarity gives NaN without a runtime warning
    total_similarity = top_n['similarity'].sum()
    if total_similarity != 0:
        pred = top_n['weighted_sum'].sum()/total_similarity
    else:
        pred = float('nan')
    return pred, top_n['product_id'].unique()

In [15]:
product_characteristics_recommender(['P420652', 'P7880'], 5)

(4.40763312885391,
 array(['P443563', 'P457005', 'P469490', 'P471237', 'P12336'], dtype=object))