In [1]:
import numpy as np
import pandas as pd

In [2]:
# pd.set_option('display.max_columns', None)
df = pd.read_csv('../data/cleaned_review_data.csv', 
                 parse_dates=['SubmissionTime', 'FirstSubmissionTime', 'LastSubmissionTime'], 
                 low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364686 entries, 0 to 1364685
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   pd_id                 1364686 non-null  object        
 1   brand                 1364686 non-null  object        
 2   Name                  1364686 non-null  object        
 3   Description           1364686 non-null  object        
 4   AverageOverallRating  1364686 non-null  float64       
 5   love_count            1364686 non-null  float64       
 6   reviews_count         1364686 non-null  float64       
 7   Price                 1364686 non-null  float64       
 8   category_1            1364686 non-null  object        
 9   category_2            1006831 non-null  object        
 10  category_3            957356 non-null   object        
 11  FirstSubmissionTime   1364686 non-null  datetime64[ns]
 12  LastSubmissionTime    1364686 non-null  da

In [3]:
df_wo_na = df.fillna('No_Values')

In [4]:
df['timestamp'] = df['SubmissionTime'].values.astype(np.int64) // 10 ** 9
ratings = df[['AuthorId', 'pd_id', 'Rating', 'timestamp']]
ratings.sample(5)

Unnamed: 0,AuthorId,pd_id,Rating,timestamp
417554,5620735525,P426309,5,1421040695
294659,1502678847,P186805,5,1430452771
144133,7334390065,P270594,4,1567546326
667585,6437648197,P419848,2,1514224696
779251,7044539557,P457248,3,1591744046


In [5]:
pds = pd.read_csv('../data/cleaned_pd_data.csv', parse_dates=['FirstSubmissionTime', 'LastSubmissionTime'])
pds.drop(columns=['Age_35to44', 'Age_18to24', 'Age_25to34', 'Age_45to54', 
                  'Age_over54', 'Age_13to17', 'category', 'size_oz', 'size_mL', 'size_g'], 
         inplace=True)
pds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2301 entries, 0 to 2300
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   brand                 2301 non-null   object        
 1   pd_id                 2301 non-null   object        
 2   Name                  2301 non-null   object        
 3   Description           2301 non-null   object        
 4   AverageOverallRating  2301 non-null   float64       
 5   FirstSubmissionTime   2301 non-null   datetime64[ns]
 6   LastSubmissionTime    2301 non-null   datetime64[ns]
 7   love_count            2301 non-null   float64       
 8   reviews_count         2301 non-null   float64       
 9   Price                 2301 non-null   float64       
 10  category_1            2301 non-null   object        
 11  category_2            1765 non-null   object        
 12  category_3            1598 non-null   object        
dtypes: datetime64[ns](

In [6]:
value_count = ratings['pd_id'].value_counts()
pds_10 = pds[pds['pd_id'].isin(value_count[value_count>10].index)]
pds_10 = pds_10.reset_index(drop=True)
print(f'There are {len(pds_10)} products for analysis')

There are 1957 products for analysis


**Content-based Methods are referring to this article ["Content-Based Recommendation System"](https://medium.com/@bindhubalu/content-based-recommender-system-4db1b3de03e7)**

# Analysing the Description of Content Only

In [7]:
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, euclidean_distances

## Using only Product Name for TFIDF analysis

In [8]:
def compute_similarity(col):
    """
    Function to compute cosine similarity based on TFIDF.
    Input: 
        col - The text column for TFIDF analysis
    """
    stopwords_list = stopwords.words('english')
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(col)
    cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_similarity

In [9]:
#Function to get the most similar products
def recommend(index, method, name_col=pds_10['Name'], top_n=5):
    """
    Get the pairwise similarity scores of all products compared that product,
    sorting them and getting top 5.
    Input:
        index - Index of the similarity method
        method - The similarity method
        name_col - The column of product name
        top_n - The number of top recommended item
    """
    similarity_scores = list(enumerate(method[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:1+top_n]
    
    # Get the books index
    pd_index = [i[0] for i in similarity_scores]
    
    #Return the top 5 most similar pds
    return name_col.iloc[pd_index]

In [10]:
def gen_recom_df(method):
    """
    Create the recommendation dataframe.
    Input:
        method - The similarity method
    """
    similar_pds = [recommend(i, method).values for i in range(len(pds_10))]
    recom_df = pd.DataFrame(similar_pds)
    recom_df.columns = [f'Recom{i}' for i in range(5)]
    recom_df['Product_Name'] = pds_10['Name']
    col_names = ['Product_Name'] + [f'Recom{i}' for i in range(5)]
    return recom_df[col_names]

In [11]:
# comping cosine similarity matrix using linear_kernal of sklearn
cosine_similarity = compute_similarity(pds_10['Name'])
recom_df_1 = gen_recom_df(cosine_similarity)
recom_df_1.sample(5)

Unnamed: 0,Product_Name,Recom0,Recom1,Recom2,Recom3,Recom4
495,Snowflower Illuminating Face Oil,Wonder Oil Illuminating Self-Tan Oil Light/Medium,THE FACE Illuminating Self-Tan Drops Light/Medium,Noni Glow Face Oil,Face Cream,Face Mask
1647,Photo Finish Foundation Primer Mini,Photo Finish Foundation Primer,Photo Finish Pore Minimizing Primer,Photo Finish Oil & Shine Control Primer,Photo Finish Reduce Redness Primer,Natural Finish Foundation
494,Coco Bliss Intensive Repair,Vanille Coco,Hair Repair,COCO NOIR,Coco Figue,Moisture Repair Shampoo
781,Intense Therapy Lip Balm SPF 25,Sugar Advanced Therapy Treatment Lip Balm,Sugar Lip Balm Sunscreen SPF 15,Intense Hydrating Mask,Lash Serum Intense,Silk Balm Hydrating and Nourishing Lip Balm Blush
1749,SLEEP OIL Rejuvenating Miracle Tanning Oil,Miracle Facial Oil,25 Miracle Nourishing Oil,Sleep Mist,Argan Liquid Gold Self-Tanning Oil,Rejuvenating Night Cream


## Using only Product Description for TFIDF analysis - cosine similarity

In [12]:
cosine_sim_des = compute_similarity(pds_10['Description'])
recom_df_2 = gen_recom_df(cosine_sim_des)
recom_df_2.sample(5)

Unnamed: 0,Product_Name,Recom0,Recom1,Recom2,Recom3,Recom4
1888,Magic Dancing Roses,Magic Liquid Diamonds,Flowerbomb Travel Spray,Flowerbomb Mini,Eau des Merveilles,Eau des Merveilles Bleue Eau de Toilette
226,Omnia Crystalline,Omnia Crystalline Travel Spray,Omnia Coral,Irrésistible Eau de Parfum,for her Fleur Musc,Gucci Flora Emerald Gardenia Eau de Toilette
933,Gentle Cleanser for Sensitive Skin,Noni Glow Face Oil,Noni Glow Face Oil Mini,Gentle Foaming Face Wash,Noni Radiant Eye Oil,Gel Balm Cleanser
149,Squalane + Probiotic Gel Moisturizer,Squalane + Omega Repair Cream,Jet Lag Mask,15% Niacinamide Gel Serum,Instant Foaming Cleanser,Total Cover Cream Foundation
1444,Kiss Lip Plumping System,Dior Lip Glow,Hydrating Lip Peel,The Pout Sparkling Rosé Hyaluronic Acid Collag...,Essential Lip Enhancer Balm,Lip Fetish Lip Balm Love Supreme


# Using Rated Content to Recommend

## Item profile

In [13]:
# Here we combine all the categories including subcategories, which may cause some category name similar. 
# But it should be fine as the similar categories are not so many.
cat_col_names = ['pd_id', 'category']
category_1 = pds[['pd_id', 'category_1']].dropna()
category_2 = pds[['pd_id', 'category_2']].dropna()
category_3 = pds[['pd_id', 'category_3']].dropna()

category_1.columns = cat_col_names
category_2.columns = cat_col_names
category_3.columns = cat_col_names

categories = pd.concat([category_1, category_2, category_3])
categories['category'] = categories['category'].str.lstrip()
assert len(categories) == len(category_1) + len(category_2) + len(category_3)

In [14]:
# one-hot encoding for category. len(f_genre) = 5000+
df_genre = pd.get_dummies(categories['category'])

# One item per row. Combine categories for the same product. len(f_genre) = 3000+
df_genre = pd.concat([categories, df_genre], axis=1).groupby('pd_id').sum()

#normalized
df_genre_normalized = df_genre.apply(lambda x: x/np.sqrt(df_genre.sum(axis=1))).sort_index()

df_genre_normalized

Unnamed: 0_level_0,Accessories,Aftershave,Anti-Aging,BB & CC Cream,BB & CC Creams,Bath & Body,Bath & Shower,Bath Soaks & Bubble Bath,Beauty Supplements,Best For,...,Teeth Whitening,Tinted Moisturizer,Toners,Tools & Brushes,Treatments,Tweezers & Eyebrow Tools,Value & Gift Sets,Value Sets,Wellness,Women
pd_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0847,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735
P107319,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P109908,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P109911,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P109936,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P94812,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P95040,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P9864,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
P9889,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000


## User profile

In [15]:
ratings = ratings.copy()
ratings['normalized_rating'] = (ratings['Rating'] - ratings['Rating'].mean()) / ratings['Rating'].std()

In [16]:
# To reduce work, we select users that have more than 5 ratings for analysis
value_count = ratings['AuthorId'].value_counts()
user_ratings_5 = ratings[ratings['AuthorId'].isin(value_count[value_count>5].index)]
user_ratings_5 = user_ratings_5.reset_index(drop=True)
user_ratings_5

Unnamed: 0,AuthorId,pd_id,Rating,timestamp,normalized_rating
0,7502714901,P307801,5,1495589159,0.612646
1,2028423302,P307801,5,1528047608,0.612646
2,995588930,P307801,5,1399187727,0.612646
3,2051869659,P307801,5,1576683581,0.612646
4,7790289612,P307801,5,1453916388,0.612646
...,...,...,...,...,...
313733,2460714319,P437988,5,1566308731,0.612646
313734,12073700896,P437988,4,1566001093,-0.240100
313735,12636369512,P437988,3,1566241276,-1.092845
313736,12675682972,P437988,5,1569343964,0.612646


In [17]:
pivot_rating = pd.pivot_table(user_ratings_5, 
                              values='normalized_rating', 
                              index=['pd_id'], 
                              columns = ['AuthorId']).sort_index()
pivot_rating

AuthorId,10000117144,1000016925,10000770719,10000892274,10001355168,1000145922,10001768414,10001961830,1000235057,1000296322,...,9995095787,9995433396,9995780065,9995810397,9996967068,999711822,dummyUser,orderGen3046665,orderGen309293,orderGen5563740
pd_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0847,,,,,,,,,,,...,,,,,,,,,,
P107319,,,,,,,,,,,...,,,,,,,,,,
P109908,,,,,,,,,,,...,,,,,,,,,,
P109911,,,,,,,,,,,...,,,,,,,,,,
P109936,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P94812,,,,,,,,,,,...,,,,,,,,,,
P95040,,,,,,,,,,,...,,,,,,,,,,
P9864,,,,,,,,,,,...,,,,,,,,,,
P9889,,,,,,,,,0.612646,,...,-0.2401,-1.945591,,,,,,,,


In [18]:
from tqdm import tqdm

# cat_values is a 143*2198 matrix, showing the category proportion for each item.
cat_values = df_genre_normalized[df_genre_normalized.index.isin(pivot_rating.index)].T.values

# get all the user number
users = user_ratings_5['AuthorId'].unique()

cat_ratings = []

for i in tqdm(range(len(users))):
    user_name = users[i]
    
    # user_item_rating is a 2198*1 matrix, showing the item ratings of a specific user
    user_item_rating = pivot_rating[user_name].fillna(0).values
    
    # cat_rating is a 143*1 matrix, showing the category proportion of a specific user
    cat_rating = cat_values @ user_item_rating
    
    cat_ratings.append(cat_rating)

100%|██████████| 33840/33840 [00:06<00:00, 4893.17it/s]


In [19]:
user_profile = pd.DataFrame(cat_ratings, columns=df_genre_normalized.columns, index=users)
user_profile

Unnamed: 0,Accessories,Aftershave,Anti-Aging,BB & CC Cream,BB & CC Creams,Bath & Body,Bath & Shower,Bath Soaks & Bubble Bath,Beauty Supplements,Best For,...,Teeth Whitening,Tinted Moisturizer,Toners,Tools & Brushes,Treatments,Tweezers & Eyebrow Tools,Value & Gift Sets,Value Sets,Wellness,Women
7502714901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.061134
2028423302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.215090
995588930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.353711
2051869659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.353711,0.0,0.0,0.0,0.0,0.353711
7790289612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.707422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2831506876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.353711
5774276284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-0.138622
1708131808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
1595037399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000


## IDF

The dot product of item vectors and IDF vectors gives us the weighted scores of each item

In [20]:
idf = 1/df_genre_normalized.sum()
idf_df_item = df_genre_normalized.mul(idf)

In [21]:
idf_df_item = df_genre_normalized.mul(idf)

## Make Prediction

In [22]:
df_predict = pd.DataFrame()
#user predict by tfidf
for i in tqdm(range(len(users))):
    working_df = idf_df_item.mul(user_profile.iloc[i], axis=1)
    df_predict[users[i]] = working_df.sum(axis=1)
df_predict

100%|██████████| 33840/33840 [25:00<00:00, 22.55it/s]


Unnamed: 0_level_0,7502714901,2028423302,995588930,2051869659,7790289612,1162190518,1080404053,5828699565,2614510141,21303248061,...,5808067347,1262025985,7707295856,8280086880,5850718026,2831506876,5774276284,1708131808,1595037399,985899777
pd_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0847,0.013144,0.001514,0.004381,0.004381,0.011839,-0.001717,-0.023708,0.006117,0.004381,-0.001717,...,0.005507,0.000000,0.007650,-0.002158,-0.002158,0.005507,-0.002158,0.000000,0.000650,0.019777
P107319,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031034,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
P109908,0.000000,-0.080809,-0.001048,0.000000,0.000000,0.001140,0.000000,0.001406,0.023929,0.002873,...,0.000000,0.000891,0.000000,0.000000,0.000000,0.000000,0.000000,0.000891,0.000000,0.000000
P109911,0.000000,-0.004850,-0.001815,0.000000,0.000000,0.001975,0.000000,0.002434,0.003325,0.000891,...,0.000000,0.001543,0.000000,0.000000,0.000000,0.000000,0.000000,0.001543,0.000000,0.000000
P109936,0.000000,-0.010703,-0.001048,0.000000,0.000000,0.001140,0.000000,0.001406,0.004278,0.002873,...,0.000000,0.000891,0.000000,0.000000,0.000000,0.000000,0.000000,0.000891,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P94812,0.000733,-0.001805,0.004485,0.013994,0.000000,0.000367,0.000000,0.000367,0.000635,0.013359,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
P95040,0.000733,-0.001805,0.000118,0.021447,0.000000,0.000367,0.000000,0.000367,0.000635,0.020812,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
P9864,0.003378,-0.001307,0.001126,0.001126,0.007581,-0.000441,-0.010410,0.000202,0.001126,-0.000441,...,0.003077,0.000000,0.004203,-0.001206,-0.001206,0.003077,-0.001206,0.000000,0.001126,0.008406
P9889,0.000000,-0.010703,-0.001048,0.000000,0.000000,0.001140,0.000000,0.001406,0.004278,0.002873,...,0.000000,0.000891,0.000000,0.000000,0.000000,0.000000,0.000000,0.000891,0.000000,0.000000


In [23]:
def recommender(user_no, top_n=5):
    return pd.merge(df_predict[user_no].sort_values(ascending=False).reset_index(), 
                    pds, 
                    on='pd_id').iloc[:top_n]

In [24]:
recommender('2028423302')

Unnamed: 0,pd_id,2028423302,brand,Name,Description,AverageOverallRating,FirstSubmissionTime,LastSubmissionTime,love_count,reviews_count,Price,category_1,category_2,category_3
0,P424554,0.022146,calvin-klein,Obsessed for Him,Fragrance Family: Warm & SpicyScent Type: Cool...,4.5,2018-07-16 20:46:36,2019-08-11 14:48:37,512.0,2.0,66.0,Fragrance,Men,Cologne
1,P169301,0.022146,versace,Versace Man Eau Fraiche,Fragrance Family: Fresh Scent Type: Citrus & W...,4.8377,2008-09-11 04:31:23,2020-07-01 04:21:30,7660.0,302.0,152.0,Fragrance,Men,Cologne
2,P158521,0.022146,john-varvatos,Vintage,"This sensual, masculine fragrance is aromatic,...",4.7842,2008-09-12 07:00:18,2020-01-19 03:53:43,2170.0,139.0,72.0,Fragrance,Men,Cologne
3,P408252,0.022146,ralph-lauren,Polo Blue Eau de Parfum,Polo Blue evokes a feeling of the freedom of t...,4.7113,2016-07-20 02:17:00,2020-03-12 13:45:57,1373.0,613.0,77.0,Fragrance,Men,Cologne
4,P406716,0.022146,john-varvatos,Artisan Blu,John Varvatos Artisan Blu is the ultimate embo...,5.0,2016-10-05 18:46:07,2018-06-20 20:19:26,910.0,12.0,72.0,Fragrance,Men,Cologne
