# Capstone: Recommender System
## 5. Aggregation and Recommendation

In [279]:
import pandas as pd
from html.parser import HTMLParser
import seaborn as sns
import numpy as np

In [2]:
# Load the product table, the content-based and collaborative similarity
# matrices, and the repurchase table (same files, same order as before).
df, content, collab, repurchase = map(
    pd.read_csv,
    (
        '../datasets/product_info_clean.csv',
        '../datasets/content_sim.csv',
        '../datasets/collaborative_sim.csv',
        '../datasets/repurchase.csv',
    ),
)

In [7]:
# Remove the leftover saved-index column from each frame: 'Unnamed: 0' for
# df / content / repurchase, and 'id' for collab.
# `columns=` already selects the column axis, so no extra `axis` argument is needed.
df.drop(columns='Unnamed: 0', inplace=True)
collab.drop(columns='id', inplace=True)
content.drop(columns='Unnamed: 0', inplace=True)
repurchase.drop(columns='Unnamed: 0', inplace=True)


In [10]:
# Number of products in each category.
df['Category'].value_counts()

moisturizer-skincare         95
facial-toner-skin-toner      93
face-wash-facial-cleanser    93
Name: Category, dtype: int64

In [292]:
# Preview only the first rows of the collaborative similarity matrix instead
# of rendering the whole frame.
collab.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,280
0,1.000000,0.007424,0.014467,0.212735,0.028928,0.001812,0.008993,0.038045,-0.013437,0.016253,...,0.023221,0.012636,0.108990,-0.007754,0.261532,0.235143,-0.032235,0.081715,0.198258,-0.028750
1,0.007424,1.000000,0.059456,0.150035,-0.013574,0.034248,0.020623,0.033282,0.815632,0.032876,...,-0.008224,0.315635,0.091877,0.357517,-0.008500,0.009846,-0.022590,0.375153,-0.016026,0.495060
2,0.014467,0.059456,1.000000,0.074073,0.009476,-0.003730,0.019439,0.063996,0.171413,0.025225,...,-0.012548,0.389529,0.001757,0.000118,-0.000867,0.185602,-0.003201,0.354348,-0.021057,0.269540
3,0.212735,0.150035,0.074073,1.000000,0.121596,0.370508,0.219642,0.133643,0.417675,0.004828,...,0.039302,0.383658,0.222548,0.052362,0.153487,0.162229,0.321975,0.297825,0.304418,0.296123
4,0.028928,-0.013574,0.009476,0.121596,1.000000,0.246030,0.023239,0.038490,-0.022507,0.011908,...,0.002388,0.001536,0.574438,-0.003643,0.085566,0.006053,-0.036071,0.002719,0.169005,-0.018230
5,0.001812,0.034248,-0.003730,0.370508,0.246030,1.000000,0.074696,0.036785,0.199922,0.006310,...,0.169463,0.255975,0.329317,-0.046401,0.016611,0.041197,0.226925,0.272426,0.109983,0.239753
6,0.008993,0.020623,0.019439,0.219642,0.023239,0.074696,1.000000,0.115634,0.312898,0.051550,...,0.018386,0.400243,-0.008364,-0.011966,-0.035023,0.005826,0.316777,0.365653,0.012381,-0.031282
7,0.038045,0.033282,0.063996,0.133643,0.038490,0.036785,0.115634,1.000000,0.209585,0.020012,...,0.049157,0.437758,0.135999,0.121567,0.140206,0.231032,0.317358,0.452582,0.311697,0.347977
8,-0.013437,0.815632,0.171413,0.417675,-0.022507,0.199922,0.312898,0.209585,1.000000,-0.017918,...,0.003698,0.502274,0.047727,0.153980,-0.058578,-0.004103,0.296691,0.567732,0.062246,0.526346
9,0.016253,0.032876,0.025225,0.004828,0.011908,0.006310,0.051550,0.020012,-0.017918,1.000000,...,0.325735,0.208590,0.225412,0.331634,-0.008644,0.006858,-0.022536,0.128861,-0.025818,0.296770


IDs 0-92 are facial cleansers, IDs 93-185 are toners, and IDs 186-280 are moisturizers.

Define the recommender function. Its inputs are the category of products the user is looking for and the name of the product they are using right now.

In [293]:
def recommender(cat, product_name):
    """Recommend up to 10 products from category `cat` for a user of `product_name`.

    Candidates are ranked separately by their similarity to the query product
    in the `content` and `collab` matrices (rank 1 = most similar), and the
    two ranks are summed. Candidates whose summed ranks are equal are ordered
    by `repurchase['rate']`, higher rate first.

    Reads the module-level frames `df`, `content`, `collab` and `repurchase`.
    Like the rest of the notebook, it assumes that the row label of a product
    in `df` is also its row/column position in both similarity matrices and
    its row label in `repurchase`, and that the matrices' column labels are
    those ids (as int or numeric string).

    Parameters
    ----------
    cat : str
        Value of `df.Category` the recommendations are drawn from.
    product_name : str
        Value of `df.full_name` identifying the product currently in use.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows, best recommendation first, with the columns
        `full_name`, `brand`, `content_sim` and `collab_sim`.

    Raises
    ------
    IndexError
        If `product_name` does not occur in `df.full_name`.
    """
    matches = df[df.full_name == product_name].index
    if len(matches) == 0:
        raise IndexError(f"product {product_name!r} not found in df.full_name")
    product_id = matches[0]

    # positions of the products that belong to the requested category
    cat_in = list(df[df.Category == cat].index)

    # rank candidates by similarity: the lower the rank, the higher the similarity
    content_rank = content.iloc[product_id, cat_in].rank(ascending=False, method='min')
    collab_rank = collab.iloc[product_id, cat_in].rank(ascending=False, method='min')

    # aggregate the two rankings into one overall ranking
    rank = content_rank + collab_rank

    # Exclude the query product itself by label. It is only among the
    # candidates when `cat` is its own category, so blindly discarding the
    # best-ranked entry would throw away a real recommendation otherwise.
    own_label = content.columns[product_id]
    rank = rank.drop(labels=own_label, errors='ignore')

    # stable sort so that equal ranks keep a deterministic order
    rank = rank.sort_values(kind='mergesort')

    # Break ties on the aggregated rank with the repurchase rate. Each group
    # of tied candidates is handled exactly once, and the offset stays
    # strictly below 1 so a group can never overtake the next integer rank.
    tied = rank[rank.duplicated(keep=False)]
    for value in tied.unique():
        labels = list(tied[tied == value].index)
        rates = repurchase.loc[[int(label) for label in labels], 'rate']
        order = rates.rank(ascending=False, method='min').tolist()
        rank.loc[labels] = [value + position / (len(labels) + 1) for position in order]

    # Top 10 candidates; the labels are converted to int because `.iloc`
    # only accepts numeric positions.
    top = rank.sort_values(kind='mergesort').iloc[:10]
    top_ids = [int(label) for label in top.index]

    # Work on a copy so that adding columns never writes into a view of `df`.
    result = df.iloc[top_ids].copy()

    # show the cosine similarity scores from both the content-based and the
    # collaborative matrix
    result['content_sim'] = content.iloc[product_id, top_ids].tolist()
    result['collab_sim'] = collab.iloc[product_id, top_ids].tolist()
    return result[['full_name', 'brand', 'content_sim', 'collab_sim']]
    