In [45]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [46]:
dirpath = '../data/processed'
dirpath

'../data/processed'

In [47]:
transactions = pd.read_csv(f'{dirpath}/transactions.csv')
young_customers = pd.read_csv(f'{dirpath}/young_customers.csv')
article_descriptions = pd.read_csv(f'{dirpath}/article_descriptions.csv')
article_info = pd.read_csv(f'{dirpath}/article_info.csv')

## Collaborative filtering

In [48]:
transactions.isna().sum()

customer_id         0
article_id          0
t_dat               0
count               0
sum                 0
price               0
sales_channel_id    0
date_diff           0
dtype: int64

In [49]:
M = transactions['customer_id'].nunique()
N = transactions['article_id'].nunique()

customer_mapper = dict(zip(np.unique(transactions['customer_id']), list(range(M))))
article_mapper = dict(zip(np.unique(transactions['article_id']), list(range(N))))

customer_inv_mapper = dict(zip(list(range(M)), np.unique(transactions['customer_id'])))
article_inv_mapper = dict(zip(list(range(N)), np.unique(transactions['article_id'])))

purchased_ids = np.unique(transactions['article_id'])
unpurchased_articles = article_info[~article_info["article_id"].isin(purchased_ids)]

i = N
for a in unpurchased_articles["article_id"]:
    article_mapper[a] = i
    article_inv_mapper[i] = a
    i += 1

In [50]:
transactions.head()

Unnamed: 0,customer_id,article_id,t_dat,count,sum,price,sales_channel_id,date_diff
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,483064002,2019-09-21,1,0.025407,0.025407,2,1791
1,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,525518005,2019-09-21,1,0.042356,0.042356,2,1791
2,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,541518004,2019-12-01,1,0.016932,0.016932,2,1720
3,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,2,0.033864,0.016932,2,1720
4,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,2,0.033864,0.016932,2,1720


In [51]:
transactions['score_by_date'] = transactions['count'] / (transactions['date_diff'] + 1)
transaction_scores = transactions.groupby(['customer_id', 'article_id'])['score_by_date'].agg('sum').reset_index(name='score')

In [52]:
transaction_scores.head()

Unnamed: 0,customer_id,article_id,score
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,483064002,0.000558
1,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,525518005,0.000558
2,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,541518004,0.000581
3,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,0.002324
4,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245100,0.000662


In [53]:
transaction_scores.isna().sum()

customer_id    0
article_id     0
score          0
dtype: int64

Cap the scores at 90-percentile

In [54]:
transaction_scores['score']

0          0.000558
1          0.000558
2          0.000581
3          0.002324
4          0.000662
             ...   
1120407    0.000540
1120408    0.000644
1120409    0.000580
1120410    0.000580
1120411    0.000644
Name: score, Length: 1120412, dtype: float64

In [55]:
q1 = transaction_scores['score'].quantile(0.25)
q3 = transaction_scores['score'].quantile(0.75)
iqr = q3 - q1
upper = q3 + iqr * 1.5

transaction_scores['score'] = transaction_scores['score'].apply(lambda x: upper if x > upper else x)

Normalize scores so they falls in range [1,10].

In [56]:
def normalize_scores(scores, range=(1,10)):
    scaled = (scores - scores.min()) / (scores.max() - scores.min())
    return scaled * (range[1] - range[0]) + range[0]

transaction_scores['score'] = normalize_scores(transaction_scores['score'])

In [57]:
transaction_scores.head()

Unnamed: 0,customer_id,article_id,score
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,483064002,3.062966
1,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,525518005,3.062966
2,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,541518004,3.564777
3,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,10.0
4,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245100,5.325028


In [58]:
from scipy.sparse import csr_matrix

data = transaction_scores['score']
row_ind = [customer_mapper[i] for i in transaction_scores['customer_id']]
col_ind = [article_mapper[i] for i in transaction_scores['article_id']]

R = csr_matrix((data, (row_ind, col_ind)), shape=(M,N))

### Item-based similarity

In [59]:
from sklearn.neighbors import NearestNeighbors

def find_similar_articles(aid, R, article_inv_mapper, k = 10):
    """
        Using KNN to find k-similar items
        R's rows are item vectors
    """

    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
    knn.fit(R)

    neighbors = knn.kneighbors(R[aid], return_distance=False)
    return [article_inv_mapper[aid] for aid in neighbors[0][1:]]

Predict for most-recent-bought item

In [60]:
customer_map = 2
cid = customer_inv_mapper[customer_map]
most_recent_article = transactions[transactions["customer_id"] == cid]["article_id"].values[0]

aid = article_mapper[most_recent_article]

recs_item = find_similar_articles(aid, R.T, article_inv_mapper)

article_descriptions[article_descriptions["article_id"].isin(recs_item)]

Unnamed: 0,article_id,desc
9452,533329017,Sweatshirt Overall Jumpsuit/Playsuit Garment F...
62907,737994010,LEGGINGS BELLE AOP Leggings/Tights Garment Low...
63728,739684006,James leggings Leggings/Tights Garment Lower b...
63729,739684007,James leggings Leggings/Tights Garment Lower b...
67402,750847002,Basic Detroit Skirt Skirt Garment Lower body S...
67687,751839001,Maserati dress Dress Garment Full body Solid L...
75938,783354004,Blake Linen Slim Blz Blazer Garment Upper body...
78275,793249001,James cord dress SET Dress Garment Full body S...
85549,820246001,KIDS-Alek Cardigan Cardigan Garment Upper body...


### Matrix Factorization with SVD

In [61]:
from scipy.sparse.linalg import svds

U, sigma, Vt = svds(R, k = 2)

R_svd = np.dot(np.dot(U, np.diag(sigma)), Vt)

In [62]:
R_svd.shape

(16595, 63887)

In [63]:
def top_recommendations(cid, R, R_pred, article_inv_mapper, top_N=10):
    """
        Make recommendation with filled user-item matrix, 
        exclude those already purchased by the active user
    """
    scores = R_pred[cid]
    
    past_purchase = R[cid].toarray().flatten() > 0
    past_purchase_indices = np.where(past_purchase)[0]
    
    sorted_id = scores.argsort()[::-1]

    recommendations = [aid for aid in sorted_id if aid not in past_purchase_indices][:top_N]
    
    return [article_inv_mapper[r] for r in recommendations]

In [64]:
recs_svd = top_recommendations(customer_map, R, R_svd, article_inv_mapper)

article_descriptions[article_descriptions['article_id'].isin(recs_svd)]

Unnamed: 0,article_id,desc
3091,448509014,Perrie Slim Mom Denim TRS Trousers Garment Low...
3711,464297007,Greta Thong Mynta Low 3p Underwear bottom Unde...
24837,610776002,Tilly (1) T-shirt Garment Upper body Solid Bla...
47592,689109001,Timeless Sports Top Bikini top Swimwear All ov...
58491,720125001,SUPREME RW tights Leggings/Tights Garment Lowe...
64367,741356002,Pamela Shorts HW Shorts Garment Lower body Den...
70221,759871002,Tilda tank Vest top Garment Upper body Solid B...
81824,806388001,Therese tee T-shirt Garment Upper body Solid B...
81825,806388002,Therese tee T-shirt Garment Upper body Solid W...
81826,806388003,Therese tee T-shirt Garment Upper body Solid B...


## Content-based Approach

### Customer-profile by Item similarity

Build customer profile with purchase history

In [65]:
F = article_descriptions['article_id'].nunique()

feature_mapper = dict(zip(np.unique(article_descriptions['article_id']), list(range(F))))
feature_inv_mapper = dict(zip(list(range(F)), np.unique(article_descriptions['article_id'])))

Transform the text descriptions into bag_of_words, apply PCA to reduce dimensionality.\
(using TruncatedSVD on sparse tfidf matrix due to memory limitations) 

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

article_descriptions = article_descriptions.sort_values(by="article_id")
vect = TfidfVectorizer()
X = vect.fit_transform(article_descriptions['desc'])

Higher n reserves more information thus produce results that reflects purchase history better

In [67]:
from sklearn.decomposition import TruncatedSVD

n = 30 # hyperparam
truncated_svd = TruncatedSVD(n_components=n)
X_reduced = truncated_svd.fit_transform(X)

In [68]:
X_reduced

array([[ 0.3807024 , -0.02420071, -0.21431938, ...,  0.11924072,
        -0.21245269, -0.09330861],
       [ 0.37258455,  0.02226785, -0.25698451, ...,  0.11493939,
        -0.12322113, -0.06344632],
       [ 0.36449935,  0.02481174, -0.24895266, ...,  0.12697976,
        -0.10827037, -0.06952679],
       ...,
       [ 0.37414375, -0.20938299, -0.15966905, ..., -0.05625845,
         0.02741748, -0.01742337],
       [ 0.1089469 , -0.14975674,  0.02781775, ..., -0.02189444,
         0.01470068, -0.02023709],
       [ 0.38718758, -0.18189758, -0.19429851, ..., -0.06378211,
         0.10461439, -0.00185525]])

In [69]:
def customer_profile(cid, R, X, article_inv_mapper, feature_mapper):
    """
        Build user profile by aggregating items in purchase history
    
        Input:
            cid: mapped id of customer
            R: user-item matrix
            X: item features
        
        Output: A vector of customer-features (customer profile)
    """

    customer_scorings = R[cid].toarray().flatten()
    purchased = np.where(customer_scorings > 0)[0]
    
    purchased_aids = [article_inv_mapper[p] for p in purchased]
    feature_ids = [feature_mapper[a] for a in purchased_aids]

    agg_scores = np.matmul(customer_scorings[purchased], X[feature_ids])

    return agg_scores

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def customer_profile_recommendation(
        cid, R, X, 
        article_inv_mapper, 
        feature_mapper,
        feature_inv_mapper,
        top_N = 10
    ):
    """
        Make recommendation based on customer purchase history
    """
    profile = customer_profile(cid, R, X, article_inv_mapper, feature_mapper)
    
    sim = cosine_similarity([profile], X)[0]
    indices = sim.argsort()[::-1]

    purchased = np.where(R[cid].toarray().flatten() > 0)[0]
    purchased_feature_ids = [feature_mapper[article_inv_mapper[i]]
                            for i in purchased]
    
    top_N_indices = [i for i in indices if i not in purchased_feature_ids][:top_N]
    
    return [feature_inv_mapper[i] for i in top_N_indices]

    

In [None]:
recs = customer_profile_recommendation(customer_map, R, X_reduced, article_inv_mapper, feature_mapper, feature_inv_mapper)

article_descriptions[article_descriptions["article_id"].isin(recs)]

Unnamed: 0,article_id,desc
30225,627267001,Corina fancy poncho Other accessories Accessor...
30226,627267002,Corina fancy poncho Other accessories Accessor...
35267,646067002,Sansa kaftan Other accessories Accessories Sol...
70217,759847004,Elba swimsuit Swimsuit Swimwear Colour blockin...
73767,773323001,PE BECK SWIMSUIT Swimsuit Swimwear Solid Dark ...
80112,800862001,B VERBENA KAFTAN new Top Garment Upper body So...
92682,854786001,B Wow Kaftan Sarong Swimwear Glittering/Metall...
94415,862398001,CE Sable bikini bottom Swimwear bottom Swimwea...
94416,862399001,CE L'eau swimsuit OL Swimsuit Swimwear All ove...
96945,872275001,SC D2 - CARTER swimsuit Swimsuit Swimwear All ...


In [None]:
cust_2 = transactions[transactions['customer_id'] == customer_inv_mapper[customer_map]]

article_descriptions[article_info['article_id'].isin(cust_2['article_id'])]

Unnamed: 0,article_id,desc
24,120129014,Babette long Leggings/Tights Garment Lower bod...
491,250099001,Mama Heavy Plain 2p Tights Underwear Tights So...
1320,341129001,Mama Support 70 den 1p Tights Underwear Tights...
2533,412370001,Mama fleece leggings Underwear Tights Socks & ...
3909,469562061,Skinny denim (1) Trousers Garment Lower body D...
10785,543054014,Danja Sweater Sweater Garment Upper body Other...
22656,599580055,Timeless Midrise Brief Swimwear bottom Swimwea...
25190,611635003,Sylvia Cross Bag Bag Accessories Solid Black D...
33960,640174001,Lola Lace-Up Sweater Garment Upper body Solid ...
39647,662257006,Selma sport dress J Dress Garment Full body St...


### Customer Info similarity

In [None]:
young_customers.head()

Unnamed: 0,customer_id,age
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,28.0
1,0010e8eb18f131e724d6997909af0808adbba057529edb...,25.0
2,0013bde09d10db6b0a6a3b0987ac60b643013dfc6f924b...,27.0
3,00155b2ef48cfb5d2fce4642f670f151efe0747542a5b9...,21.0
4,001a7fb6def4cc4de27cb02f0025ea28c8ee74efdd3c73...,24.0


In [None]:
transactions.head()

Unnamed: 0,customer_id,article_id,t_dat,count,sum,price,sales_channel_id,date_diff,score_by_date
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,483064002,2019-09-21,1,0.025407,0.025407,2,1791,0.000558
1,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,525518005,2019-09-21,1,0.042356,0.042356,2,1791,0.000558
2,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,541518004,2019-12-01,1,0.016932,0.016932,2,1720,0.000581
3,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,2,0.033864,0.016932,2,1720,0.001162
4,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,2,0.033864,0.016932,2,1720,0.001162


Summarize customer average spent per month

In [None]:
customer_spent = transactions.copy()[["customer_id", "article_id", "t_dat", "sum", "sales_channel_id"]]

customer_spent["t_dat"] = pd.to_datetime(customer_spent["t_dat"])
customer_spent["month"] = customer_spent["t_dat"].dt.to_period('M')

In [None]:
customer_spent.head()

Unnamed: 0,customer_id,article_id,t_dat,sum,sales_channel_id,month
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,483064002,2019-09-21,0.025407,2,2019-09
1,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,525518005,2019-09-21,0.042356,2,2019-09
2,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,541518004,2019-12-01,0.016932,2,2019-12
3,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,0.033864,2,2019-12
4,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,562245018,2019-12-01,0.033864,2,2019-12


In [None]:
customer_total_spent = customer_spent.groupby("customer_id")["sum"].sum().reset_index()
customer_total_months = customer_spent.groupby("customer_id")["month"].nunique().reset_index()
customer_info = pd.merge(customer_total_spent, customer_total_months)
customer_info["avg_per_month"] = customer_info["sum"] / customer_info["month"]

# take the most active channel for each user
mode = lambda x : x.mode().iloc[0] if not x.mode().empty else None
customer_most_active_channel = customer_spent.groupby("customer_id")["sales_channel_id"].agg(mode).reset_index()
pd.merge(customer_info, customer_most_active_channel)

Unnamed: 0,customer_id,sum,month,avg_per_month,sales_channel_id
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,0.596220,4,0.149055,2
1,0010e8eb18f131e724d6997909af0808adbba057529edb...,2.416390,19,0.127178,1
2,0013bde09d10db6b0a6a3b0987ac60b643013dfc6f924b...,0.457746,4,0.114436,1
3,00155b2ef48cfb5d2fce4642f670f151efe0747542a5b9...,2.135847,14,0.152561,2
4,001a7fb6def4cc4de27cb02f0025ea28c8ee74efdd3c73...,0.196407,4,0.049102,2
...,...,...,...,...,...
16590,ffe2e37b34b822313fa1da777d63d530f2aeea2dc6d5ae...,1.948068,14,0.139148,2
16591,ffe6376eb6b854d842e5a7714ea758de127f086a60d67d...,30.498153,17,1.794009,2
16592,fff472b503d848c0446b040b2475dccba2c1a9d1f0541b...,2.287915,13,0.175993,2
16593,fff7f145e356557541af211bb11aa9d78d7edf51bd3c40...,2.378627,18,0.132146,1


In [None]:
customer_info = pd.merge(customer_info, young_customers)
customer_info = customer_info.drop(columns=["sum"])

#### Customer information
- month: total number of months the customer being active
- sales_channel_id: the customer's most active sales channel
- avg_per_month: average spent per month
- age: mostly young customers (age 16 to 29)

In [None]:
customer_info.head()

Unnamed: 0,customer_id,month,avg_per_month,age
0,00066fdcf5f0da690b898b287d05ce477bd2764ce975d1...,4,0.149055,28.0
1,0010e8eb18f131e724d6997909af0808adbba057529edb...,19,0.127178,25.0
2,0013bde09d10db6b0a6a3b0987ac60b643013dfc6f924b...,4,0.114436,27.0
3,00155b2ef48cfb5d2fce4642f670f151efe0747542a5b9...,14,0.152561,21.0
4,001a7fb6def4cc4de27cb02f0025ea28c8ee74efdd3c73...,4,0.049102,24.0


In [None]:
customer_info.describe()

Unnamed: 0,month,avg_per_month,age
count,16595.0,16595.0,16595.0
mean,11.258451,0.240975,24.296174
std,5.939607,0.483157,2.965404
min,1.0,0.008627,16.0
25%,7.0,0.099229,22.0
50%,11.0,0.165583,24.0
75%,16.0,0.295652,27.0
max,25.0,53.031461,29.0


In [None]:
I = customer_info["customer_id"].nunique()

info_mapper = dict(zip(np.unique(customer_info["customer_id"]), list(range(I))))
info_inv_mapper = dict(zip(list(range(I)), np.unique(customer_info["customer_id"])))

Customer-content-based recommendation:
1. Find the similar ustomers by KNN
2. Predict active customer scores for articles:
    $$
    p(a,i) = \bar r_i + {\sum_u sim(a,u) \times (r_{u,i} - \bar r_u)
                        \over \sum_u sum(a,u)}
    $$

In [None]:
customer_info = customer_info.sort_values(by="customer_id")
info_mat = customer_info.drop(columns=['customer_id']).values
k = 30
knn = NearestNeighbors(n_neighbors=k, metric="cosine")
knn.fit(info_mat)

def recommendation_by_similar_customers(
        info_cid, 
        info_mat,
        R, 
        model,
        customer_mapper,
        info_inv_mapper,
        article_inv_mapper,
        top_N = 10
    ):
    
    
    distance, neighbors = knn.kneighbors([info_mat[info_cid]])
    cosine = (1 - distance[0])[neighbors[0] != info_cid]
    neighbors = neighbors[0][neighbors[0] != info_cid]
    
    # extract similar customers scorings
    active = R[customer_mapper[info_inv_mapper[info_cid]]].toarray().flatten()
    similar_cust_ids = [customer_mapper[info_inv_mapper[i]] for i in neighbors]
    similar_cust_R = R[similar_cust_ids].toarray()
    similar_cust_mean = similar_cust_R.mean(axis=1).reshape(-1,1)
    
    # calculate predicted articles scoring of the active user
    scores = (
        active.mean() 
        + np.dot(cosine, similar_cust_R - similar_cust_mean) 
        / cosine.sum()
    )
    scores = scores.argsort()[::-1]
    
    # exclude those already purchased
    purchased = np.where(active > 0)[0]
    
    return [article_inv_mapper[i] for i in scores if i not in purchased][:top_N]

In [None]:
recs = recommendation_by_similar_customers(2, info_mat, R, knn, 
                                           customer_mapper, 
                                           info_inv_mapper, 
                                           article_inv_mapper)

article_descriptions[article_descriptions["article_id"].isin(recs)]

Unnamed: 0,article_id,desc
880,300024048,Superskinny Trousers Garment Lower body Denim ...
22716,599719024,Hudson shorts Shorts Garment Lower body Solid ...
22717,599719025,Hudson shorts Shorts Garment Lower body Stripe...
24837,610776002,Tilly (1) T-shirt Garment Upper body Solid Bla...
57399,716672001,Jennifer Top Vest top Garment Upper body Solid...
57401,716672003,Jennifer Top Vest top Garment Upper body Solid...
58799,721266002,Carla parka Hoodie Garment Upper body Solid Da...
74259,776041002,Holly brazilian w tie ribbons Underwear bottom...
95644,867257001,Braided aliceband Hair/alice band Accessories ...
97822,876021001,Gemma playsuit Jumpsuit/Playsuit Garment Full ...


### Item features similarity

In [None]:
X_reduced.shape

(105542, 30)

In [None]:
def find_similar_articles_by_feature(aid, X, feature_inv_mapper, top_N=10):
    article = X[aid]
    
    sim_scores = cosine_similarity([article], X)[0]
    sim_scores = sim_scores.argsort()[::-1]

    return [feature_inv_mapper[i] for i in sim_scores if i != aid][:top_N]

In [None]:
aid = feature_mapper[most_recent_article]
recs = find_similar_articles_by_feature(aid, X_reduced, feature_inv_mapper)

print(article_descriptions[article_descriptions["article_id"] == most_recent_article])
article_descriptions[article_descriptions["article_id"].isin(recs)]

    article_id                                               desc
24   120129014  Babette long Leggings/Tights Garment Lower bod...


Unnamed: 0,article_id,desc
23,120129001,Babette long Leggings/Tights Garment Lower bod...
26,120129025,Babette long Leggings/Tights Garment Lower bod...
19345,583732002,Active Fifi Leggings/Tights Garment Lower body...
22044,596643001,Babette Cropped Leggings/Tights Garment Lower ...
41703,669952001,Fiffi Folded waist Leggings/Tights Garment Low...
75090,779755001,Isabelle folded tights Leggings/Tights Garment...
90626,843940001,Babette 2-pack Leggings/Tights Garment Lower b...
90628,843940005,Babette 2-pack Leggings/Tights Garment Lower b...
93435,858125001,Babette Capri 2-pack Leggings/Tights Garment L...
104427,923010002,ED Long leggings update Leggings/Tights Garmen...
