# Import libraries


In [None]:
import numpy as np
import pandas as pd
import os
import glob
#import reco
from tqdm import tqdm
import datetime
from functools import partial
from dask.diagnostics import ProgressBar
ProgressBar().register()
import dask.dataframe as dd

In [None]:
# Register tqdm with pandas so that `progress_apply` (used when building the
# submission) is available on DataFrames/Series.
tqdm.pandas()

# Read data

In [None]:
# Load the raw transaction log; article ids are read as strings rather than
# being inferred as numbers.
data = pd.read_csv(
    'transactions_train.csv',
    dtype={'article_id': str},
)
print(data.shape)

In [None]:
# Preview the first rows of the loaded transactions.
data.head()

In [None]:
# Turn the transaction date column into real datetimes so it can be compared
# against datetime boundaries, then keep only the three columns used below.
data['t_dat'] = pd.to_datetime(data['t_dat'])
keep_columns = ['t_dat', 'customer_id', 'article_id']
data = data[keep_columns]

## Select only the last weeks

This keeps only the most relevant data and keeps its size reasonable. We train on the two most recent windows (roughly two weeks) and hold out the final week for validation.

In [None]:
# Report the full date span covered by the transaction log.
print(f"All Transactions Date Range: {data['t_dat'].min()} to {data['t_dat'].max()}")

# Boundaries of the half-open [start, end) windows, newest first.
val_start = datetime.datetime(2020, 9, 16)
train1_start = datetime.datetime(2020, 9, 8)
train2_start = datetime.datetime(2020, 9, 1)
train3_start = datetime.datetime(2020, 8, 23)
train4_start = datetime.datetime(2020, 8, 15)

# train1 is the most recent training window, train4 the oldest.
train1 = data[(data['t_dat'] >= train1_start) & (data['t_dat'] < val_start)]
train2 = data[(data['t_dat'] >= train2_start) & (data['t_dat'] < train1_start)]
train3 = data[(data['t_dat'] >= train3_start) & (data['t_dat'] < train2_start)]
train4 = data[(data['t_dat'] >= train4_start) & (data['t_dat'] < train3_start)]

# Everything from the validation start date onwards is held out.
val = data[data['t_dat'] >= val_start]

# `data` is intentionally kept: the test-submission section slices it again.

In [None]:
# For each training window, map every customer to the list of article ids
# they bought in that window (repeat purchases are kept as repeated entries).
(
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
) = (
    window.groupby(['customer_id'])['article_id'].apply(list)
    for window in (train1, train2, train3, train4)
)

In [None]:
# The training set is the two most recent windows stacked row-wise.
train = pd.concat([train1, train2])

# Non-personalized 

For this we compute the most popular items in our data and stratify the customer population by age (split into deciles), since we expect different age groups to buy different items.

The popularity score is sensitive to time: items that have not been bought in recent weeks are likely to be less relevant than newer ones.

Let's read the customer dataset to find the age:

In [None]:
# Only the customer id and age are needed from the customer table.
customers = pd.read_csv('customers.csv')[['customer_id', 'age']]

Merge it with the training dataset:

In [None]:
# Attach each customer's age to the training and validation transactions.
# A left join keeps every transaction even when the customer has no match.
train = pd.merge(train, customers, how='left', on='customer_id')
val = pd.merge(val, customers, how='left', on='customer_id')

Calculate deciles of age and a popularity factor based on the time of the transaction:

In [None]:
# Bucket customers into ten equal-frequency age intervals (deciles).
train['age2'] = pd.qcut(train['age'], 10)

# Recency weight of a purchase: 1 / (whole days between the purchase and the
# 2020-09-16 validation cutoff). Every training transaction is dated before
# the cutoff, so the day count is at least 1.
# Computed with vectorised timedelta arithmetic instead of a per-row `apply`.
days_before_cutoff = (pd.Timestamp(2020, 9, 16) - train['t_dat']).dt.days
train['pop_factor'] = 1 / days_before_cutoff

We compute the most frequently bought items (weighted by recency) for each age decile present in the training set, as well as the overall top 12:

In [None]:
# Age buckets that actually occur in the training data (missing bucket dropped).
intervals = train.age2.unique().dropna()

# For every age bucket: the 12 articles with the highest summed recency weight.
top_by_age = {}
for inter in intervals:
    train_age = train[train['age2'] == inter]
    popular_items_group = train_age.groupby(['article_id'])['pop_factor'].sum()
    # Sort (score, article_id) pairs from best to worst and keep the ids.
    ranked = sorted(zip(popular_items_group, popular_items_group.keys()), reverse=True)
    _, popular_items = zip(*ranked[:12])
    top_by_age[inter] = popular_items

# Lookup from an integer age (15..99) to the bucket that contains it.
age_interval = {
    age: interval
    for age in range(15, 100)
    for interval in intervals
    if age in interval
}

# Overall top 12 articles, regardless of age.
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()
ranked = sorted(zip(popular_items_group, popular_items_group.keys()), reverse=True)
_, top = zip(*ranked[:12])


In [None]:
# Summary statistics of the recency weights, as a sanity check.
train.pop_factor.describe()

In [None]:
# Display the overall top-12 article ids.
top

# Item-Based collaborative filtering

For this method we only keep users who have bought more than 20 times, because the computation is very memory-intensive. This group is also the natural choice, since we need enough purchase history to generate meaningful scores.

Memory-wise, we have also implemented a sparse matrix, as normal numpy matrices consumed all our RAM :(. This has also resulted in faster computation. 

To be able to map results back after building the sparse matrix, we also create several dictionaries that translate between ids and matrix indices.


Filter for the most active users (those with more than 20 purchases):

In [None]:
# Work on an independent copy holding only the columns needed for CF.
train_cf = train[['customer_id', 'article_id', 'age']].copy()
print(train_cf.shape)

# Keep only customers with strictly more than 20 transaction rows.
v = train_cf['customer_id'].value_counts()
frequent_buyers = v.index[v > 20]
train_cf = train_cf[train_cf['customer_id'].isin(frequent_buyers)]

# Implicit feedback: every purchase row counts as a rating of 1.
train_cf['rating'] = 1
print(train_cf.shape)

Generate the sparse matrix

In [None]:
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

num_components = 100  # 20
svd = TruncatedSVD(n_components=num_components)

# Forward lookups (id -> dense index) for customers and articles, plus the
# reverse lookup (index -> article id) used to decode predictions.
users = {customer: idx for idx, customer in enumerate(train_cf.customer_id.unique())}
items = {article: idx for idx, article in enumerate(train_cf.article_id.unique())}
id_item = dict(enumerate(train_cf.article_id.unique()))

# Dense index of the customer / article of every purchase row.
row = [users[customer] for customer in train_cf.customer_id.values]
col = [items[article] for article in train_cf.article_id.values]

print(len(users), len(items), len(train_cf.rating))

# Item x user matrix: article indices are the rows, customer indices the columns.
X = sparse.csr_matrix(
    (train_cf.rating.values, (col, row)),
    shape=(len(items), len(users)),
)

# One `num_components`-dimensional latent vector per item (row of X).
matrix = svd.fit_transform(X)


Define the function that, given a user, computes the 12 most suitable items:

In [None]:
# Per-user recommendation function:

def predict_cf(customer_id):
    """Return the best-scoring article ids for a customer from the SVD item factors.

    The customer's profile is the mean latent vector (rows of ``matrix``) of
    the items with a non-zero entry in the customer's column of the
    item x user matrix ``X``. Every item is then scored by the dot product of
    its latent vector with that profile.

    Args:
        customer_id: Customer id; expected to be a key of ``users``.

    Returns:
        A list of up to 12 article ids ordered from highest to lowest score,
        or ``None`` when the customer is not in ``users`` or has no purchases
        recorded in ``X``.
    """
    user_idx = users.get(customer_id)
    if user_idx is None:
        return None

    # Item indices bought by this user. Slicing the sparse column avoids
    # converting the whole matrix to dense on every call.
    bought = X[:, user_idx].nonzero()[0]
    if bought.size == 0:
        return None

    # Mean latent vector of the purchased items.
    profile = matrix[bought].mean(axis=0)

    # Score of every item against the user's profile.
    pred = np.dot(matrix, profile)

    # Select the (at most) 12 highest scores without sorting everything.
    n_items = pred.shape[0]
    k = min(12, n_items)
    if k < n_items:
        candidates = np.argpartition(-pred, k)[:k]
    else:
        candidates = np.arange(n_items)

    # Order the selected candidates from best to worst score.
    ranked = candidates[np.argsort(-pred[candidates], kind='stable')]

    return [id_item[idx] for idx in ranked]
    


Compute the CF RecSys for the training dataset:

In [None]:
# One row per CF-eligible customer (the keys of `users`), split into 12 dask
# partitions.
ddf = dd.from_pandas(pd.DataFrame(list(users.keys()),columns=['customer_id']), npartitions=12)

# Score every customer with predict_cf, partition by partition; `.compute()`
# materialises the result before it is assigned as the `recom` column.
# NOTE(review): assigning an already-computed result to a dask column and then
# calling `ddf.compute()` again below may behave differently across dask
# versions — confirm the alignment of `recom` with `customer_id`.
ddf['recom'] = ddf.map_partitions(lambda df: df.customer_id.apply(lambda x: predict_cf(x))).compute()

# Materialise the final table with `customer_id` and `recom` columns.
recom = pd.DataFrame(ddf.compute())


recom

# RecSys Implementation

We implement the recommendation system. We also add code to simulate the score. The implementation is the following:

For a user found in the training set used for collaborative filtering (customers with more than 20 purchases), we return the recommendations computed by that algorithm. Otherwise, if the user appears in the training windows but not in the CF set, we recommend the items that user bought most often, taking the most recent week first and then the earlier weeks. Any remaining slots, and users who appear in none of the training windows, are filled with the most popular items for the user's age group (when the age is known) or with the overall most popular items.


Calculation of the score:

In [None]:
def apk(actual, predicted, k=12):
    """Average precision at ``k`` for a single list of predictions.

    Args:
        actual: Collection of relevant items.
        predicted: Ordered sequence of predicted items, best first.
        k: Maximum number of predictions taken into account.

    Returns:
        The sum of precision values at each rank where a relevant item appears
        for the first time, divided by ``min(len(actual), k)``; ``0.0`` when
        ``actual`` is empty.
    """
    shortlist = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(shortlist, start=1):
        # Skip irrelevant items and repeats of an earlier prediction.
        if item not in actual or item in shortlist[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Mean of ``apk`` over paired ground-truth and prediction lists.

    Args:
        actual: Sequence of relevant-item collections, one per user.
        predicted: Sequence of prediction lists, paired with ``actual`` by position.
        k: Cut-off forwarded to ``apk``.

    Returns:
        The arithmetic mean (via ``np.mean``) of the per-user ``apk`` scores.
    """
    per_user_scores = []
    for truth, guess in zip(actual, predicted):
        per_user_scores.append(apk(truth, guess, k))
    return np.mean(per_user_scores)

Implement the recommender system in the validation dataset:

In [None]:
# Ground truth for scoring: the items each validation customer actually
# bought, to be compared with the predictions built from the training data.
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()

# Purchases listed in the same order as `val_users`.
val_items = [positive_items_val[user] for _, user in tqdm(enumerate(val_users))]

In [None]:
from collections import Counter


def _extend_unique(target, candidates, limit=12):
    """Append items of `candidates` not already in `target`, in order, until `target` holds `limit` items."""
    for candidate in candidates:
        if len(target) >= limit:
            break
        if candidate not in target:
            target.append(candidate)


outputs = []
cnt = 0   # validation users served by collaborative filtering
cnt2 = 0  # validation users whose list was topped up from their age bucket

user_age = dict(zip(val.customer_id, val.age))
popular_items = list(popular_items)

# Purchase histories, most recent window first.
weekly_history = (
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
)

# customer_id -> CF recommendation, built once instead of scanning `recom`
# for every user.
cf_by_user = dict(zip(recom['customer_id'], recom['recom']))

for user in tqdm(val_users):
    user_output = []

    if user in users:
        # predict_cf yields a list of article ids (or nothing on failure);
        # add the ids themselves rather than the list as a single element.
        cf_items = cf_by_user.get(user)
        if isinstance(cf_items, (list, tuple, np.ndarray)):
            _extend_unique(user_output, cf_items)
        cnt += 1
    else:
        # The user's own purchases, most frequent first within each window.
        for history in weekly_history:
            if user in history.index:
                ranked = [article for article, _ in Counter(history[user]).most_common()]
                _extend_unique(user_output, ranked)

        # Popular items of the user's age bucket, when the age maps to one.
        age = user_age.get(user)
        bucket = None
        if age is not None and not np.isnan(age):
            bucket = age_interval.get(int(age))
        if bucket is not None:
            cnt2 += 1
            _extend_unique(user_output, top_by_age[bucket])

    # Fill whatever is left with the overall most popular items. The helper
    # never exceeds 12 entries, so no negative slice bound can occur.
    _extend_unique(user_output, top)
    outputs.append(user_output)

print(cnt2,cnt)
print("mAP Score on Validation set:", mapk(val_items, outputs))

# Test submission:

Implement the RecSys for the test dataset:

In [None]:
# Test-time windows: the same kind of split as for validation, shifted so that
# the last observed week (from 2020-09-16) becomes the newest training window.
train1_t = data.loc[(data["t_dat"] >= datetime.datetime(2020, 9, 16)) & (data['t_dat'] < datetime.datetime(2020, 9, 23))]
train2_t = data.loc[(data["t_dat"] >= datetime.datetime(2020, 9, 8)) & (data['t_dat'] < datetime.datetime(2020, 9, 16))]
train3_t = data.loc[(data["t_dat"] >= datetime.datetime(2020, 8, 31)) & (data['t_dat'] < datetime.datetime(2020, 9, 8))]
train4_t = data.loc[(data["t_dat"] >= datetime.datetime(2020, 8, 23)) & (data['t_dat'] < datetime.datetime(2020, 8, 31))]

# Per-window purchase lists (customer_id -> list of article ids, repeats kept).
positive_items_per_user1_t = train1_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2_t = train2_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3_t = train3_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4_t = train4_t.groupby(['customer_id'])['article_id'].apply(list)

# Popularity is computed on the two most recent windows, with ages attached.
train_t = pd.concat([train1_t, train2_t], axis=0)
train_t = train_t.merge(customers, how='left', on='customer_id')

# Recency weight: 1 / (whole days between the purchase and the 2020-09-23
# cutoff), computed with vectorised timedelta arithmetic instead of a per-row
# `apply`. Every row is dated before the cutoff, so the day count is >= 1.
days_before_cutoff_t = (pd.Timestamp(2020, 9, 23) - train_t['t_dat']).dt.days
train_t['pop_factor'] = 1 / days_before_cutoff_t

# Ten equal-frequency age buckets.
train_t['age2'] = pd.qcut(train_t['age'], 10)
train_t

In [None]:
# Age buckets that actually occur in the test-time training data.
intervals_t = train_t.age2.unique().dropna()

# Rebinds `top_by_age`, `age_interval` and `top` so that `recommend` reads the
# test-time versions instead of the validation-time ones.
top_by_age = {}
for inter in intervals_t:
    train_age_t = train_t.loc[train_t.age2 == inter]
    popular_items_group_t = train_age_t.groupby(['article_id'])['pop_factor'].sum()
    # Sort (score, article_id) pairs from best to worst and keep the 12 best ids.
    ranked_t = sorted(zip(popular_items_group_t, popular_items_group_t.keys()), reverse=True)
    _, popular_items_t = zip(*ranked_t[:12])

    top_by_age[inter] = popular_items_t

# Lookup from an integer age (15..99) to the bucket that contains it.
age_interval = {age : interval for age in range(15,100) for interval in intervals_t if age in interval}

# Overall top 12 for the test period. This must use `train_t`: the
# validation-time `train` excludes the last week and carries recency weights
# relative to the earlier 2020-09-16 cutoff.
popular_items_group_t = train_t.groupby(['article_id'])['pop_factor'].sum()
ranked_t = sorted(zip(popular_items_group_t, popular_items_group_t.keys()), reverse=True)
_, top = zip(*ranked_t[:12])

# NOTE(review): built from the validation-time windows (train1..train4), not
# the `_t` ones, and not referenced anywhere else in the visible code — confirm
# whether it is still needed.
user_group = pd.concat([train1, train2, train3, train4], axis=0).groupby(['customer_id'])['article_id'].apply(list)

Load test submission:

In [None]:
# The sample submission supplies the customer ids to predict for; only that
# column is kept.
test = pd.read_csv("sample_submission.csv")[['customer_id']]
test.head()

Apply the RecSys to these customer ids to get predictions:

In [None]:
# Keep an untouched copy of the customer-id frame; the next cell rebuilds
# `test` from it.
test1 = test.copy()

In [None]:
# Attach each test customer's age; a left join keeps every customer id.
test = pd.merge(test1, customers, how='left', on='customer_id')

def to_submission(data):
    """Join the items of ``data`` into one space-separated string."""
    return " ".join(map(str, data))
        
def _extend_unique(target, candidates, limit=12):
    """Append items of `candidates` not already in `target`, in order, until `target` holds `limit` items."""
    for candidate in candidates:
        if len(target) >= limit:
            break
        if candidate not in target:
            target.append(candidate)


def recommend(user, age):
    """Return up to 12 distinct article ids recommended for one customer.

    Sources are consulted in this order until 12 items are collected:

    1. Customers known to the collaborative-filtering model (keys of
       ``users``): their precomputed list in ``recom``.
    2. Other customers: their own purchases from the four test-time windows,
       newest window first and most frequent first within a window, followed
       by the popular items of their age bucket when ``age`` maps to one.
    3. The overall most popular items (``top``) for any remaining slots.

    Args:
        user: Customer id.
        age: The customer's age; may be ``None`` or NaN when unknown.

    Returns:
        A list of at most 12 article ids without repeats.
    """
    user_output = []

    if user in users:
        # Each stored value is itself a list of article ids (or empty/None when
        # prediction failed); add the ids rather than the list as one element.
        for cf_items in recom.loc[recom['customer_id'] == user, 'recom'].values:
            if isinstance(cf_items, (list, tuple, np.ndarray)):
                _extend_unique(user_output, cf_items)
    else:
        weekly_history = (
            positive_items_per_user1_t,
            positive_items_per_user2_t,
            positive_items_per_user3_t,
            positive_items_per_user4_t,
        )
        for history in weekly_history:
            if user in history.index:
                ranked = [article for article, _ in Counter(history[user]).most_common()]
                _extend_unique(user_output, ranked)

        # Use the age passed by the caller: the `user_age` dict only covers
        # customers present in the validation split.
        bucket = None
        if age is not None and not np.isnan(age):
            bucket = age_interval.get(int(age))
        if bucket is not None:
            _extend_unique(user_output, top_by_age[bucket])

    # Fill any remaining slots; the helper never exceeds 12 entries, so no
    # negative slice bound can occur.
    _extend_unique(user_output, top)
    return user_output


# Build the space-separated prediction string for every test customer, with a
# progress bar.
test['prediction'] = test.progress_apply(
    lambda row: to_submission(recommend(row.customer_id, row.age)),
    axis=1,
)


In [None]:
# Display the test frame with its predictions.
test

Add the predictions to the test dataset to create the final submission:

In [None]:
# The age column was only needed to choose recommendations; remove it in place
# before exporting.
test.drop(columns=['age'], inplace=True)

# Write the submission file without the index column and preview it.
test.to_csv('submission.csv', index=False)
test.head()