In [1]:
# Libraries

import numpy as np
import pandas as pd 
import datetime # calculate the duration between two dates and times
import gc                                # garbage collector
import time
import multiprocessing as mp             # supports spawning processes using an API similar to the threading module
from multiprocessing import Pool         # offers a convenient means of parallelizing the execution of a function across multiple input values, 
                                         # distributing the input data across processes (data parallelism)
from functools import partial            # higher order function which takes a function as input (like map and filter)

import glob
from tqdm import tqdm
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

!pip install git+https://github.com/mayukh18/reco
import reco
from reco.recommender import FunkSVD
from reco.metrics import rmse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/mayukh18/reco
  Cloning https://github.com/mayukh18/reco to /tmp/pip-req-build-upuoyg1v
  Running command git clone -q https://github.com/mayukh18/reco /tmp/pip-req-build-upuoyg1v
Building wheels for collected packages: reco
  Building wheel for reco (setup.py) ... [?25l[?25hdone
  Created wheel for reco: filename=reco-0.2.1-cp37-cp37m-linux_x86_64.whl size=9678198 sha256=e02742f7d1b39cdd6bc7cb7ca5635b38ae952c7d73be026fa0dc7fbd38f0e6d9
  Stored in directory: /tmp/pip-ephem-wheel-cache-r0llx1kr/wheels/6b/77/06/3fabb6de467036024e54b62021cc5d1dc8cf3a300b17c43089
Successfully built reco
Installing collected packages: reco
Successfully installed reco-0.2.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# read the datasets

customers = pd.read_csv('/content/drive/MyDrive/Project_Data/customers.csv')

articles = pd.read_csv('/content/drive/MyDrive/Project_Data/articles.csv',dtype={'article_id': 'int32'})

submissions = pd.read_csv('/content/drive/MyDrive/Project_Data/sample_submission.csv')

# Content-based filtering

In [4]:
transactions_train = pd.read_csv('/content/drive/MyDrive/Project_Data/transactions_train.csv', dtype={'article_id': str}, parse_dates=['t_dat'])
transactions_train_df = transactions_train.sort_values(["customer_id", "t_dat"], ascending=False)


In [5]:
last_date = transactions_train_df.t_dat.max()
print(last_date)
print(transactions_train_df.loc[transactions_train_df.t_dat==last_date].shape)

2020-09-22 00:00:00
(32866, 5)


In [6]:
most_frequent_articles = list(transactions_train_df.loc[transactions_train_df.t_dat==last_date].article_id.value_counts()[0:12].index)
art_list = []
for art in most_frequent_articles:
    art = "0"+str(art)
    art_list.append(art)
art_str = " ".join(art_list)
print("Frequent articles bought recently: ", art_str)

Frequent articles bought recently:  00924243002 00751471001 00448509014 00918522001 00866731001 00714790020 00788575004 00915529005 00573085028 00918292001 00850917001 00928206001


In [7]:
agg_df = transactions_train_df.groupby(["customer_id"])["article_id"].agg(lambda x: str(x.values[0:12])[1:-1]).reset_index()

In [8]:
def padding_articles(x):
    if x:
        xl = x.split()
        x = []
        for xi in xl:
            x.append("0"+xi)
        dimm_x = len(x)
        if dimm_x < 12:
            x.extend(art_list[:12-dimm_x])
        return(" ".join(x))

In [9]:
agg_df["article_id"] = agg_df["article_id"].apply(lambda x: padding_articles(x))

In [10]:
print("Aggregated transaction history: ", agg_df.customer_id.nunique())
print("Submission sample: ", submissions.customer_id.nunique())

Aggregated transaction history:  1362281
Submission sample:  1371980


In [11]:
print(submissions.shape)
submissions.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [12]:
# For the customers with missing articles, we simply replace with most frequent buyed articles in most recent day(s).

submission_df = agg_df.merge(submissions[["customer_id"]], how="right")
submission_df.columns = ["customer_id", "prediction"]
print(submission_df.shape)
submission_df.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0'0742079001' 0'0732413001' 00924243002 007514...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...


In [13]:
print("Rows with missing data in submission: ", submission_df.loc[submission_df.prediction.isna()].shape[0])

Rows with missing data in submission:  9699


In [14]:
# We replace the missing data with the most frequently bought articles, from recent days. We calculated it before.

submission_df.loc[submission_df.prediction.isna(), ["prediction"]] = art_str

In [15]:
print("Rows with missing data in submission: ", submission_df.loc[submission_df.prediction.isna()].shape[0])

Rows with missing data in submission:  0


In [16]:
submission_df.to_csv("/content/drive/MyDrive/Project_Data/sub_content.csv", index=False)

# Collaborative filtering (user-user)

In [17]:
df = transactions_train
df_sub = pd.read_csv('/content/drive/MyDrive/Project_Data/sample_submission.csv')

In [18]:
dfu = pd.read_csv('/content/drive/MyDrive/Project_Data/customers.csv')
dfi = pd.read_csv('/content/drive/MyDrive/Project_Data/articles.csv', dtype={'article_id': str})

ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

user_to_customer_map = {user_id: customer_id for user_id, customer_id in enumerate(ALL_USERS)}
customer_to_user_map = {customer_id: user_id for user_id, customer_id in enumerate(ALL_USERS)}

item_to_article_map = {item_id: article_id for item_id, article_id in enumerate(ALL_ITEMS)}
article_to_item_map = {article_id: item_id for item_id, article_id in enumerate(ALL_ITEMS)}

del dfu, dfi

In [19]:
df['user_id'] = df['customer_id'].map(customer_to_user_map)
df['item_id'] = df['article_id'].map(article_to_item_map)

In [20]:
N_SIMILAR_USERS = 20

MINIMUM_PURCHASES = 10

START_DATE = '2020-09-07'

DROP_PURCHASED_ITEMS = False

DROP_USER_FROM_HIS_NEIGHBORHOOD = False

TEST_RUN = False

TEST_SIZE = 1000

In [21]:
def flatten(l):

    return [item for sublist in l for item in sublist]

def compare_vectors(v1, v2):

    intersection = len(set(v1) & set(v2))
    denominator = np.sqrt(len(v1) * len(v2))
    return intersection / denominator

def get_similar_users(u, v, dfh):

    similar_users = dfh.apply(lambda v_other: compare_vectors(v, v_other)).sort_values(ascending=False).head(N_SIMILAR_USERS + 1)
    
    if DROP_USER_FROM_HIS_NEIGHBORHOOD:
        similar_users = similar_users[similar_users.index != u]
        
    return similar_users.index.tolist(), similar_users.tolist()

def get_items(u, v, dfh):

    global i, n
    
    users, scores = get_similar_users(u, v, dfh)
    df_nn = pd.DataFrame({'user': users, 'score': scores})
    df_nn['items'] = df_nn.apply(lambda row: dfh.loc[row.user], axis=1)
    df_nn['weighted_items'] = df_nn.apply(lambda row: [(item, row.score) for item in row['items']], axis=1)

    recs = pd.DataFrame(flatten(df_nn['weighted_items'].tolist()), columns=['item', 'score']).groupby('item')['score'].sum().sort_values(ascending=False)
    if DROP_PURCHASED_ITEMS:
        recs = recs[~recs.index.isin(v)]
    # Keep the first 12 and get the item_ids
    i +=1
    if i % 200 == 0:
        pid = mp.current_process().pid
        print(f"[PID {pid:>2d}] Finished {i:3d} / {n:5d} - {i/n*100:3.0f}%")
    return recs.head(12).index.tolist()

def get_items_chunk(user_ids: np.array, dfh: pd.DataFrame):

    global i, n
    i = 0
    
    n = len(user_ids)
    pid = mp.current_process().pid
    print(f"[PID {pid:>2d}] Started working with {n:5d} users")
    
    df_user_vectors = pd.DataFrame(dfh.loc[user_ids]).reset_index()
    df_user_vectors['recs'] = df_user_vectors.apply(lambda row: get_items(row.user_id, row.item_id, dfh), axis=1)
    return df_user_vectors.set_index('user_id')['recs']

def get_recommendations(users: list, dfh: pd.DataFrame):

    time_start = time.time()
    
    # Split into approximately evenly sized chunks
    # We will send just one batch to each CPU 
    user_chunks = np.array_split(users, mp.cpu_count())
    
    f = partial(get_items_chunk, dfh=dfh)
    with Pool(mp.cpu_count()) as p:
        res = p.map(f, user_chunks)
    
    df_rec = pd.DataFrame(pd.concat(res))

    elapsed = (time.time() - time_start) / 60
    print(f"Finished get_recommendations({len(users)}). It took {elapsed:5.2f} mins")
    return df_rec


def uucf(df, start_date=START_DATE):

    df_small = df[df['t_dat'] > start_date]
    print(f"Kept data from {start_date} on. Total rows: {len(df_small)}")
    
    # H stands for "Transaction history"
    # dfh is a series of user_id => list of item_id (the list of purchases in order)
    dfh = df_small.groupby("user_id")['item_id'].apply(lambda items: list(set(items)))
    dfh = dfh[dfh.str.len() >= MINIMUM_PURCHASES]
    if TEST_RUN:
        print("WARNING: TEST_RUN is True. It will be a toy execution.")
        dfh = dfh.head(TEST_SIZE)
    
    users = dfh.index.tolist()
    n_users = len(users)
    print(f"Total users in the time frame with at least {MINIMUM_PURCHASES}: {n_users}")
    
    df_rec = get_recommendations(users, dfh)
    df_rec['customer_id'] = df_rec.index.map(user_to_customer_map)
    df_rec['prediction'] = df_rec['recs'].map(lambda l: [item_to_article_map[i] for i in l])
    
    # Submission ready dataframe
    df_rec.reset_index(drop=True)[['customer_id', 'prediction']]
    return df_rec 

In [22]:
df_recs = uucf(df)

Kept data from 2020-09-07 on. Total rows: 531967
Total users in the time frame with at least 10: 6964
[PID 462] Started working with  3482 users
[PID 463] Started working with  3482 users
[PID 462] Finished 200 /  3482 -   6%
[PID 463] Finished 200 /  3482 -   6%
[PID 462] Finished 400 /  3482 -  11%
[PID 463] Finished 400 /  3482 -  11%
[PID 462] Finished 600 /  3482 -  17%
[PID 463] Finished 600 /  3482 -  17%
[PID 462] Finished 800 /  3482 -  23%
[PID 463] Finished 800 /  3482 -  23%
[PID 462] Finished 1000 /  3482 -  29%
[PID 463] Finished 1000 /  3482 -  29%
[PID 462] Finished 1200 /  3482 -  34%
[PID 463] Finished 1200 /  3482 -  34%
[PID 463] Finished 1400 /  3482 -  40%
[PID 462] Finished 1400 /  3482 -  40%
[PID 463] Finished 1600 /  3482 -  46%
[PID 462] Finished 1600 /  3482 -  46%
[PID 463] Finished 1800 /  3482 -  52%
[PID 462] Finished 1800 /  3482 -  52%
[PID 462] Finished 2000 /  3482 -  57%
[PID 463] Finished 2000 /  3482 -  57%
[PID 463] Finished 2200 /  3482 -  63%
[

In [23]:
df_fill = pd.read_csv('/content/drive/MyDrive/Project_Data/sub_content.csv')

In [24]:
def drop_duplicates(seq):
    """ Remove duplicates of a given sequence keeping order"""
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

def fill_row(row):
    uucf = row['prediction_uucf']
    fill = row['prediction_fill'].split()
    new_list = drop_duplicates(uucf + fill)[:12]
    return ' '.join(new_list)


def fill(df_recs, df_fill):
    df_recs['len'] = df_recs['prediction'].str.len()
    df_recs = pd.merge(df_fill, df_recs, how='left', on='customer_id', suffixes=('_fill', '_uucf'))
    
    
    # No recs from UUCF at all: use the fallback model 
    df_recs.loc[df_recs['prediction_uucf'].isnull(), 'prediction'] = df_recs['prediction_fill']


    # Full UUCF recommendation
    mask = df_recs['prediction_uucf'].notnull() & (df_recs['len'] == 12)
    df_recs.loc[mask, 'prediction'] = df_recs['prediction_uucf']


    # Fill with another model. Not enough recs from UUCF
    fill_mask = df_recs['prediction_uucf'].notnull() & (df_recs['len'] < 12)
    df_recs.loc[fill_mask, 'prediction'] = df_recs[fill_mask].apply(fill_row, axis=1)
    return df_recs.drop(['prediction_uucf', 'prediction_fill', 'len', 'recs'], axis=1)

In [25]:
# Fill with another model
df_sub = fill(df_recs, df_fill)
df_sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0'0742079001' 0'0732413001' 00924243002 007514...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...


In [26]:
df_sub.shape

(1371980, 2)

In [27]:
# Submit
df_sub.to_csv("/content/drive/MyDrive/Project_Data/sub_collaborative.csv", index=False)

# SVD Reranking Model

In [6]:
# !pip install git+https://github.com/mayukh18/reco

In [7]:
data = pd.read_csv('/content/drive/MyDrive/Project_Data/transactions_train.csv', dtype={'article_id':str})
data = data.loc[data['sales_channel_id'] == 2]
data["t_dat"] = pd.to_datetime(data["t_dat"])

In [8]:
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,16)) & (data['t_dat'] < datetime.datetime(2020,9,23))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,31)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,8,31))]

positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

train = pd.concat([train1, train2], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

user_group = pd.concat([train1, train2, train3, train4], axis=0).groupby(['customer_id'])['article_id'].apply(list)

def get_most_freq_next_item(user_group):
    next_items = {}
    for user in tqdm(user_group.keys()):
        items = user_group[user]
        for i,item in enumerate(items[:-1]):
            if item not in next_items:
                next_items[item] = []
            if item != items[i+1]:
                next_items[item].append(items[i+1])

    pred_next = {}
    for item in next_items:
        if len(next_items[item]) >= 5:
            most_common = Counter(next_items[item]).most_common()
            ratio = most_common[0][1]/len(next_items[item])
            if ratio >= 0.1:
                pred_next[item] = most_common[0][0]
            
    return pred_next

pred_next = get_most_freq_next_item(user_group)

100%|██████████| 155638/155638 [00:01<00:00, 154665.46it/s]


In [9]:
# SVD
train = pd.concat([train1, train2, train3, train4], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days**2)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

train['feedback'] = 1
train = train.groupby(['customer_id', 'article_id']).sum().reset_index()

train['feedback'] = train.apply(lambda row: row['feedback']/popular_items_group[row['article_id']], axis=1)

train['feedback'] = train['feedback'].apply(lambda x: 5.0 if x>5.0 else x)
train.drop(['price', 'sales_channel_id'], axis=1, inplace=True)
train['feedback'].describe()

count    669276.000000
mean          0.850796
std           1.520509
min           0.006205
25%           0.057209
50%           0.158365
75%           0.627093
max           5.000000
Name: feedback, dtype: float64

In [10]:
train = train.sample(frac=1).reset_index(drop=True)
train.head()

Unnamed: 0,customer_id,article_id,pop_factor,feedback
0,0bb534f7e537065ed54f8d8084a611d8620001b4f22b4b...,748566027,0.472222,1.351352
1,c24f102825cd1c5f915f00dd5e66f1fcb33f6738c53cee...,832453005,0.003086,5.0
2,cf12d0c770a9baf8266e23739c254d402dde28424ee3fd...,752657001,0.00346,0.08073
3,461be44c8ce11e0757d9f1024ce00c907acc6a1f9bcc3b...,835348010,0.020408,0.152692
4,7ee1ae23a2996d1e282dc53123d99bd05e8934afe98bef...,883307001,0.003086,0.096015


In [11]:
f = FunkSVD(k=8, learning_rate=0.005, regularizer = .01, iterations = 200, method = 'stochastic', bias=True)
f.fit(X=train, formatizer={'user':'customer_id', 'item':'article_id', 'value':'feedback'},verbose=True)

Epoch 0: Error: 0.800922191764222
Epoch 1: Error: 0.5874897083404117
Epoch 2: Error: 0.5084851441080492
Epoch 3: Error: 0.46105831334040825
Epoch 4: Error: 0.42747300213640704
Epoch 5: Error: 0.4019097802839868
Epoch 6: Error: 0.381511241640542
Epoch 7: Error: 0.3646370532840004
Epoch 8: Error: 0.3502774057514303
Epoch 9: Error: 0.33776488737283544
Epoch 10: Error: 0.32667523021939904
Epoch 11: Error: 0.31669813050920737
Epoch 12: Error: 0.3076117791554887
Epoch 13: Error: 0.29924465851827026
Epoch 14: Error: 0.29148222867877494
Epoch 15: Error: 0.2842265422891626
Epoch 16: Error: 0.2773984166733543
Epoch 17: Error: 0.2709401983018213
Epoch 18: Error: 0.26480629991257915
Epoch 19: Error: 0.2589499816227505
Epoch 20: Error: 0.25334212123420186
Epoch 21: Error: 0.24796559441306962
Epoch 22: Error: 0.24280990338856126
Epoch 23: Error: 0.23785294069723992
Epoch 24: Error: 0.2330839465622111
Epoch 25: Error: 0.22849722312429244
Epoch 26: Error: 0.2240797596419642
Epoch 27: Error: 0.21982519

In [12]:
outputs = []
cnt = 0

popular_items = list(popular_items)
userindexes = {f.users[i]:i for i in range(len(f.users))}

for user in tqdm(submissions['customer_id']):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]      
    
    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
str_outputs = []
for output in outputs:
    str_outputs.append(" ".join([str(x) for x in output]))

100%|██████████| 1371980/1371980 [04:09<00:00, 5494.56it/s]


In [13]:
submissions['prediction'] = str_outputs

In [14]:
submissions.to_csv("/content/drive/MyDrive/Project_Data/sub_svd.csv", index=False)