In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
events = pd.read_csv('events.csv')

In [3]:
events.sort_values(by=['timestamp'], ignore_index=True, inplace=True)
events.drop(['timestamp', 'event', 'transactionid'], axis=1, inplace=True)

In [4]:
events.head()

Unnamed: 0,visitorid,itemid
0,693516,297662
1,829044,60987
2,652699,252860
3,1125936,33661
4,693516,297662


In [5]:
train, test = train_test_split(events, test_size=0.2, shuffle=False)

In [6]:
top3 = list(train.itemid.value_counts()[:3].index)
top3

[5411, 461686, 187946]

In [7]:
# 20 mins
train['first'] = train.groupby('visitorid')['itemid'].transform(lambda s: s.value_counts().index[0])
train['second'] = train.groupby('visitorid')['itemid'].transform(lambda s: s.value_counts().index[1] if len(s.value_counts())>1 else np.nan)
train['third'] = train.groupby('visitorid')['itemid'].transform(lambda s: s.value_counts().index[2] if len(s.value_counts())>2 else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['first'] = train.groupby('visitorid')['itemid'].transform(lambda s: s.value_counts().index[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['second'] = train.groupby('visitorid')['itemid'].transform(lambda s: s.value_counts().index[1] if len(s.value_counts())>1 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [8]:
# Create a dict of visitors wih values of 'first', 'second', 'third' columns and save it to the file 'visitors.csv'
visitors = train.groupby('visitorid').agg({'first': 'first', 'second': 'first', 'third': 'first'}).reset_index()
visitors['items_list'] = visitors[['first', 'second', 'third']].values.tolist()
# Count number of zeros in each row and save it to the column 'cold_start_count'
visitors['cold_start_count'] = visitors.apply(lambda row: sum(row.isnull()), axis=1)
visitors.to_csv('visitors.csv', index=False)

In [9]:
visitors = pd.read_csv('visitors.csv')

In [10]:
visitors.head()

Unnamed: 0,visitorid,first,second,third,items_list,cold_start_count
0,1,72028,,,"[72028.0, nan, nan]",2
1,2,325215,342816.0,216305.0,"[325215.0, 342816.0, 216305.0]",0
2,3,385090,,,"[385090.0, nan, nan]",2
3,5,61396,,,"[61396.0, nan, nan]",2
4,7,226353,164941.0,139394.0,"[226353.0, 164941.0, 139394.0]",0


In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551221 entries, 2204880 to 2756100
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   visitorid  551221 non-null  int64
 1   itemid     551221 non-null  int64
dtypes: int64(2)
memory usage: 12.6 MB


In [12]:
# Create columns 'first', 'second', 'third' for test data by merging visitors data with test data
test = pd.merge(test, visitors, on='visitorid', how='left')
test.head()

Unnamed: 0,visitorid,itemid,first,second,third,items_list,cold_start_count
0,526699,304858,12040.0,214029.0,359491.0,"[12040.0, 214029.0, 359491.0]",0.0
1,384670,419836,,,,,
2,1166298,375066,386701.0,375066.0,51483.0,"[386701.0, 375066.0, 51483.0]",0.0
3,531293,48030,,,,,
4,86098,231243,,,,,


In [19]:
# Count Nans in each row and save it to the column 'nans'
test['cold_start_count'] = test.apply(lambda row: sum(row[['first', 'second', 'third']].isnull()), axis=1)

# Replace Nans in rows with top3[0] if there is 1 Nan in the row
test.loc[test['cold_start_count'] == 1, 'third'] = top3[0]
# Replace Nans in rows with top3[0] and top3[1] if there are 2 Nans in the row
test.loc[test['cold_start_count'] == 2, 'second'] = top3[0]
test.loc[test['cold_start_count'] == 2, 'third'] = top3[1]
# Relace Nans in rows with top3[0], top3[1] and top3[2] if there are 3 Nans in the row
test.loc[test['cold_start_count'] == 3, 'first'] = top3[0]
test.loc[test['cold_start_count'] == 3, 'second'] = top3[1]
test.loc[test['cold_start_count'] == 3, 'third'] = top3[2]

In [20]:
# Create 'pred' column by Merge 'firsd', 'second', 'third' columns into list
test['pred'] = test[['first', 'second', 'third']].values.tolist()

In [21]:
test.head()

Unnamed: 0,visitorid,itemid,first,second,third,items_list,cold_start_count,pred,is_true
0,526699,304858,12040.0,214029.0,359491.0,"[12040.0, 214029.0, 359491.0]",0,"[12040.0, 214029.0, 359491.0]",False
1,384670,419836,5411.0,461686.0,187946.0,,3,"[5411.0, 461686.0, 187946.0]",False
2,1166298,375066,386701.0,375066.0,51483.0,"[386701.0, 375066.0, 51483.0]",0,"[386701.0, 375066.0, 51483.0]",True
3,531293,48030,5411.0,461686.0,187946.0,,3,"[5411.0, 461686.0, 187946.0]",False
4,86098,231243,5411.0,461686.0,187946.0,,3,"[5411.0, 461686.0, 187946.0]",False


In [22]:
# Create 'is_true' column by comparing 'pred' column with 'itemid' column
test['is_true'] = test.apply(lambda row: row['itemid'] in row['pred'], axis=1)
results = test.is_true.value_counts()

In [23]:
# Precision@3 is Relevant Items / Viewed Items
metric = results[1] / (results[0] + results[1])
print(f'Metric: {metric:.2%}')

Metric: 2.26%


In [104]:
def show_recs(visitorid):
    visitors = pd.read_csv('visitors.csv')
    top3 = [5411, 461686, 187946]
    print(f'Top 3 recommendations for visitor {visitorid} are:')

    if visitorid not in visitors.visitorid.values:
        first, second, third = top3[0], top3[1], top3[2]
        return first, second, third
    else:
        # Get the row with visitorid
        row = visitors[visitors.visitorid == visitorid]
        # Get the list of items from the row
        items_list = []
        try:
            items_list.append(int(row['first'].values))
        except:
            items_list.append(np.nan)
        try:
            items_list.append(int(row['second'].values))
        except:
            items_list.append(np.nan)
        try:
            items_list.append(int(row['third'].values))
        except:
            items_list.append(np.nan)
        # Get the number of Nans in the item_list
        cold_start_count = row.cold_start_count.values[0]
        # If there is 1 Nan in the row, replace it with top3[0]
        if cold_start_count == 1:
            items_list[2] = top3[0]
            first, second, third = items_list[0], items_list[1], items_list[2]
            return first, second, third
        # If there are 2 Nans in the row, replace it with top3[0] and top3[1]
        elif cold_start_count == 2:
            items_list[1] = top3[0]
            items_list[2] = top3[1]
            first, second, third = items_list[0], items_list[1], items_list[2]
            return first, second, third
        # If there are 3 Nans in the row, replace it with top3[0], top3[1] and top3[2]
        elif cold_start_count == 3:
            items_list[0] = top3[0]
            items_list[1] = top3[1]
            items_list[2] = top3[2]
            first, second, third = items_list[0], items_list[1], items_list[2]
            return first, second, third
        else:
            first, second, third = items_list[0], items_list[1], items_list[2]
            return first, second, third
        

In [110]:
ans1, ans2, asn3 = show_recs(1166298)
print(ans1, ans2, asn3)


Top 3 recommendations for visitor 1166298 are:
386701 375066 51483
