In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Raw inputs: the event log, the two-part item-property dump, and the category tree.
events = pd.read_csv('events.csv')

props_1 = pd.read_csv('item_properties_part1.csv')
props_2 = pd.read_csv('item_properties_part2.csv')
# ignore_index=True: each part is read with its own default 0..n-1 index, so a plain
# concat would leave duplicate index labels in `props`.
props = pd.concat([props_1, props_2], ignore_index=True)

categories = pd.read_csv("category_tree.csv")

In [3]:
# Preview the first rows of the event log.
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Column dtypes and memory footprint of the event log.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [5]:
# How many event records are in the dataset? (Answer with digits only, no spaces or punctuation.)
events.shape[0]

2756101

In [6]:
# Which event types does the dataset contain? (Select all that apply.)
events['event'].value_counts()

view           2664312
addtocart        69332
transaction      22457
Name: event, dtype: int64

In [7]:
# How many unique item properties are in the dataset? (Answer with digits only, no spaces or punctuation.)
props['property'].nunique()

1104

## Описание данных

events — датасет с событиями:
* timestamp — время события
* visitorid — идентификатор пользователя
* event — тип события
* itemid — идентификатор объекта
* transactionid — идентификатор транзакции, если она проходила

category_tree — файл с деревом категорий (можно восстановить дерево):
* category_id — идентификатор категорий
* parent_id — идентификатор родительской категории

item_properties — файл со свойствами товаров:
* timestamp — момент записи значения свойства
* item_id — идентификатор объекта
* property — свойство, кажется, они все, кроме категории, захешированы
* value — значение свойства

## Целевая метрика

Precision@3

## Дополнительное задание первой недели

In [8]:
# What percentage of sales do the top items account for (cutoff date: July 1)?

In [9]:
# Purchases only. The explicit .copy() makes `sales` an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
sales = events[events['event'] == 'transaction'].copy()
sales.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0
418,1433193915008,552148,transaction,81345,5444.0
814,1433176736375,102019,transaction,150318,13556.0
843,1433174518180,189384,transaction,310791,7244.0


In [10]:
# Keep only purchases that happened strictly before the 1 July 2015 cutoff.
# `timestamp` is decoded as milliseconds since the epoch.
sales_cutoff = pd.Timestamp('2015-07-01 00:00:00.000000')
sales_event_time = pd.to_datetime(sales['timestamp'], unit='ms')
sales_train = sales[sales_event_time < sales_cutoff]

In [11]:
# Number of pre-cutoff purchases per item, most-purchased first.
sales_train['itemid'].value_counts()

119736    36
369447    31
7943      30
461686    23
318333    23
          ..
176309     1
229555     1
413865     1
202920     1
363040     1
Name: itemid, Length: 6231, dtype: int64

In [12]:
# Share (in %) of all pre-cutoff sales contributed by the single best-selling item
# (about 0.37% on this data). value_counts() sorts descending, so .iloc[0] is the
# top item's count; this avoids hard-coding the item id.
sales_train['itemid'].value_counts().iloc[0] / len(sales_train) * 100

0.3729024238657551

In [13]:
# @Leon Kochiev (mentor)
# Choose enough "top" items that together they cover about half of all pre-cutoff sales.
# Implementation: rank items by number of transaction rows and count the longest prefix
# whose cumulative share of sales does not exceed 50%.
sales_train_gr = (
    sales_train
    .groupby(['itemid'])
    .count()
    .sort_values('event', ascending=False)
)
sum_sales_train_grp = sales_train_gr['event'].sum()
sales_train_gr['percentage'] = sales_train_gr['event'].div(sum_sales_train_grp).mul(100)
cumulative_share = sales_train_gr['percentage'].cumsum()
top_50_percent_items = (cumulative_share <= 50).sum()

# Fraction of distinct sold items in that prefix: about 24.9% of items give 50% of transaction events.
top_50_percent_items / sales_train.itemid.nunique()

0.24971914620446156

In [14]:
# Add an `event_datetime` column, decoded from the millisecond-epoch `timestamp`,
# to each of the three frames.
for frame in (events, props, sales):
    frame['event_datetime'] = pd.to_datetime(frame['timestamp'], unit='ms')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['event_datetime'] = pd.to_datetime(sales['timestamp'], unit = 'ms')


## Дополнительное задание третьей недели 

In [15]:
# Which item property is NOT among the 20 most frequent ones?
props['property'].value_counts().head(20)  # color is not on list

888           3000398
790           1790516
available     1503639
categoryid     788214
6              631471
283            597419
776            574220
678            481966
364            476486
202            448938
839            417239
917            417227
112            417053
764            417053
159            417053
227            347492
698            289849
451            264416
663            240813
962            239372
Name: property, dtype: int64

In [16]:
# Keep only the most widespread properties, e.g. the top 20 (the cutoff can be tuned when modelling).
# A property's reach is the number of distinct items that carry it, so (itemid, property)
# duplicates are removed before counting.
unique_item_props = props.drop_duplicates(['itemid', 'property'])
items_per_property = unique_item_props.groupby("property")['itemid'].count()
top_properties = items_per_property.sort_values(ascending=False).head(20)

keep_properties = set(top_properties.index)
props = props[props['property'].isin(keep_properties)]
props.shape

(13563669, 5)

# Генерация признаков

In [17]:
# Preview the property rows that survived the top-20 filter.
props.head()

Unnamed: 0,timestamp,itemid,property,value,event_datetime
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00
4,1431831600000,156781,917,828513,2015-05-17 03:00:00
5,1436065200000,285026,available,0,2015-07-05 03:00:00


In [18]:
# String version of the property name; the commented-out tail would also append the value.
# NOTE(review): the next cell keeps only ['itemid', 'property'], so this 'prop' column
# is dropped again right away — confirm whether it is still needed.
props["prop"] = props["property"].astype(str) # + " " + props["value"].astype(str)

In [19]:
# Narrow `props` down to the item id and the property name.
props = props.loc[:, ['itemid', 'property']]

In [20]:
# Confirm that only the two selected columns remain.
props.head()

Unnamed: 0,itemid,property
0,460429,categoryid
1,206783,888
3,59481,790
4,156781,917
5,285026,available


In [21]:
# props_items = props.groupby('itemid')['prop'].apply(list).reset_index(name='list_of_prop_values')

In [22]:
# props_items.head()

In [23]:
# props of items as a feature in events table
# (the merge with `props` is currently disabled, so the feature table is built from events only)
# events_with_item_props = events.merge(props, how='inner', on='itemid')
# events_with_item_props.dropna(subset=['property'], inplace=True)
# Take an independent copy instead of an alias: the feature-engineering cells add columns
# and sort this frame in place, which would otherwise silently modify `events` as well.
events_with_item_props = events.copy()

In [24]:
# Missing values per column.
events_with_item_props.isna().sum()

timestamp               0
visitorid               0
event                   0
itemid                  0
transactionid     2733644
event_datetime          0
dtype: int64

In [25]:
# Feature: total number of events recorded for each visitor, repeated on every row of that visitor.
# NOTE(review): the count is taken over the whole log, before the later time-based
# train/test split, so train rows carry information about test-period activity — confirm this is intended.
per_visitor = events_with_item_props.groupby('visitorid')['visitorid']
events_with_item_props['freq'] = per_visitor.transform('count')

# Order the log chronologically (in place), then preview it.
events_with_item_props.sort_values('timestamp', inplace=True)
events_with_item_props.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,freq
1462974,1430622004384,693516,addtocart,297662,,2015-05-03 03:00:04.384,3
1464806,1430622011289,829044,view,60987,,2015-05-03 03:00:11.289,1
1463000,1430622013048,652699,view,252860,,2015-05-03 03:00:13.048,1
1465287,1430622024154,1125936,view,33661,,2015-05-03 03:00:24.154,1
1462955,1430622026228,693516,view,297662,,2015-05-03 03:00:26.228,3


In [26]:
# Calendar features derived from `event_datetime`.
# The vectorized .dt accessor replaces a per-row Python lambda for each field; the values
# are the same (weekday: Monday=0 ... Sunday=6).
event_dt = events_with_item_props['event_datetime'].dt
events_with_item_props['day_of_week'] = event_dt.weekday
events_with_item_props['Year'] = event_dt.year
events_with_item_props['Month'] = event_dt.month
events_with_item_props['Day'] = event_dt.day
events_with_item_props['Hour'] = event_dt.hour
events_with_item_props['minute'] = event_dt.minute

In [27]:
def get_time_periods(hour):
    """Map an hour of the day to a coarse day-period label.

    Half-open ranges: [3, 7) -> 'Dawn', [7, 12) -> 'Morning',
    [12, 16) -> 'Afternoon', [16, 22) -> 'Evening'.
    Any other value (22, 23, 0, 1, 2, ...) -> 'Night'.
    """
    bounded_periods = (
        (3, 7, 'Dawn'),
        (7, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 22, 'Evening'),
    )
    for start, end, label in bounded_periods:
        if start <= hour < end:
            return label
    # Everything outside the ranges above counts as night.
    return 'Night'
    
# Label every event with its day period and show how events are distributed across periods.
# NOTE(review): `Hour` comes from epoch timestamps decoded without a time zone, so the
# periods are presumably in UTC rather than the visitors' local time — confirm.
hours = events_with_item_props['Hour']
events_with_item_props['Day Period'] = hours.apply(get_time_periods)
events_with_item_props['Day Period'].value_counts()

Evening      1078199
Night         765924
Dawn          494588
Afternoon     293490
Morning       123900
Name: Day Period, dtype: int64

In [28]:
# Persist the feature table. The row index is written too (index=True is the default),
# which is why an 'Unnamed: 0' column appears when the file is read back.
events_with_item_props.to_csv('data.csv', index=True)


## Train-test-split

In [2]:
# Reload the saved feature table. index_col=None (the default) keeps the saved index
# as an ordinary column named 'Unnamed: 0'.
data = pd.read_csv('data.csv', index_col=None)


In [7]:
# Time-based split: events before 15 Aug 2015 go to train, events from that moment on go to test.
split_cutoff = pd.Timestamp('2015-08-15 00:00:00.000000')
event_time = pd.to_datetime(data['timestamp'], unit='ms')
train = data[event_time < split_cutoff]
test = data[event_time >= split_cutoff]

In [8]:
# Fraction of all rows that ended up in the training set.
train.shape[0] / data.shape[0]

0.7804920792089985

In [9]:
# Fraction of all rows that ended up in the test set.
test.shape[0] / data.shape[0]

0.21950792079100148

## Model & Predictions

### Baseline - Most frequent items

### Cosine Similarity

In [10]:
# Dropping text values (we'll be using them in another approach), together with the
# saved index column, the raw timestamp and the transaction id.
# `train` and `test` are slices of `data`; dropping with inplace=True on a slice raises
# SettingWithCopyWarning, so the result of drop() is assigned back instead.
non_feature_columns = ['Unnamed: 0', 'timestamp', 'event', 'transactionid', 'event_datetime', 'Day Period']
train = train.drop(columns=non_feature_columns)
test = test.drop(columns=non_feature_columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
# train['property'] = pd.to_numeric(train['property'], errors='coerce')
# train['property'].fillna(0, inplace=True)

# test['property'] = pd.to_numeric(test['property'], errors='coerce')
# test['property'].fillna(0, inplace=True)

In [12]:
# train['property'] = train['property'].astype('category')
# test['property'] = test['property'].astype('category')
# train['Day Period'] = train['Day Period'].astype('category')
# test['Day Period'] = test['Day Period'].astype('category')


In [13]:
# Preview the columns that remain in the training frame.
train.head()

Unnamed: 0,visitorid,itemid,freq,day_of_week,Year,Month,Day,Hour,minute
0,693516,297662,3,6,2015,5,3,3,0
1,829044,60987,1,6,2015,5,3,3,0
2,652699,252860,1,6,2015,5,3,3,0
3,1125936,33661,1,6,2015,5,3,3,0
4,693516,297662,3,6,2015,5,3,3,0


In [14]:
# Convert both frames to plain NumPy arrays for the similarity computation.
train_np, test_np = (frame.to_numpy() for frame in (train, test))

In [15]:
# Shapes of the train and test arrays, one per line.
print(train_np.shape, test_np.shape, sep='\n')

(2151115, 9)
(604986, 9)


In [None]:
# Cosine similarity (not distance) of every record with every other record of the same set.
# With a single argument, cosine_similarity compares the rows of the array with themselves.
# NOTE(review): the result is a dense N x N matrix. For the 2,151,115 training rows that is
# about 4.6e12 entries (tens of terabytes, assuming 8-byte floats), so this cell cannot
# finish in memory; it needs sampling or a top-k neighbour search instead.
# NOTE(review): the rows still contain raw visitorid/itemid values, whose magnitudes
# dominate the cosine — confirm that these columns are meant to be features.
cosine_sim_train = cosine_similarity(train_np)
cosine_sim_test = cosine_similarity(test_np)

In [None]:
# run out of memory
# source: rai-harshit/music-recommendation-engine