In [2]:
import pandas as pd
import numpy as np
import json
from lightfm import LightFM
from scipy import sparse as sp
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pymorphy2
import re

In [2]:
train_full = pd.read_csv('./data/train.csv', parse_dates=['event_date'])
item_data = pd.read_csv('./data/item_data.csv')
test_users = pd.read_csv('./data/test_users.csv')

In [3]:
# Перекодируем microcat
enc = LabelEncoder()
item_data['microcat_id'] = enc.fit_transform(item_data['microcat_id'])

In [24]:
# Описание item это item_id и microcat_id
item_data = item_data.sort_values('item_id')
enc = OneHotEncoder()
item_features = enc.fit_transform(item_data[['item_id','microcat_id']])

In [5]:
with open('data/item_titles.json') as f:
    titles_json = json.load(f)

df_titles = pd.DataFrame(titles_json['data'])
df_titles.columns = titles_json['columns']
df_titles = df_titles.sort_values('item_id')

# Приведем слова в title к нормальной форме
morph = pymorphy2.MorphAnalyzer()
def parse(x):
    return ' '.join([morph.normal_forms(i)[0] for i in re.findall('\w+', x)]) 

In [6]:
%%time
df_titles.title = df_titles.title.apply(parse)

CPU times: user 1h 5min 1s, sys: 12.4 s, total: 1h 5min 14s
Wall time: 1h 5min 23s


In [25]:
vectorizer = CountVectorizer(max_features=1000, binary=True)
title = vectorizer.fit_transform(df_titles.title)

# Теперь добавим к описанию itemа наличие слов из топ-10000 в title
item_features = sp.hstack([item_features, title])

In [8]:
def get_dict_set(X, col):
    some_dict = {}
    for i, j in X[col].values:
        if i not in some_dict:
            some_dict[i] = set([j])
        else:
            some_dict[i].add(j)
    return some_dict

In [9]:
train_full = train_full.merge(item_data, on='item_id')

# user_location - в каком регионе чаще всего ищет пользователь 
user_location = train_full.groupby('user_id').location_id.apply(lambda x: stats.mode(x)[0][0]) 
users = test_users['user_id'].values

# location2item - словарь, где по региону ставится id товаров в этом регионе
location2item = get_dict_set(item_data[item_data.active_during_test == 1], ['location_id', 'item_id'])

# user_seen - уже просмотренные пользователем товары
user_seen = get_dict_set(train_full, ['user_id', 'item_id'])

mode_loc = user_location.mode().values[0]

In [10]:
# Посчитаем веса. Чем дальше по времени, тем больший вес
weight = train_full.event_date - train_full.event_date.min()
weight = weight.apply(lambda x: x.total_seconds())
weight = (weight/ weight.max())**8

shape = (train_full.user_id.max() + 1, train_full.item_id.max() + 1)

In [26]:
W = sp.coo_matrix((weight, (train_full.user_id, train_full.item_id)), shape=shape)
M = sp.coo_matrix(([1]*len(train_full), (train_full.user_id, train_full.item_id)), shape=shape)

In [27]:
W.tocsr()
M.tocsr()

<646888x5436440 sparse matrix of type '<class 'numpy.int64'>'
	with 21847937 stored elements in Compressed Sparse Row format>

In [28]:
%%time
model = LightFM(learning_rate=0.03, loss='warp', no_components=100, random_state=241)
model.fit(M, epochs=70, item_features=item_features, sample_weight=W)

CPU times: user 5h 33min 40s, sys: 31.2 s, total: 5h 34min 12s
Wall time: 5h 34min 19s


In [30]:
%%time
j = 0
users_to_sub = []
items_to_sub = []
rank = []
for u in users:
    j += 1
    items = np.array(list(location2item[user_location.get(u, mode_loc)].difference(user_seen.get(u, set()))))
    result = model.predict(u, items, item_features=item_features)
    top50 = items[result.argsort()[-50:]]
    users_to_sub += [u]*len(top50)
    items_to_sub += list(top50)
    rank += list(range(len(top50) - 1, -1, -1)) 
    if j % 10000 == 0:
        print(j / 1000, '%')

10.0 %
20.0 %
30.0 %
40.0 %
50.0 %
60.0 %
70.0 %
80.0 %
90.0 %
100.0 %
CPU times: user 2d 14h 23min 54s, sys: 3h 30min 42s, total: 2d 17h 54min 37s
Wall time: 3d 4h 13min 22s


In [31]:
submission = pd.DataFrame()
submission['user_id'] = users_to_sub
submission['item_id'] = items_to_sub
submission['rank'] = rank

In [34]:
submission[['user_id', 'item_id']].to_csv('20170609_avito_2017_VasiliyRubtsov.csv', index=False)
submission.to_csv('lightFM.csv')