In [1]:
import numpy as np
import pandas as pd
import joblib
import xgboost
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

In [2]:
# Show up to 100 columns when a DataFrame is rendered, so wide feature frames are not truncated.
pd.set_option('display.max_columns', 100)

In [3]:
# Training interactions table. Later cells read its `pid` and `tid` columns
# (popularity counts, per-playlist artist/album lists, `num_items`).
data = pd.read_hdf('df_data/train.hdf')

In [4]:
# Playlist-level metadata for the training and the test playlists.
# The test table supplies `pid` and `num_holdouts`, which are used below as the
# number of hidden tracks per test playlist.
df_playlists_info = pd.read_hdf('df_data/df_playlists_info.hdf')
df_playlists_test_info = pd.read_hdf('df_data/df_playlists_test_info.hdf')

In [5]:
# Track metadata; later cells map track ids through its `album`, `artist`
# and `track_uri` columns.
tracks_info = pd.read_hdf('df_data/df_tracks.hdf')

# Encode the album / artist URI columns as integer labels so they can be
# counted and compared as plain numbers.
for code_col, uri_col in (('album', 'album_uri'), ('artist', 'artist_uri')):
    tracks_info[code_col] = LabelEncoder().fit_transform(tracks_info[uri_col])

In [6]:
# Candidate (pid, tid) rows to be ranked. `train` and `val` also carry a
# `target` column (used as the label further down); `test` is only scored.
train = pd.read_hdf('df_data/ii_candidate.hdf')
val = pd.read_hdf('df_data/iii_candidate.hdf')
test = pd.read_hdf('df_data/test_candidate.hdf')

In [7]:
# Held-out (pid, tid) rows matching the `train` and `val` candidate sets;
# only used below to count how many distinct tracks were hidden per playlist.
train_holdouts = pd.read_hdf('df_data/val1.hdf')
val_holdouts = pd.read_hdf('df_data/val2.hdf')

In [8]:
# Number of hidden tracks per playlist (a Series indexed by pid) for each split.
# Train/val are counted from the holdout tables; for test the count comes
# straight from the playlist metadata.
train_length = train_holdouts.groupby('pid')['tid'].nunique()
val_length = val_holdouts.groupby('pid')['tid'].nunique()
test_length = df_playlists_test_info.set_index('pid')['num_holdouts']

In [9]:
# One more than the largest track id seen in the training interactions.
num_items = 1 + data['tid'].max()

In [10]:
def create_count(df):
    """Add popularity-count features to ``df`` in place.

    All counts are taken over the module-level ``data`` interaction table;
    album and artist ids are looked up by mapping track ids through the
    module-level ``tracks_info`` frame.

    Columns written to ``df``:
        tid_count    -- number of rows in ``data`` with the same ``tid``
        pid_count    -- number of rows in ``data`` with the same ``pid``
        album_count  -- number of rows in ``data`` whose track has the same album
        artist_count -- number of rows in ``data`` whose track has the same artist

    A value that never occurs in ``data`` gets a count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate frame with ``pid`` and ``tid`` columns; mutated in place.

    Returns
    -------
    None
    """
    tid_count = data.tid.value_counts()
    pid_count = data.pid.value_counts()

    df['tid_count'] = df.tid.map(tid_count).fillna(0)
    df['pid_count'] = df.pid.map(pid_count).fillna(0)

    # Album / artist popularity: translate every interaction's track id to its
    # album / artist label first, then count occurrences of each label.
    album_count = data.tid.map(tracks_info.album).value_counts()
    artist_count = data.tid.map(tracks_info.artist).value_counts()

    df['album_count'] = df.tid.map(tracks_info.album).map(album_count).fillna(0)
    df['artist_count'] = df.tid.map(tracks_info.artist).map(artist_count).fillna(0)

In [11]:
def isin(i, j):
    """Return whether ``i`` is contained in ``j``, treating a missing ``j`` as empty.

    Parameters
    ----------
    i : scalar
        Value to look for.
    j : container or missing value
        Collection to search (e.g. a numpy array). ``None`` or a float NaN,
        which is what a failed lookup produces, means "no collection".

    Returns
    -------
    bool
        ``i in j`` for a real collection, ``False`` when ``j`` is missing.
    """
    # Detect NaN by value (NaN != NaN) instead of by identity with np.nan:
    # a NaN that is not the np.nan singleton would otherwise reach `i in j`
    # and raise TypeError.
    if j is None or (isinstance(j, float) and j != j):
        return False
    return i in j

def isin_sum(i, j):
    """Count how many elements of ``j`` equal ``i``, treating a missing ``j`` as empty.

    Parameters
    ----------
    i : scalar
        Value to count.
    j : array-like or missing value
        Array whose elements are compared with ``i`` (the comparison result
        must support ``.sum()``, e.g. a numpy array). ``None`` or a float NaN,
        which is what a failed lookup produces, means "no array".

    Returns
    -------
    int
        Number of matches, or ``0`` when ``j`` is missing.
    """
    # Detect NaN by value (NaN != NaN) instead of by identity with np.nan:
    # a NaN that is not the np.nan singleton would otherwise reach
    # `(i == j).sum()` and raise AttributeError on a plain bool.
    if j is None or (isinstance(j, float) and j != j):
        return 0
    return (i == j).sum()

In [12]:
def creaet_artist_features(df):
    """Add artist-overlap features between each candidate track and its playlist.

    The playlist contents come from the module-level ``data`` table, limited to
    the playlists present in ``df``; artist labels come from mapping track ids
    through module-level ``tracks_info.artist``.

    Columns written to ``df`` (in place):
        share_of_unique_artist  -- distinct artists / number of tracks in the
                                   playlist; -1 when the playlist has no rows in ``data``
        sim_artist_in_playlist  -- how many playlist tracks share the candidate's
                                   artist; 0 when the playlist has no rows in ``data``
        mean_artist_in_playlist -- the above divided by the playlist's track
                                   count; -1 when that count is unavailable

    Returns None.
    """
    # Only the playlists we are building features for are needed.
    known_rows = data[data.pid.isin(df.pid)]

    # pid -> numpy array of the artist label of every track in that playlist.
    artists_by_pid = known_rows.tid.map(tracks_info.artist).groupby(known_rows.pid).apply(np.array)
    playlist_size = artists_by_pid.apply(len)
    unique_share = artists_by_pid.apply(np.unique).apply(len) / playlist_size

    candidate_artist = df.tid.map(tracks_info.artist)
    playlist_artists = df.pid.map(artists_by_pid)

    df['share_of_unique_artist'] = df.pid.map(unique_share).fillna(-1)
    df['sim_artist_in_playlist'] = [
        isin_sum(artist, in_playlist)
        for artist, in_playlist in zip(candidate_artist, playlist_artists)
    ]
    df['mean_artist_in_playlist'] = (
        df['sim_artist_in_playlist'] / df.pid.map(playlist_size)
    ).fillna(-1)

In [13]:
def creaet_album_features(df):
    """Add album-overlap features between each candidate track and its playlist.

    The playlist contents come from the module-level ``data`` table, limited to
    the playlists present in ``df``; album labels come from mapping track ids
    through module-level ``tracks_info.album``.

    Columns written to ``df`` (in place):
        share_of_unique_album  -- distinct albums / number of tracks in the
                                  playlist; -1 when the playlist has no rows in ``data``
        sim_album_in_playlist  -- how many playlist tracks share the candidate's
                                  album; 0 when the playlist has no rows in ``data``
        mean_album_in_playlist -- the above divided by the playlist's track
                                  count; -1 when that count is unavailable

    Returns None.
    """
    # Only the playlists we are building features for are needed.
    known_rows = data[data.pid.isin(df.pid)]

    # pid -> numpy array of the album label of every track in that playlist.
    albums_by_pid = known_rows.tid.map(tracks_info.album).groupby(known_rows.pid).apply(np.array)
    playlist_size = albums_by_pid.apply(len)
    unique_share = albums_by_pid.apply(np.unique).apply(len) / playlist_size

    candidate_album = df.tid.map(tracks_info.album)
    playlist_albums = df.pid.map(albums_by_pid)

    df['share_of_unique_album'] = df.pid.map(unique_share).fillna(-1)
    df['sim_album_in_playlist'] = [
        isin_sum(album, in_playlist)
        for album, in_playlist in zip(candidate_album, playlist_albums)
    ]
    df['mean_album_in_playlist'] = (
        df['sim_album_in_playlist'] / df.pid.map(playlist_size)
    ).fillna(-1)


In [14]:
def create_features(df, df_length):
    """Attach every hand-crafted feature to the candidate frame ``df`` in place.

    Runs the count, artist and album feature builders in that order, then adds
    ``tracks_holdout``: the per-playlist value looked up from ``df_length``
    (a pid-indexed mapping) via ``df.pid``.

    Returns None.
    """
    for add_features in (create_count, creaet_artist_features, creaet_album_features):
        add_features(df)
    df['tracks_holdout'] = df.pid.map(df_length)

In [15]:
# Build the hand-crafted features for every split, pairing each candidate frame
# with its own holdout-size lookup. The frames are modified in place.
for candidates, holdout_sizes in (
    (train, train_length),
    (val, val_length),
    (test, test_length),
):
    create_features(candidates, holdout_sizes)

In [17]:
# Precomputed co-occurrence features. The train/val files carry their own
# `target` column, which is dropped so the merge does not duplicate the label.
train_co = pd.read_hdf('df_data/ii_co_occurence_features.hdf').drop(columns='target')
val_co = pd.read_hdf('df_data/iii_co_occurence_features.hdf').drop(columns='target')
test_co = pd.read_hdf('df_data/test_co_occurence_features.hdf')

# Precomputed LightFM features, handled the same way.
train_lightfm = pd.read_hdf('df_data/ii_lightfm_features.hdf').drop(columns='target')
val_lightfm = pd.read_hdf('df_data/iii_lightfm_features.hdf').drop(columns='target')
test_lightfm = pd.read_hdf('df_data/test_lightfm_features.hdf')

# Join both feature sets onto the candidates by (pid, tid): co-occurrence first,
# LightFM second, so the resulting column order is the same for every split.
# NOTE(review): these are default (inner) merges, so a candidate row missing
# from either feature file would be dropped silently - confirm full coverage.
train = train.merge(train_co, on=['pid', 'tid']).merge(train_lightfm, on=['pid', 'tid'])
val = val.merge(val_co, on=['pid', 'tid']).merge(val_lightfm, on=['pid', 'tid'])
test = test.merge(test_co, on=['pid', 'tid']).merge(test_lightfm, on=['pid', 'tid'])

In [18]:
# Identifier and label columns are not model inputs; everything else is a feature.
cols = ['pid', 'tid', 'target']
xgtrain = xgboost.DMatrix(data=train.drop(columns=cols), label=train['target'])

In [22]:
# Validation matrix: same non-feature columns removed as for training (`cols`).
xgval = xgboost.DMatrix(data=val.drop(columns=cols), label=val['target'])

In [23]:
# Test matrix: no label is passed, so only the identifier columns are removed.
xgtest = xgboost.DMatrix(data=test.drop(columns=['pid', 'tid']))

In [None]:
# Booster configuration: binary classifier over (playlist, track) candidates
# using gradient-boosted trees, evaluated with AUC.
params = {
    'objective':'binary:logistic', 
    'eta':0.1, 
    'booster':'gbtree',
    'max_depth':7,         
    'nthread':50,  
    'seed':1,    
    'eval_metric':'auc',
}

# Train for at most 300 rounds, printing the metric every 10 rounds.
# The eval set labelled 'test' is `xgval` (the validation matrix), not the
# real test data. Per the xgboost docs, early stopping watches the last entry
# of `evals`, i.e. training stops after 30 rounds without improvement on it.
# The parameters are handed over as a list of (name, value) pairs.
a = xgboost.train(
    params=list(params.items()),  
    early_stopping_rounds=30, 
    verbose_eval=10, 
    dtrain=xgtrain,
    evals=[(xgtrain, 'train'), (xgval, 'test')],
    num_boost_round=300,
)

In [25]:
# Score the validation candidates and store the prediction next to each row
# so the candidates can be ranked per playlist.
# NOTE(review): the booster was trained with early stopping; depending on the
# xgboost version, predict() may use every trained round rather than the best
# iteration - confirm, and restrict the iteration range if needed.
p = a.predict(xgval)
val['p'] = p

In [None]:
# Per-playlist precision at n, where n is the number of hidden tracks of that
# playlist: take the n highest-scored candidates and measure the fraction that
# are true holdouts (target == 1). The cell displays the mean over playlists.
ranked_val = val.sort_values('p', ascending=False)
scores = [
    group.target.iloc[:val_length[pid]].sum() / val_length[pid]
    for pid, group in ranked_val.groupby('pid')
]
np.mean(scores)

In [27]:
# Score the test candidates, rank them within each playlist (best first) and
# keep at most the 500 highest-scored track ids per playlist.
test['p'] = a.predict(xgtest)
test = test.sort_values(by=['pid', 'p'], ascending=[True, False])
recs = test.groupby('pid')['tid'].apply(lambda ranked_tids: ranked_tids.values[:500])

# Lookup used when writing the submission to turn track ids into track URIs.
track_uri = tracks_info['track_uri']

In [28]:
# Write the submission file: one header line with the team info, then one line
# per playlist of the form "<pid>, <uri>, <uri>, ...".
# The `with` block guarantees the file is closed even if a track id is missing
# from `track_uri` (KeyError) or a write fails part-way through.
with open('submission.csv', 'w') as submission:
    submission.write('team_info,main,Avito,vrubcov@hse.ru\n')

    for pid, tids in recs.items():
        submission.write('{}, '.format(pid) + ', '.join(track_uri.loc[tids].values) + '\n')