In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Load all the data

In [2]:
# Read the raw CSV tables (paths are relative to the working directory).
# members.csv is the only file whose date columns are parsed while loading.
members = pd.read_csv(
    'members.csv',
    parse_dates=['registration_init_time', 'expiration_date'],
)
songs = pd.read_csv('songs.csv')
songs_extra = pd.read_csv('song_extra_info.csv')

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Build the datasets

In [3]:
# Subset of song attributes that gets joined onto the interaction rows.
songs_mi = songs[['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']]

# Left joins on song_id: no train/test row is dropped when a song has no match.
train = pd.merge(train, songs_mi, how='left', on='song_id')
test = pd.merge(test, songs_mi, how='left', on='song_id')

In [4]:
# Split both membership dates into year / month / day feature columns.
# The vectorized `.dt` accessor replaces a per-row `apply(lambda ...)`.
# NOTE(review): `.dt` needs datetime64 columns - presumably guaranteed by the
# `parse_dates` argument used when members.csv is read; confirm.
for prefix, date_col in (('registration', 'registration_init_time'),
                         ('expiration', 'expiration_date')):
    members[prefix + '_year'] = members[date_col].dt.year
    members[prefix + '_month'] = members[date_col].dt.month
    members[prefix + '_day'] = members[date_col].dt.day

# The raw timestamp columns are removed once their components are extracted.
members = members.drop(columns=['registration_init_time', 'expiration_date'])

In [5]:
def isrc_to_year(isrc, pivot=17):
    """Extract a four-digit year from an ISRC string.

    The slice ``isrc[5:7]`` is read as a two-digit year. Values greater than
    ``pivot`` are mapped to 19xx, all other values to 20xx.

    Parameters
    ----------
    isrc : object
        The code to parse. Anything that is not a ``str`` (e.g. NaN for a
        missing value) is treated as missing.
    pivot : int, default 17
        Largest two-digit year that is still assigned to the 2000s.

    Returns
    -------
    int or float
        The year, or ``np.nan`` when ``isrc`` is not a string or when its
        year field cannot be parsed as an integer (empty or truncated code,
        non-numeric characters).
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit = int(isrc[5:7])
    except ValueError:
        # Malformed or truncated codes are reported as missing instead of
        # raising, so one bad row cannot abort a whole `.apply` call.
        return np.nan
    century = 1900 if two_digit > pivot else 2000
    return century + two_digit

In [6]:
# Derive a year from every ISRC code, then discard the raw text columns.
songs_extra['song_year'] = songs_extra['isrc'].map(isrc_to_year)
songs_extra.drop(columns=['isrc', 'name'], inplace=True)

In [7]:
# Attach member attributes (keyed by msno) and the extra song info
# (keyed by song_id). Both are left joins, so no interaction row is dropped.
train = (
    train
    .merge(members, how='left', on='msno')
    .merge(songs_extra, how='left', on='song_id')
)
test = (
    test
    .merge(members, how='left', on='msno')
    .merge(songs_extra, how='left', on='song_id')
)

In [9]:
# Cast every column of `train` except the label to pandas `category` dtype,
# and apply the same cast to the identically named columns of `test`.
feature_cols = [name for name in train.columns if name != 'target']
for frame in (train, test):
    for name in feature_cols:
        frame[name] = frame[name].astype('category')

In [10]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns=['target'])

In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [12]:
# Five folds; shuffle=False (the scikit-learn default) keeps rows in file order.
kf = KFold(n_splits=5, shuffle=False)

In [13]:
# LightGBM hyper-parameters. They are identical for every fold, so the dict
# is built once instead of being rebuilt inside the loop.
params = {
    'learning_rate': 0.2,
    'application': 'binary',  # LightGBM alias of `objective`
    'max_depth': 8,
    'num_leaves': 2**8,
    'verbosity': 0,
    'metric': 'auc',
}

# One (booster, held-out AUC) tuple is appended per fold.
k_fold_results = []
for train_index, test_index in kf.split(X):
    print("Start iteration")
    # Held-out fold: used only to score the model trained in this iteration.
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # The remaining rows are split again (80/20, stratified on the label)
    # into a fitting set and a validation set. The validation set is only
    # monitored during training; no early stopping is configured.
    XX = X.iloc[train_index]
    yy = y.iloc[train_index]
    X_train, X_val, y_train, y_val = train_test_split(
        XX, yy, test_size=0.2, random_state=42, stratify=yy)

    print('Training LGBM model...')
    print('prepare model')
    lgbtrain = lgb.Dataset(X_train, y_train)
    # Validation data is tied to the training Dataset via `reference`.
    lgbval = lgb.Dataset(X_val, y_val, reference=lgbtrain)

    print('learning...')
    evals = {}  # filled by record_evaluation with the per-round metric history
    gbm = lgb.train(
        params,
        train_set=lgbtrain,
        num_boost_round=50,
        valid_sets=[lgbtrain, lgbval],
        # Callbacks replace the `verbose_eval` / `evals_result` keyword
        # arguments, which newer LightGBM releases no longer accept.
        callbacks=[
            lgb.log_evaluation(period=5),
            lgb.record_evaluation(evals),
        ],
    )

    y_pred = gbm.predict(X_test)
    scr = roc_auc_score(y_test, y_pred)

    k_fold_results.append((gbm, scr))

Start iteration
Training LGBM model...
prepare model
learning...
[5]	training's auc: 0.68861	valid_1's auc: 0.685922
[10]	training's auc: 0.704484	valid_1's auc: 0.70055
[15]	training's auc: 0.713689	valid_1's auc: 0.709235
[20]	training's auc: 0.718527	valid_1's auc: 0.713748
[25]	training's auc: 0.723919	valid_1's auc: 0.718577
[30]	training's auc: 0.727401	valid_1's auc: 0.721831
[35]	training's auc: 0.72982	valid_1's auc: 0.724051
[40]	training's auc: 0.732468	valid_1's auc: 0.726435
[45]	training's auc: 0.735113	valid_1's auc: 0.728823
[50]	training's auc: 0.737154	valid_1's auc: 0.73066
Start iteration
Training LGBM model...
prepare model
learning...
[5]	training's auc: 0.707827	valid_1's auc: 0.704909
[10]	training's auc: 0.72265	valid_1's auc: 0.718675
[15]	training's auc: 0.731697	valid_1's auc: 0.727158
[20]	training's auc: 0.736638	valid_1's auc: 0.731704
[25]	training's auc: 0.739153	valid_1's auc: 0.734094
[30]	training's auc: 0.742987	valid_1's auc: 0.737638
[35]	training

In [20]:
# Mean held-out AUC over all folds. Only the scores are averaged; the fitted
# boosters stored next to them are not packed into an array.
float(np.mean([score for _model, score in k_fold_results]))

0.7030190287116239