# Table of Contents
 <p><div class="lev3 toc-item"><a href="#Based-on-the-following-notebooks" data-toc-modified-id="Based-on-the-following-notebooks-001"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>Based on the following notebooks</a></div><div class="lev3 toc-item"><a href="#Load-input-data" data-toc-modified-id="Load-input-data-002"><span class="toc-item-num">0.0.2&nbsp;&nbsp;</span>Load input data</a></div><div class="lev3 toc-item"><a href="#Merge-DFs" data-toc-modified-id="Merge-DFs-003"><span class="toc-item-num">0.0.3&nbsp;&nbsp;</span>Merge DFs</a></div><div class="lev3 toc-item"><a href="#Add-features" data-toc-modified-id="Add-features-004"><span class="toc-item-num">0.0.4&nbsp;&nbsp;</span>Add features</a></div>

### Based on the following notebooks

https://www.kaggle.com/asmitavikas/feature-engineered-0-68310

https://www.kaggle.com/vinnsvinay/introduction-to-boosting-using-lgbm-lb-0-68357

https://www.kaggle.com/kamilkk/i-have-to-say-this

Environment used for this run:

- Python 2.7
- pandas 0.19

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc


In [2]:
# Show which files are available in the input directory.
from subprocess import check_output
_input_listing = check_output(["ls", "../input"])
print(_input_listing.decode("utf8"))

members.csv
sample_submission.csv
song_extra_info.csv
songs.csv
submission_lgbm_avg.csv.gz
test.csv
train.csv



### Load input data

In [3]:
data_path = '../input/'

# Open question from the original author: is 'category' really the right
# dtype for msno, song_id etc.?
# The interaction files share the same categorical columns; train.csv
# additionally carries the binary 'target' label.
_test_dtypes = dict.fromkeys(
    ['msno', 'source_system_tab', 'source_screen_name', 'source_type', 'song_id'],
    'category')
_train_dtypes = dict(_test_dtypes, target=np.uint8)
train = pd.read_csv(data_path + 'train.csv', dtype=_train_dtypes)
test = pd.read_csv(data_path + 'test.csv', dtype=_test_dtypes)

# Song metadata: every listed column is parsed as a categorical.
_song_dtypes = dict.fromkeys(
    ['genre_ids', 'language', 'artist_name', 'composer', 'lyricist', 'song_id'],
    'category')
songs = pd.read_csv(data_path + 'songs.csv', dtype=_song_dtypes)

# Member metadata: the two date columns are parsed into datetimes so that
# date parts can be extracted from them later.
_member_dtypes = {'city': 'category',
                  'bd': np.uint8,
                  'gender': 'category',
                  'registered_via': 'category'}
members = pd.read_csv(data_path + 'members.csv',
                      dtype=_member_dtypes,
                      parse_dates=['registration_init_time', 'expiration_date'])

# Extra song information is loaded with pandas' default dtypes.
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')

### Merge DFs

In [4]:
# Attach the song metadata to every interaction row (rows without a match
# in songs.csv keep NaN in the song columns).
train = pd.merge(train, songs, on='song_id', how='left')
test = pd.merge(test, songs, on='song_id', how='left')

# Keep handles on the two datetime columns before they are replaced/dropped.
_registered_on = members['registration_init_time']
_expires_on = members['expiration_date']

# Whole days between registration and expiration.
members['membership_days'] = (_expires_on - _registered_on).dt.days.astype(int)

# Calendar parts of the registration timestamp.
members['registration_year'] = _registered_on.dt.year
members['registration_month'] = _registered_on.dt.month
members['registration_date'] = _registered_on.dt.day

# Calendar parts of the expiration timestamp.
members['expiration_year'] = _expires_on.dt.year
members['expiration_month'] = _expires_on.dt.month
# NOTE: this overwrites the datetime column with its day-of-month, so it
# must stay after every other use of the full expiration timestamp.
members['expiration_date'] = _expires_on.dt.day
members = members.drop(['registration_init_time'], axis=1)

def isrc_to_year(isrc):
    """Derive a four-digit year from the two characters at positions 5-6 of an ISRC.

    Parameters
    ----------
    isrc : object
        ISRC code as a string; anything that is not a ``str`` (for example
        the NaN pandas uses for missing values) is treated as missing.

    Returns
    -------
    int or float
        ``1900 + yy`` when the two-digit value ``yy`` is greater than 17,
        otherwise ``2000 + yy``.  ``np.nan`` when the input is not a string
        or when positions 5-6 are not exactly two ASCII digits.
    """
    if not isinstance(isrc, str):
        return np.nan

    year_digits = isrc[5:7]
    # A truncated or malformed code must not abort a whole ``.apply`` with a
    # ValueError from int(); treat it like a missing value instead.
    if len(year_digits) != 2 or not set(year_digits) <= set('0123456789'):
        return np.nan

    two_digit_year = int(year_digits)
    # Two-digit years above 17 are mapped to the 1900s, the rest to the 2000s.
    if two_digit_year > 17:
        return 1900 + two_digit_year
    return 2000 + two_digit_year
        
# Derive the song year from the ISRC, then drop the raw columns that are
# not used as features.
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# Attach the member features to every interaction row.
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

# Attach song_year.  Missing song lengths are replaced by the placeholder
# 200000 before the cast to an unsigned integer (NaN cannot be cast).
# The fill is written back by assignment: an inplace fillna on a column
# selected from the frame is not guaranteed to update the frame itself.
train = train.merge(songs_extra, on = 'song_id', how = 'left')
train['song_length'] = train['song_length'].fillna(200000).astype(np.uint32)
train['song_id'] = train['song_id'].astype('category')


test = test.merge(songs_extra, on = 'song_id', how = 'left')
test['song_length'] = test['song_length'].fillna(200000).astype(np.uint32)
test['song_id'] = test['song_id'].astype('category')

### Add features

In [5]:
def genre_id_count(x):
    """Count the '|'-separated genre ids in ``x``.

    Returns 0 for the placeholder 'no_genre_id', otherwise the number of
    '|' separators plus one.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')

# Replace missing genres with the placeholder that genre_id_count() expects.
# The result is assigned back instead of using inplace=True on a column
# selection, which is not guaranteed to modify the parent frame.  The cast
# to object lets the new label be inserted even when the merged column is
# categorical (a categorical rejects fill values outside its categories).
train['genre_ids'] = train['genre_ids'].astype(object).fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].astype(object).fillna('no_genre_id')
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

def lyricist_count(x):
    """Estimate how many lyricists are named in ``x``.

    Returns 0 for the placeholder 'no_lyricist'; otherwise the number of
    separator characters ('|', '/', backslash, ';') found in ``x`` plus one.
    """
    if x == 'no_lyricist':
        return 0
    # One more name than there are separators.
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Replace missing lyricists with the placeholder that lyricist_count()
# expects.  Assigned back rather than filled inplace on a column selection
# (which is not guaranteed to modify the parent frame); the cast to object
# lets the new label be inserted even when the merged column is categorical.
train['lyricist'] = train['lyricist'].astype(object).fillna('no_lyricist')
test['lyricist'] = test['lyricist'].astype(object).fillna('no_lyricist')
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)

def composer_count(x):
    """Estimate how many composers are named in ``x``.

    The placeholder 'no_composer' yields 0.  Any other string yields one
    plus the total number of '|', '/', backslash and ';' characters in it.
    """
    if x != 'no_composer':
        separators = 0
        for sep in ('|', '/', '\\', ';'):
            separators += x.count(sep)
        return separators + 1
    return 0

# Replace missing composers with the placeholder that composer_count()
# expects.  Assigned back rather than filled inplace on a column selection
# (which is not guaranteed to modify the parent frame); the cast to object
# lets the new label be inserted even when the merged column is categorical.
train['composer'] = train['composer'].astype(object).fillna('no_composer')
test['composer'] = test['composer'].astype(object).fillna('no_composer')
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].apply(composer_count).astype(np.int8)

def is_featured(x):
    """Return 1 when the text form of ``x`` contains 'feat', else 0.

    This is a plain substring test on ``str(x)``, so any occurrence of
    'feat' matches, wherever it appears.
    """
    return int('feat' in str(x))

# Replace missing artist names with the 'no_artist' placeholder.  Assigned
# back rather than filled inplace on a column selection (which is not
# guaranteed to modify the parent frame); the cast to object lets the new
# label be inserted even when the merged column is categorical.
train['artist_name'] = train['artist_name'].astype(object).fillna('no_artist')
test['artist_name'] = test['artist_name'].astype(object).fillna('no_artist')
train['is_featured'] = train['artist_name'].apply(is_featured).astype(np.int8)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(np.int8)

def artist_count(x):
    """Estimate how many artists are credited in the name string ``x``.

    Two special names get fixed codes: '佚名' (Chinese for "anonymous")
    gives 0 and '群星' (Chinese for "various artists") gives -1.  Any other
    string gives one plus the number of occurrences of '|', ',', 'feat',
    '&' and 'and'.

    NOTE(review): the markers are matched as raw substrings, so a name that
    merely contains 'and' or 'feat' is counted as an extra artist.
    """
    if x == '佚名':
        return 0
    elif x == '群星':
        return -1
    markers = ('|', ',', 'feat', '&', 'and')
    return 1 + sum(x.count(marker) for marker in markers)

train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)

# 1 when the artist string is identical to the composer string, else 0.
train['artist_composer'] = (train['artist_name'] == train['composer']).astype(np.int8)
test['artist_composer'] = (test['artist_name'] == test['composer']).astype(np.int8)


# 1 when artist, composer and lyricist are all the same string, else 0.
# The artist == composer result computed just above is reused, and
# composer == lyricist is not compared separately because it already follows
# from artist == composer and artist == lyricist.
train['artist_composer_lyricist'] = (
    (train['artist_composer'] == 1) & (train['artist_name'] == train['lyricist'])
).astype(np.int8)
test['artist_composer_lyricist'] = (
    (test['artist_composer'] == 1) & (test['artist_name'] == test['lyricist'])
).astype(np.int8)

# is song language 17 or 45. 
def song_lang_boolean(x):
    """Return 1 when the text form of ``x`` contains '17.0' or '45.0', else 0.

    The check is a substring test on ``str(x)``.
    """
    text = str(x)
    return int(any(code in text for code in ('17.0', '45.0')))

# Flag rows whose language value matches song_lang_boolean(), in both sets.
for _frame in (train, test):
    _frame['song_lang_boolean'] = _frame['language'].apply(song_lang_boolean).astype(np.int8)


# Mean song length of the training rows; smaller_song() uses it as its
# threshold for both train and test.
_mean_song_length = np.mean(train['song_length'])
def smaller_song(x):
    """Return 1 when ``x`` is strictly below the module-level ``_mean_song_length``, else 0."""
    return 1 if x < _mean_song_length else 0

train['smaller_song'] = train['song_length'].apply(smaller_song).astype(np.int8)
test['smaller_song'] = test['song_length'].apply(smaller_song).astype(np.int8)

# Number of rows in which each song_id occurs, computed separately for the
# train and test sets.  value_counts().to_dict() builds the same
# {song_id: count} mapping as iterating the counts pair by pair, without
# relying on Series.iteritems(), which newer pandas releases no longer have.
_dict_count_song_played_train = train['song_id'].value_counts().to_dict()
_dict_count_song_played_test = test['song_id'].value_counts().to_dict()
def count_song_played(x):
    """Look up the occurrence count of song id ``x``.

    The train counts take precedence; the test counts are consulted only
    when ``x`` is absent from the train mapping.  Returns 0 when ``x`` is
    in neither mapping.
    """
    plays = _dict_count_song_played_train.get(x)
    if plays is None:
        plays = _dict_count_song_played_test.get(x, 0)
    return plays
    

train['count_song_played'] = train['song_id'].apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(np.int64)

# Number of rows in which each artist_name occurs, computed separately for
# the train and test sets.  value_counts().to_dict() builds the same
# {artist_name: count} mapping as iterating the counts pair by pair, without
# relying on Series.iteritems(), which newer pandas releases no longer have.
_dict_count_artist_played_train = train['artist_name'].value_counts().to_dict()
_dict_count_artist_played_test = test['artist_name'].value_counts().to_dict()
def count_artist_played(x):
    """Look up the occurrence count of artist name ``x``.

    The train counts take precedence; the test counts are consulted only
    when ``x`` is absent from the train mapping.  Returns 0 when ``x`` is
    in neither mapping.
    """
    plays = _dict_count_artist_played_train.get(x)
    if plays is None:
        plays = _dict_count_artist_played_test.get(x, 0)
    return plays

# Attach the per-artist occurrence count to both data sets.
for _frame in (train, test):
    _frame['count_artist_played'] = _frame['artist_name'].apply(count_artist_played).astype(np.int64)

In [6]:
print("Train test and validation sets")

# Every column that is still object-typed in train is converted to a pandas
# categorical in both frames.
for col in train.columns:
    if train[col].dtype != object:
        continue
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')


from sklearn.model_selection import train_test_split

# Features / labels for training; features / row ids for the submission.
y_train = train['target'].values
X_train = train.drop('target', axis=1)

ids = test['id'].values
X_test = test.drop('id', axis=1)

# Use the test size as the validation set size, and make use of the fact
# that the training set is chronologically ordered: with shuffle=False the
# validation rows are the last rows of the training data.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=len(X_test), shuffle=False)

d_train_final = lgb.Dataset(X_tr, label=y_tr)
watchlist_final = lgb.Dataset(X_val, label=y_val)

Train test and validation sets


In [7]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[5]	valid_0's auc: 0.653114
[10]	valid_0's auc: 0.658683
[15]	valid_0's auc: 0.661981
[20]	valid_0's auc: 0.664433
[25]	valid_0's auc: 0.665969
[30]	valid_0's auc: 0.667273
[35]	valid_0's auc: 0.669206
[40]	valid_0's auc: 0.670184
[45]	valid_0's auc: 0.670915
[50]	valid_0's auc: 0.671749
[55]	valid_0's auc: 0.672233
[60]	valid_0's auc: 0.672655
[65]	valid_0's auc: 0.673599
[70]	valid_0's auc: 0.674236
[75]	valid_0's auc: 0.674638
[80]	valid_0's auc: 0.675098
[85]	valid_0's auc: 0.675493
[90]	valid_0's auc: 0.675735
[95]	valid_0's auc: 0.676085
[100]	valid_0's auc: 0.676491
[105]	valid_0's auc: 0.676676
[110]	valid_0's auc: 0.676863
[115]	valid_0's auc: 0.676983
[120]	valid_0's auc: 0.677084
[125]	valid_0's auc: 0.677262
[130]	valid_0's auc: 0.677497
[135]	valid_0's auc: 0.67772
[140]	valid_0's auc: 0.677922
[145]	valid_0's auc: 0.678023
[150]	valid_0's auc: 0.678137
[155]	valid_0's auc: 0.678209
[160]	valid_0's auc: 0.678321
[165]	valid_0's auc: 0.678447
[170]	valid_0's auc: 0.678468
[

In [8]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f2 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

[5]	valid_0's auc: 0.653114
[10]	valid_0's auc: 0.657574
[15]	valid_0's auc: 0.66144
[20]	valid_0's auc: 0.664042
[25]	valid_0's auc: 0.665703
[30]	valid_0's auc: 0.666774
[35]	valid_0's auc: 0.667773
[40]	valid_0's auc: 0.669122
[45]	valid_0's auc: 0.669959
[50]	valid_0's auc: 0.670366
[55]	valid_0's auc: 0.671043
[60]	valid_0's auc: 0.671683
[65]	valid_0's auc: 0.672232
[70]	valid_0's auc: 0.67285
[75]	valid_0's auc: 0.672997
[80]	valid_0's auc: 0.673515
[85]	valid_0's auc: 0.673327
[90]	valid_0's auc: 0.673535
[95]	valid_0's auc: 0.673701
[100]	valid_0's auc: 0.673904
[105]	valid_0's auc: 0.67395
[110]	valid_0's auc: 0.673987
[115]	valid_0's auc: 0.675365
[120]	valid_0's auc: 0.675752
[125]	valid_0's auc: 0.675624
[130]	valid_0's auc: 0.675811
[135]	valid_0's auc: 0.676278
[140]	valid_0's auc: 0.676506
[145]	valid_0's auc: 0.676711
[150]	valid_0's auc: 0.676791
[155]	valid_0's auc: 0.676818
[160]	valid_0's auc: 0.676617
[165]	valid_0's auc: 0.677086
[170]	valid_0's auc: 0.677591
[17

In [9]:
print('Making predictions')
# Score the test set with both models, then take the element-wise mean of
# the two prediction vectors.
p_test_1, p_test_2 = (model.predict(X_test) for model in (model_f1, model_f2))
p_test_avg = np.mean(np.vstack((p_test_1, p_test_2)), axis=0)


print('Done making predictions')

Making predictions
Done making predictions


In [10]:
print('Saving predictions Model model of gbdt')

# Two-column submission: row id and averaged prediction, written as a
# gzip-compressed CSV with five decimal places.
subm = pd.DataFrame({'id': ids, 'target': p_test_avg}, columns=['id', 'target'])
subm.to_csv(
    data_path + 'submission_lgbm_avg.csv.gz',
    compression='gzip',
    index=False,
    float_format='%.5f',
)

print('Done!')

Saving predictions Model model of gbdt
Done!
