In [1]:
#Name :- Somesh Bachani
#Task :- Song Recommendation System
# Import the required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import os
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import datetime
import math

In [2]:
# List the CSV files available for the project.
# A raw string keeps the Windows backslash literal instead of relying on the
# invalid "\D" escape sequence, which newer Python versions warn about.
print(os.listdir(r"D:\Data Science Task 2"))

['members.csv', 'sample_submission.csv', 'songs.csv', 'song_extra_info.csv', 'test.csv', 'train.csv']


In [3]:
#reducing memory usage for easier data processing
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Adapted from:
    https://www.mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas/

    * ``object`` columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max (bounds
      inclusive). Columns that fit no signed type are left unchanged.
    * Float columns are cast to the smallest float type whose range strictly
      contains the column's min and max, falling back to ``float64``.
    * Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, so the function can safely be called again on a frame
      it has already processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are replaced in place.
    use_float16 : bool, default True
        When True, float columns may be narrowed to ``float16``, which saves
        the most memory but keeps only about three significant decimal
        digits. Pass False to use ``float32`` as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns can be downcast by range.
        # bool, datetime, category and extension dtypes are skipped: min/max
        # on them is either meaningless here or raises.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # An integer column only yields NaN for min/max when it is empty.
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            # Plain Python ints make the range comparison independent of the
            # column's own integer width and signedness.
            lowest, highest = int(c_min), int(c_max)
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if lowest >= limits.min and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns stay at full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

In [4]:
# Load the project CSV files.
# The data directory is named once so it can be changed in a single place
# instead of editing every read_csv call.
DATA_DIR = 'D:/Data Science Task 2'

# train and test are passed through reduce_mem_usage as soon as they are read;
# the three lookup tables are loaded as-is.
train = reduce_mem_usage(pd.read_csv(DATA_DIR + '/train.csv'))
test = reduce_mem_usage(pd.read_csv(DATA_DIR + '/test.csv'))
sei = pd.read_csv(DATA_DIR + '/song_extra_info.csv')
# The two date columns are parsed up front so the .dt accessor works on them.
members = pd.read_csv(DATA_DIR + '/members.csv',
                      parse_dates=['registration_init_time', 'expiration_date'])
songs = pd.read_csv(DATA_DIR + '/songs.csv')

Memory usage of dataframe is 337.71 MB
Memory usage after optimization is: 84.59 MB
Decreased by 75.0%
Memory usage of dataframe is 117.04 MB
Memory usage after optimization is: 44.23 MB
Decreased by 62.2%


In [5]:
# Report the (rows, columns) shape of every loaded table
for label, frame in (
    ('Shape of train is ->', train),
    ('Shape of test is ->', test),
    ('Shape of Song Extra Info is ->', sei),
    ('Shape of Members is ->', members),
    ('Shape of Songs is ->', songs),
):
    print(label, frame.shape)

Shape of train is -> (7377418, 6)
Shape of test is -> (2556790, 6)
Shape of Song Extra Info is -> (2295971, 3)
Shape of Members is -> (34403, 7)
Shape of Songs is -> (2296320, 7)


In [6]:
def get_codes(isrc, pivot=17):
    """Return the four-digit year encoded in an ISRC code.

    The two characters at string positions 5 and 6 (slice ``[5:7]``) are read
    as a two-digit year. Values greater than ``pivot`` are placed in the
    1900s, all others in the 2000s.

    Parameters
    ----------
    isrc : str or missing value
        The ISRC code, e.g. ``'GBAYC9302732'``.
    pivot : int, default 17
        Largest two-digit year that is still mapped to the 2000s.

    Returns
    -------
    int or float
        The year as an int, or ``np.nan`` when ``isrc`` is missing or its
        year field is not exactly two decimal digits (e.g. a truncated or
        malformed code), so one bad row cannot abort a whole ``apply``.
    """
    if pd.isnull(isrc):
        return np.nan

    year_digits = str(isrc)[5:7]
    if len(year_digits) != 2 or not year_digits.isdecimal():
        return np.nan

    two_digit_year = int(year_digits)
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year

In [7]:
# Derive each song's release year from its ISRC code, then preview random rows
sei['year'] = sei['isrc'].apply(get_codes)
sei.sample(10)

Unnamed: 0,song_id,name,isrc,year
2000297,0N4TrjmqsJnTAMi1gEazWff4aq1KzyTdQw7AO0SWD78=,Sleeping Beauty (1993 - Remaster): No.25 Pas d...,GBAYC9302732,1993.0
632963,SdU0cwaGk+6TOnpXqx32p3BoIUEE/55xgx1f6Mg/jyI=,Modokashii Distance,JPU901600167,2016.0
490207,/X0I3DKCUINHhPqdxHf66yzJ/W+LlPQQ0laETK9DGCQ=,I Saw Three Ships,TCABB1194521,2011.0
1032320,FuUD1EU4tYg2hVJO7lPbUxVjPUzKadZV+/HipH3j7Sg=,everybody together,KRB920137552,2001.0
1644498,UoyrRwAmJngIVl043DxRTNb6ajklyhNgH3Vl9ZCWp5Q=,I'm gonna live,TCJPD1679797,2016.0
558259,bW7aA+B9oDjoCFV9tCyGbA1gJRLA092uJs6POioFI/4=,Heartwork,TCACF1572303,2015.0
471272,b1sTD3Iek1He5Baje+/QhP5SD8sSFXZsqLVeqFR/kPE=,Will I (feat. Kaytee),DEHK91368305,2013.0
2085880,t/W+BTqZQ4OWVdTMAhBTGloxfHLgftzXWjySCuQEuOs=,You've Got a Friend,USA371451278,2014.0
868286,JHCA7jhJjcmbgcOPzS9YkYnrNK30qYI1lG072ycdL/o=,Drummer Boy,USSM10702407,2007.0
371311,tPXQ7PKks23A1ovtB6vkAWZOIg2YOGEYGkp9Sz3ZEFw=,J'ai vu,FRS630000025,2000.0


In [8]:
# Membership length in whole days: expiration date minus registration date
members['membership_days'] = (
    members['expiration_date'] - members['registration_init_time']
).dt.days.astype(int)
# Keep only the calendar year of both dates
members['registration_year'] = members['registration_init_time'].dt.year
members['expiration_year'] = members['expiration_date'].dt.year
# The raw date columns are not needed once the features above exist
members = members.drop(columns=['registration_init_time', 'expiration_date'])
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,membership_days,registration_year,expiration_year
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2223,2011,2017
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,725,2015,2017
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,457,2016,2017
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,1,2015,2015
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,138,2017,2017


In [9]:
# Extend train and test with song, member and extra-song-info columns.
# Left joins keep every row of train/test even when a lookup has no match.
train = (
    train
    .merge(songs, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .merge(sei, on='song_id', how='left')
)
test = (
    test
    .merge(songs, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .merge(sei, on='song_id', how='left')
)
# The lookup tables have been merged in; release them and collect garbage
del sei, members, songs
gc.collect()

0

In [10]:
# Properties of the song columns: print the share of missing values in train,
# then impute song_length with the column mean and language with the column
# mode (each frame uses its own statistics) and narrow the dtypes.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on train[...] / test[...]: that chained in-place
# form is not guaranteed to update the parent frame under pandas
# Copy-on-Write, and the warning it raises is hidden by the global filter.
print(train['song_length'].isnull().value_counts()/train.shape[0])
train['song_length'] = train['song_length'].fillna(train['song_length'].mean()).astype(np.uint32)
print(train['language'].isnull().value_counts()/train.shape[0])
train['language'] = train['language'].fillna(train['language'].mode().values[0]).astype(np.int8)
test['song_length'] = test['song_length'].fillna(test['song_length'].mean()).astype(np.uint32)
test['language'] = test['language'].fillna(test['language'].mode().values[0]).astype(np.int8)

False    0.999985
True     0.000015
Name: song_length, dtype: float64
False    0.99998
True     0.00002
Name: language, dtype: float64


In [11]:
def genre_count(genre):
    """Return how many genre ids a '|'-separated genre string contains.

    The placeholder 'no_genre_id' counts as zero genres.
    """
    if genre == 'no_genre_id':
        return 0
    # n separators split the string into n + 1 pieces
    return len(genre.split('|'))
# Share of rows in train without a genre
print(train['genre_ids'].isnull().value_counts()/train.shape[0])
# Missing genres get the 'no_genre_id' placeholder that genre_count maps to 0.
# The filled Series is assigned back rather than using fillna(inplace=True)
# on the column selection, which is not guaranteed to modify the frame under
# pandas Copy-on-Write.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
train['genre_ids_count'] = train['genre_ids'].apply(genre_count).astype(np.int8)
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')
test['genre_ids_count'] = test['genre_ids'].apply(genre_count).astype(np.int8)
                                                       

False    0.983944
True     0.016056
Name: genre_ids, dtype: float64


In [12]:
def artist_count(art):
    """Return the number of artists named in ``art``.

    Names are taken to be separated by '|', '/', '//' or ';'. The placeholder
    'no_artist_name' counts as zero artists.
    """
    if art == 'no_artist_name':
        return 0
    # Collapse each '//' to a single '/' first: count('/') already sees both
    # slashes of a '//', so adding count('//') on top counted that one
    # separator three times.
    collapsed = art.replace('//', '/')
    return collapsed.count('|') + collapsed.count('/') + collapsed.count(';') + 1
# Missing artists get the 'no_artist_name' placeholder that artist_count maps
# to 0. The filled Series is assigned back rather than using
# fillna(inplace=True) on the column selection, which is not guaranteed to
# modify the frame under pandas Copy-on-Write.
train['artist_name'] = train['artist_name'].fillna('no_artist_name')
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_name'] = test['artist_name'].fillna('no_artist_name')
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)

In [13]:
def count_composer(comp):
    """Return the number of composers named in ``comp``.

    Names are taken to be separated by '|', '/', '//' or ';'. The placeholder
    'no_composer' counts as zero composers.
    """
    if comp == 'no_composer':
        return 0
    # Collapse each '//' to a single '/' first: count('/') already sees both
    # slashes of a '//', so adding count('//') on top counted that one
    # separator three times.
    collapsed = comp.replace('//', '/')
    return collapsed.count('|') + collapsed.count('/') + collapsed.count(';') + 1
def count_lyricist(lyr):
    """Return the number of lyricists named in ``lyr``.

    Names are taken to be separated by '|', '/', '//' or ';'. The placeholder
    'no_lyricist' counts as zero lyricists.
    """
    if lyr == 'no_lyricist':
        return 0
    # Collapse each '//' to a single '/' first: count('/') already sees both
    # slashes of a '//', so adding count('//') on top counted that one
    # separator three times.
    collapsed = lyr.replace('//', '/')
    return collapsed.count('|') + collapsed.count('/') + collapsed.count(';') + 1

In [14]:
# Fill missing composers / lyricists with the placeholders that
# count_composer / count_lyricist map to 0, then store the per-row counts.
# The filled Series is assigned back rather than using fillna(inplace=True)
# on the column selection, which is not guaranteed to modify the frame under
# pandas Copy-on-Write.
train['composer'] = train['composer'].fillna('no_composer')
train['composer_count'] = train['composer'].apply(count_composer).astype(np.int8)
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
train['lyricist_count'] = train['lyricist'].apply(count_lyricist).astype(np.int8)
test['composer'] = test['composer'].fillna('no_composer')
test['composer_count'] = test['composer'].apply(count_composer).astype(np.int8)
test['lyricist'] = test['lyricist'].fillna('no_lyricist')
test['lyricist_count'] = test['lyricist'].apply(count_lyricist).astype(np.int8)

In [15]:
# song_id -> number of rows in which the song appears, one dict per frame.
# Series.to_dict() builds the same mapping as iterating over the counts and
# avoids Series.iteritems(), which no longer exists in pandas 2.x.
dict_count_song_played_train = train['song_id'].value_counts().to_dict()
dict_count_song_played_test = test['song_id'].value_counts().to_dict()
def return_number_played(x):
    """Return the play count recorded for song id ``x``.

    The train counts are consulted first and the test counts only when the
    song is absent from train; a song found in neither yields 0.
    """
    for play_counts in (dict_count_song_played_train, dict_count_song_played_test):
        if x in play_counts:
            return play_counts[x]
    return 0
# Attach the play count of each row's song to both frames
for frame in (train, test):
    frame['number_of_time_played'] = frame['song_id'].apply(return_number_played)

In [16]:
# msno -> number of rows for that user across train and test combined.
# Series.to_dict() builds the same mapping as iterating over the counts and
# avoids Series.iteritems(), which no longer exists in pandas 2.x.
dict_user_activity = pd.concat([train['msno'] , test['msno']] , axis = 0).value_counts().to_dict()
def return_user_activity(x):
    """Return the activity count stored for user id ``x``, or 0 if unknown."""
    return dict_user_activity.get(x, 0)
# Attach each row's user activity count to both frames
for frame in (train, test):
    frame['user_activity_msno'] = frame['msno'].apply(return_user_activity)

In [17]:
# Sanity check: report every test column that train does not have
train_col = list(train.columns)
test_col = list(test.columns)
for missing in [name for name in test_col if name not in train_col]:
    print('ERROR !!!  Column from Test not found in train is ->' , missing)

# Columns grouped by the preprocessing intended for them
label_encoding = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'gender',
]
# NOTE(review): no use of this list is visible in the notebook cells shown
# here - confirm whether these columns were meant to be dropped before training.
drop = [
    'msno',
    'song_id',
    'isrc',
    'artist_name',
    'composer',
    'lyricist',
    'name',
    'genre_ids',
]
min_max_scaling = [
    'number_of_time_played',
    'user_activity_msno',
    'membership_days',
    'song_length',
]

ERROR !!!  Column from Test not found in train is -> id


In [18]:
# Label-encode the categorical columns listed in label_encoding.
# Each encoder is fitted on the train and test values together so both frames
# share one value -> integer mapping.
for f in label_encoding:
    lb = LabelEncoder()
    lb.fit(list(train[f].values) + list(test[f].values))
    train[f] = lb.transform(list(train[f].values))
    test[f] = lb.transform(list(test[f].values))
# Min-max scale the numeric columns listed in min_max_scaling.
# The scaler is fitted on train only and then applied to test, so test values
# are not guaranteed to land inside the range seen in train.
for f in min_max_scaling:
    ms = MinMaxScaler()
    train[f] = ms.fit_transform(train[[f]])
    test[f] = ms.transform(test[[f]])

In [19]:
# Convert every column that is still object-typed in train to 'category',
# applying the same conversion to the matching test column
object_columns = [col for col in train.columns if train[col].dtype == object]
for col in object_columns:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [20]:
# Preview ten random rows of the processed training frame.
# No random_state is passed, so the rows shown differ from run to run.
train.sample(10)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,expiration_year,name,isrc,year,genre_ids_count,artist_count,composer_count,lyricist_count,number_of_time_played,user_activity_msno
4391798,/1SwgVHh8c46pmuq1WlHnRz1maIqTYncZZmihvFJZtk=,DT35ri8FZuj7PGLxW/PhK5lt+CFVV85jDqPrBz6kixc=,3,8,3,1,0.023523,458,周杰倫 (Jay Chou),周杰倫,...,2017,說好的幸福呢,TWK970801506,2008.0,1,1,1,1,0.239479,0.039275
3226937,N/Fvfb8d+kJS3kouK80mailHEjVjA3wCCFYGs+P692c=,s9Qhzsn4ak6Cy6yC9vFm61P9GL2/vHUnCJ/oJoVEVrA=,3,8,4,0,0.019731,1609,Redfoo,Brandon Garcia| Stefan Kendal Gordy,...,2017,Let's Get Ridiculous,USUM71312055,2013.0,1,1,2,0,0.009734,0.106423
3086683,MHQU5zk4FAJWceFzvs8XdGZsBgqC7dC6pN64Yz9Ualc=,EL5DbsSvhAFtMBBoOjs4aipOjC3Xj4syX/l/wlK+kdM=,1,17,7,1,0.026191,458,周杰倫 (Jay Chou),周杰倫,...,2016,愛情廢柴,TWK971601310,2016.0,1,1,1,1,0.458274,0.002661
1239068,/RBBFUzduS/599s+9hUeMCt7McOJldcXCGYQ1pC18LA=,EvM2QBlYdQ5x4yneGwfavgf91iJaxoC8vq+OzjL6I3I=,3,8,3,1,0.022522,465,謝和弦 (R-chord),R-chord| TeN,...,2017,蘋果 (feat. 頑童MJ116瘦子) (The Apple),TWA531657211,2016.0,1,1,2,2,0.03643,0.115926
5547912,kS3GF5UkVBJOw2FZs3yqN+Rgn4BkW1YQA4FeLEuzAhM=,PgRtmmESVNtWjoZHO5a1r21vIz9sVZmcJJpFCbRa1LI=,0,11,7,0,0.020824,465,謝和弦 (R-chord),R-chord,...,2017,謝謝妳愛我 (Thanks For Your Love),TWA531657203,2016.0,1,1,1,1,0.859075,0.003294
5589039,hITawwIaRVFAf9qjfoGfD6Mx9b3tbIx+xRwyxtSt1aQ=,Xpjwi8UAE2Vv9PZ6cZnhc58MCtl3cKZEO1sdAkqJ4mo=,5,11,7,0,0.025815,458,田馥甄 (Hebe),薛之謙,...,2017,演員,TWD951676105,2016.0,1,1,1,1,0.6357,0.023565
6852180,xS0MS5BJIS/tLHcNez4K9oiQivMQiPkgGs9Xmmb4E0k=,DdKsqy3JAygpcHwihcjBKzzp8SDYhdtXbEZmhKDrOSo=,7,17,9,0,0.02568,465,Bii畢書盡+陳勢安+陳彥允+李玉璽,畢書盡/陳又齊,...,2018,逆時光的浪 (Back In Time),TWD631518202,2015.0,1,1,2,2,0.152662,0.119473
5840061,MXIMDXO0j3UpaT7FvOSGW6Y5zfhlh+xYjTqGoUdMzEE=,W3DpMlMRnUfhQiVVTVazSYi6K/qerhwlrFJ4k9POTJk=,3,8,4,0,0.018775,958|2122,証聲音樂圖書館 ECHO MUSIC,Gemsa;Eric,...,2017,Te Rappelles-Tu,,,2,1,2,1,0.0,1.0
7114566,5EVtwliFW5a/TsTvkheKQpWYCOvXQq36D6v8zCUXr7o=,B2rbzpB8DOKj5M+HK0ptaho1UiF4CZomtOzA76J02fo=,3,22,3,1,0.025844,458,柯有倫 (Alan Kuo),鍾國鋒,...,2017,零 (Ling),TWA470441001,2004.0,1,1,1,1,0.060478,0.027239
5931814,nd10iJRSeO6lmrUzkk/pgnufBtKjhqhhKOy262w4LeQ=,ZDrWG8swRDEs++R9CEPDU9ZKvGQ8uPMSs4pVNlNOvz4=,6,16,8,0,0.026913,465,李宗盛 (Jonathan Lee),no_composer,...,2017,我終於失去了你,TWK950700007,2007.0,1,1,0,0,0.018466,0.09084


In [21]:
# Training dataset: separate the label from the features in train, and the
# row ids from the features in test
Y_train = train['target'].values
X_train = train.drop(columns=['target'])
ids = test['id'].values
X_test = test.drop(columns=['id'])
# The full frames are no longer needed once the pieces above exist
del train, test
gc.collect()
train_set = lgb.Dataset(X_train, label=Y_train)

In [22]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=train_set,  valid_sets=train_set, verbose_eval=5)

You can set `force_col_wise=true` to remove the overhead.
[5]	training's auc: 0.736167
[10]	training's auc: 0.748005
[15]	training's auc: 0.754421
[20]	training's auc: 0.759233
[25]	training's auc: 0.762544
[30]	training's auc: 0.765713
[35]	training's auc: 0.768136
[40]	training's auc: 0.771355
[45]	training's auc: 0.773379
[50]	training's auc: 0.775447
[55]	training's auc: 0.777483
[60]	training's auc: 0.779126
[65]	training's auc: 0.780748
[70]	training's auc: 0.782426
[75]	training's auc: 0.784057
[80]	training's auc: 0.785748
[85]	training's auc: 0.787138
[90]	training's auc: 0.78843
[95]	training's auc: 0.789651
[100]	training's auc: 0.790847
[105]	training's auc: 0.791955
[110]	training's auc: 0.792972
[115]	training's auc: 0.793971
[120]	training's auc: 0.794948
[125]	training's auc: 0.795891
[130]	training's auc: 0.798364
[135]	training's auc: 0.799259
[140]	training's auc: 0.799988
[145]	training's auc: 0.800807
[150]	training's auc: 0.801528
[155]	training's auc: 0.802236
[1

In [23]:
# Score the test rows and write the predictions to a new CSV file
pred_test = model_f1.predict(X_test)
print('Saving Predictions')
# One row per test id with its predicted target score
sub = pd.DataFrame({'id': ids, 'target': pred_test})
sub.to_csv('submission.csv', index=False, float_format='%.5f')

Saving Predictions


In [24]:
# Preview the first rows of the submission frame (test id and predicted target)
sub.head()

Unnamed: 0,id,target
0,0,0.384199
1,1,0.310859
2,2,0.198129
3,3,0.11901
4,4,0.097116
