# Data Science Project
## Jonathan Sahagun 301548551

https://www.kaggle.com/c/kkbox-music-recommendation-challenge

Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target. Look at df_sample_clean to see how submittions should be.

Imports

In [1]:
import pandas as pd
import numpy as np

verbose = False


## Reading all the data
Read the all data. It can take a bit.

In [2]:
print('Reading csv files...')
df_members_clean     = pd.read_csv('data/members.csv')
df_sample_clean      = pd.read_csv('data/sample_submission.csv')
df_songs_extra_clean = pd.read_csv('data/song_extra_info.csv')
df_songs_clean       = pd.read_csv('data/songs.csv')
df_test_clean        = pd.read_csv('data/test.csv')
df_train_clean       = pd.read_csv('data/train.csv')

if verbose:
    print('train shape:\n', df_train_clean.shape)
    print(df_train_clean.head())

    print('test shape:\n', df_test_clean.shape)
    print(df_test_clean.head())

    print('songs shape:\n', df_songs_clean.shape)
    print(df_songs_clean.head())

    print('extra shape:\n', df_extra_clean.shape)
    print(df_extra_clean.head())
    
    print('member shape:\n', df_members_clean.shape)
    print(df_members_clean.head())

    print('sample shape:\n', df_sample_clean.shape)
    print(df_sample_clean.head())
print('Done reading the csv files.')

Reading csv files...
Done reading the csv files.


A way to reset the data with out rereading the csv files

In [3]:
df_members     = df_members_clean.copy()
df_sample      = df_sample_clean.copy()
df_songs_extra = df_songs_extra_clean.copy()
df_songs       = df_songs_clean.copy()
df_test        = df_test_clean.copy()
df_train       = df_train_clean.copy()

## Extracting features from data

members data

In [4]:
# parse expiration_date and registration_init_time to datetime formats
df_members['expiration_date'] = df_members['expiration_date'].apply(str).apply(pd.to_datetime)
df_members['registration_init_time'] = df_members['registration_init_time'].apply(str).apply(pd.to_datetime)

# Getting year month and day from registration date
df_members['registration_year']  = df_members['registration_init_time'].dt.year
df_members['registration_month'] = df_members['registration_init_time'].dt.month
df_members['registration_day']   = df_members['registration_init_time'].dt.day

# Getting year month and day from expiration date
df_members['expiration_year']  = df_members['expiration_date'].dt.year
df_members['expiration_month'] = df_members['expiration_date'].dt.month
df_members['expiration_day']   = df_members['expiration_date'].dt.day

# create a the number of day
df_members['membership_days'] = (df_members['expiration_date'].subtract(df_members['registration_init_time'])).dt.days

# keeping 'expiration_date' and 'registration_init_time' might add noise, so we might beed to remove them.
df_members.drop(['expiration_date', 'registration_init_time'], axis = 1, inplace = True)

# change gender to numbers
#df_members['gender'] = df_members['gender'].apply(lambda x: -1 if not (x == 'female' or x =='male') else x) # not a number
#df_members['gender'] = df_members['gender'].apply(lambda x: 0 if x =='male' else x)
#df_members['gender'] = df_members['gender'].apply(lambda x: 1 if x =='female' else x)

# make msno into a string vector
df_members['msno'] = df_members['msno'].apply(str)

df_members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,membership_days
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2011,8,20,2017,9,20,2223
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,2015,6,28,2017,6,22,725
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,2016,4,11,2017,7,12,457
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,2015,9,6,2015,9,7,1
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,2017,1,26,2017,6,13,138


Songs extra data.
isrc http://www.usisrc.org/
from the isrc we can get the year of the song

In [5]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return int(1900 + int(isrc[5:7]))
        else:
            return int(2000 + int(isrc[5:7]))
    else:
        return np.nan

df_songs_extra['song_year'] = df_songs_extra['isrc'].apply(isrc_to_year)

# keeping 'isrc' and 'name' might add noise, so we might beed to remove them.
df_songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

df_songs_extra.head()

Unnamed: 0,song_id,song_year
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,2012.0
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,2016.0
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,2008.0
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,2013.0
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,2013.0


## Merge Data
This section adds the song and member data into the testing and training dataframes for convenience. This part takes a bit to run.

In [6]:
print('Merging Data...')

# adds the song info
df_train = df_train.merge(df_songs, on='song_id', how='left')
df_test  = df_test.merge(df_songs, on='song_id', how='left')

#addes the extra song info
df_train = df_train.merge(df_songs_extra, on = 'song_id', how = 'left')
df_test  = df_test.merge(df_songs_extra, on = 'song_id', how = 'left')

# adds the member info
df_train = df_train.merge(df_members, on='msno', how='left')
df_test  = df_test.merge(df_members, on='msno', how='left')

print('Done merging Data.')
df_train.head()

Merging Data...
Done merging Data.


Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,bd,gender,registered_via,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,membership_days
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,...,0,,7,2012,1,2,2017,10,5,2103
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,...,24,female,9,2011,5,25,2017,9,11,2301
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,...,24,female,9,2011,5,25,2017,9,11,2301
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,...,24,female,9,2011,5,25,2017,9,11,2301
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,...,0,,7,2012,1,2,2017,10,5,2103


## Data Science Fun Part

In [7]:
## replace nan values

features = df_train.columns.values

for f in features:
    # if numeric
    if np.issubdtype(df_train[f].dtype, np.number):
        ## Maybe not the best value
        df_train[f] = df_train[f].fillna(np.inf)
    else:
        ## Maybe not the best value
        df_train[f] = df_train[f].fillna('?')

        
features = df_test.columns.values

for f in features:
    # if numeric
    if np.issubdtype(df_test[f].dtype, np.number):
        ## Maybe not the best value
        df_test[f] = df_test[f].fillna(np.inf)
    else:
        ## Maybe not the best value
        df_test[f] = df_test[f].fillna('?')

In [20]:
# I start with a small n to make this faster.
# remove all head method calls to get the full data set
training_size = 50
testing_size = 40

# all columns except for the label, target
X_train = df_train.loc[:, df_train.columns != 'target'].head(training_size)

# the label vector
y_train = df_train['target'].head(training_size)

X_test = df_test.head(testing_size)

# We need to drop id because training doesn't have it
X_test.drop(['id'], axis = 1, inplace = True)

#df_train.head(50).to_csv('file_name_2.csv', sep=',', encoding='utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [21]:
## make everything numeric. This takes a while!
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
X_test = X_test.apply(le.fit_transform)
X_train = X_train.apply(le.fit_transform)

X_train.dtypes

msno                  int64
song_id               int64
source_system_tab     int64
source_screen_name    int64
source_type           int64
song_length           int64
genre_ids             int64
artist_name           int64
composer              int64
lyricist              int64
language              int64
song_year             int64
city                  int64
bd                    int64
gender                int64
registered_via        int64
registration_year     int64
registration_month    int64
registration_day      int64
expiration_year       int64
expiration_month      int64
expiration_day        int64
membership_days       int64
dtype: object

In [25]:
# fitting the data to a tree
from sklearn import tree
decisiontree = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=2)
decisiontree.fit(X_train, y_train)

## create the LogisticRegression object and fit it with the testing data
#from sklearn.linear_model import LogisticRegression
#logreg = LogisticRegression()
#logreg.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [34]:
# predicting the data from a tree
y_predict_tree = decisiontree.predict(X_test)
print(y_predict_tree)

# I think the competition wants us to give the probability of a recommendation
y_predict_prob_tree = decisiontree.predict_proba(X_test)
print('Probability for Predictions of 1 for target using a tree')
print(y_predict_prob_tree[:,1])

[1 1 1 1 1 1 1 1 1 1]
Probability for Predictions of 1 for target using a tree
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [33]:
# Save as a csv to submit to competition
results = pd.DataFrame(y_predict_prob_tree[:,1])
results.to_csv('results.csv', encoding='utf-8', index=True, header=False)