In [1]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [2]:
from sklearn.model_selection import train_test_split

# 載入資料集

In [3]:
train = pd.read_csv('./kkbox-music-recommendation-challenge/train.csv')
train = train.sample(frac=0.5,random_state=1) #抽樣

In [4]:
train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target'],
      dtype='object')

In [5]:
songs = pd.read_csv('./kkbox-music-recommendation-challenge/songs.csv')
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [6]:
songs.columns

Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')

In [7]:
members = pd.read_csv('./kkbox-music-recommendation-challenge/members.csv')
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


In [8]:
members.columns

Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date'],
      dtype='object')

# 資料合併，並刪除重複欄位

In [9]:
train = pd.merge(train, songs, on='song_id', how='left') #mrege到train
del songs
train = pd.merge(train, members, on='msno', how='left')
del members

In [10]:
train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language', 'city', 'bd', 'gender',
       'registered_via', 'registration_init_time', 'expiration_date'],
      dtype='object')

# 填補缺失值

In [11]:
train.isnull().sum()

msno                            0
song_id                         0
source_system_tab           12494
source_screen_name         207821
source_type                 10813
target                          0
song_length                    59
genre_ids                   59023
artist_name                    59
composer                   837155
lyricist                  1588429
language                       78
city                            0
bd                              0
gender                    1482102
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64

在這個資料集中所有缺失值(nan)填入'unknown'，剩下的都是數值行欄位包括：song_length 和 language ，這裡將其空值補 0

In [12]:
for i in train.select_dtypes(include=['object']).columns:
    train[i][train[i].isnull()] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
train.isnull().sum()

msno                       0
song_id                    0
source_system_tab          0
source_screen_name         0
source_type                0
target                     0
song_length               59
genre_ids                  0
artist_name                0
composer                   0
lyricist                   0
language                  78
city                       0
bd                         0
gender                     0
registered_via             0
registration_init_time     0
expiration_date            0
dtype: int64

In [14]:
train = train.fillna(value=0)

# 處理日期欄位

In [15]:
train.registration_init_time

0          20101209
1          20130730
2          20131226
3          20121223
4          20160211
             ...   
3688704    20111028
3688705    20161121
3688706    20050313
3688707    20160624
3688708    20101222
Name: registration_init_time, Length: 3688709, dtype: int64

In [16]:
train['registration_init_time'] = pd.to_datetime(train['registration_init_time'], format='%Y%m%d', errors='ignore')
train['registration_init_time_year'] = train['registration_init_time'].dt.year
train['registration_init_time_month'] = train['registration_init_time'].dt.month
train['registration_init_time_day'] = train['registration_init_time'].dt.day

# expiration_date
train['expiration_date'] = pd.to_datetime(train['expiration_date'],  format='%Y%m%d', errors='ignore')
train['expiration_date_year'] = train['expiration_date'].dt.year
train['expiration_date_month'] = train['expiration_date'].dt.month
train['expiration_date_day'] = train['expiration_date'].dt.day

# 選擇作為測試集資料建模的欄位

In [17]:
# Select columns
train = train[['msno', 'song_id', 'source_screen_name', 'source_type', 'target',
       'song_length', 'artist_name', 'composer', 'bd',
       'registration_init_time', 'registration_init_time_month',
       'registration_init_time_day', 'expiration_date_day']]

# 欄位型別轉換

In [18]:
# 日期欄位轉成類別行欄位
train['registration_init_time'] = train['registration_init_time'].astype('category')

# Object 型別欄位轉成類別型欄位
for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].astype('category')

In [19]:
#將類別欄位的每一個值依照不同欄位進行編碼
for col in train.select_dtypes(include=['category']).columns:
    train[col] = train[col].cat.codes

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3688709 entries, 0 to 3688708
Data columns (total 13 columns):
msno                            int16
song_id                         int32
source_screen_name              int8
source_type                     int8
target                          int64
song_length                     float64
artist_name                     int16
composer                        int32
bd                              int64
registration_init_time          int16
registration_init_time_month    int64
registration_init_time_day      int64
expiration_date_day             int64
dtypes: float64(1), int16(3), int32(2), int64(5), int8(2)
memory usage: 253.3 MB


# 匯入測試集資料

In [41]:
test2 = pd.read_csv('./kkbox-music-recommendation-challenge/test.csv')

In [42]:
test2.columns

Index(['id', 'msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type'],
      dtype='object')

In [22]:
# Load test data
test = pd.read_csv('./kkbox-music-recommendation-challenge/test.csv')

# Load and join songs data
songs = pd.read_csv('./kkbox-music-recommendation-challenge/songs.csv')
test = pd.merge(test, songs, on='song_id', how='left')
del songs

# Load and join songs data
members = pd.read_csv('./kkbox-music-recommendation-challenge/members.csv')
test = pd.merge(test, members, on='msno', how='left')
del members

In [44]:
test_id = test.id
test_id

0                0
1                1
2                2
3                3
4                4
            ...   
2556785    2556785
2556786    2556786
2556787    2556787
2556788    2556788
2556789    2556789
Name: id, Length: 2556790, dtype: int64

In [None]:
測試集的資料跟訓練集的資料清理的方式一樣，會拆開來清理主要是因為怕將測試集的部分特徵也帶進訓練集資料裡建模，這會使模型的效果變差或出現問題。

In [24]:
test.registration_init_time = pd.to_datetime(test.registration_init_time, format='%Y%m%d', errors='ignore')
test['registration_init_time_year'] = test['registration_init_time'].dt.year
test['registration_init_time_month'] = test['registration_init_time'].dt.month
test['registration_init_time_day'] = test['registration_init_time'].dt.day

test.expiration_date = pd.to_datetime(test.expiration_date,  format='%Y%m%d', errors='ignore')
test['expiration_date_year'] = test['expiration_date'].dt.year
test['expiration_date_month'] = test['expiration_date'].dt.month
test['expiration_date_day'] = test['expiration_date'].dt.day

test = test[['msno', 'song_id', 'source_screen_name', 'source_type',
       'song_length', 'artist_name', 'composer', 'bd',
       'registration_init_time', 'registration_init_time_month',
       'registration_init_time_day', 'expiration_date_day']]

test['registration_init_time'] = test['registration_init_time'].astype('category')

for col in test.select_dtypes(include=['object']).columns:
    test[col] = test[col].astype('category')

for col in test.select_dtypes(include=['category']).columns:
    test[col] = test[col].cat.codes

test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 12 columns):
msno                            int16
song_id                         int32
source_screen_name              int8
source_type                     int8
song_length                     float64
artist_name                     int16
composer                        int32
bd                              int64
registration_init_time          int16
registration_init_time_month    int64
registration_init_time_day      int64
expiration_date_day             int64
dtypes: float64(1), int16(3), int32(2), int64(4), int8(2)
memory usage: 156.1 MB


將train資料集的target欄位提取出來，這是預測目標，並資料拆分成 X_train, X_test, y_train, y_test，訓練及佔全部的0.7 

In [None]:
target = train.pop('target')
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.3)

In [27]:
# Create model
rf = RandomForestClassifier(n_estimators=350, max_depth=40)
# rf.fit(X_train, y_train)

In [None]:
# Predicting
rf_predict = rf.predict(X_test)

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score , recall_score , f1_score
import xgboost as xgb

# 使用xgboost進行建模
 他跑很久-_-

In [29]:
# Create model
model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, min_child_weight=10, n_estimators=250)
model.fit(X_train, y_train)

# Predicting
# predict_labels2 = model2.predict(test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=10, missing=None, n_estimators=250, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [31]:
predict_test = model.predict(test)

# 匯出提交檔案

In [45]:
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = predict_test
sub.to_csv('kk_sub_xgb.csv' , index = False , float_format ='%.5f' )

In [40]:
test

Unnamed: 0,msno,song_id,source_screen_name,source_type,song_length,artist_name,composer,bd,registration_init_time,registration_init_time_month,registration_init_time_day,expiration_date_day
0,12934,122191,8,3,224130.0,24889,33218,0,3438,2,19,18
1,12934,217907,8,3,320470.0,24742,47677,0,3438,2,19,18
2,712,37385,-1,9,315899.0,21878,43194,0,3645,11,17,24
3,1383,224360,16,7,285210.0,20718,39607,30,724,7,25,30
4,1383,85597,16,7,197590.0,21761,30731,30,724,7,25,30
...,...,...,...,...,...,...,...,...,...,...,...,...
2556785,14024,212075,11,6,247640.0,19201,2559,41,1643,11,26,21
2556786,18800,78127,-1,6,197067.0,26482,46255,24,2786,5,6,12
2556787,18800,219419,-1,6,212950.0,23457,37581,24,2786,5,6,12
2556788,18800,121326,-1,6,164414.0,26763,-1,24,2786,5,6,12
