# TalkingData Mobile User Demographics

https://www.kaggle.com/c/talkingdata-mobile-user-demographics/overview

Score : 2.27826

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from scipy.sparse import csr_matrix

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import random
import time
random.seed(2020)

In [32]:
from sklearn.preprocessing import LabelEncoder

# 数据预处理

In [4]:
def load_sparse(filename):
    data = np.load(filename)
    return csr_matrix((data['data'], data['indices'], data['indptr']), shape= data['shape'])


In [5]:
datadir = './drive/My Drive/talkingdata'

In [6]:
Xtrain = load_sparse(os.path.join(datadir,'Xtrain.npz'))
Xtest = load_sparse(os.path.join(datadir,'Xtest.npz'))

In [7]:
train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv.zip'))

In [18]:
y_train = train.group

In [33]:
y_encoder = LabelEncoder().fit(train.group)
y_train = y_encoder.transform(train.group)
yclasses_ = len(y_encoder.classes_)
print('total number of target classes: ', yclasses_, y_encoder.classes_)

total number of target classes:  12 ['F23-' 'F24-26' 'F27-28' 'F29-32' 'F33-42' 'F43+' 'M22-' 'M23-26'
 'M27-28' 'M29-31' 'M32-38' 'M39+']


In [35]:
y_train

array([10, 10, 10, ...,  6, 10,  7])

# 机器学习模型

In [8]:
from sklearn.metrics import log_loss

In [11]:
class SklearnHelper(object):
    def __init__(self, clf, seed=None, params=None):
        if seed:
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)

    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def score(self, x, y):
        return self.clf.score(x, y)
    
    def get_name(self):
        return str(self.clf).split('(')[0]

    def get_clf(self):
        return clf


In [22]:
def get_oof(clf, x_train, y_train, x_test,label_num = 12):
    print(clf.get_name() + " start ..." )
    start = time.time()
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2020)
    oof_train = np.zeros((x_train.shape[0], label_num))
    oof_test = np.zeros((x_test.shape[0], label_num))
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
            
        oof_train[test_index] = clf.predict_proba(x_te)
        oof_test += clf.predict_proba(x_test)
        l_score = log_loss(y_train[test_index],oof_train[test_index])
        print('times: ' , str(time.time() - start))
        print('    Fold %d loss: %f' % (i , l_score))
            
    print('times: ' , str(time.time() - start))
    return oof_train, oof_test / kf.n_splits

In [12]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}


gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

lgb_params = {
    'objective': 'multi:softprob',
     'eta': 0.03,
     'max_depth': 6,
     'silent': 0,
     'num_class': 3,
     'eval_metric': 'mlogloss',
     'min_child_weight': 1,
     'subsample': 0.7,
     'colsample_bytree': 0.7,
     'nthread': 12
}

xgb_params = {
    'objective': 'multi:softprob',
     'eta': 0.03,
     'max_depth': 6,
     'silent': 0,
     'num_class': 3,
     'eval_metric': 'mlogloss',
     'min_child_weight': 1,
     'subsample': 0.7,
     'colsample_bytree': 0.7,
     'nthread': 12
}

In [13]:
from sklearn import model_selection

In [14]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [36]:
SEED = 300
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
lgb = SklearnHelper(clf=LGBMClassifier, seed=SEED, params=lgb_params)
xgb = SklearnHelper(clf=XGBClassifier, seed=SEED, params=xgb_params)

In [37]:
%%time
et_oof_train, et_oof_test = get_oof(et, Xtrain, y_train, Xtest) 
rf_oof_train, rf_oof_test = get_oof(rf,Xtrain, y_train, Xtest) 
#ada_oof_train, ada_oof_test = get_oof(ada, Xtrain, y_train, Xtest) 
xgb_oof_train, xgb_oof_test = get_oof(xgb,Xtrain, y_train, Xtest)
lgb_oof_train, lgb_oof_test = get_oof(lgb,Xtrain, y_train, Xtest) 

ExtraTreesClassifier start ...
times:  33.73877763748169
    Fold 0 loss: 2.382847
times:  66.85583925247192
    Fold 1 loss: 2.382000
times:  98.86633443832397
    Fold 2 loss: 2.380802
times:  130.87490034103394
    Fold 3 loss: 2.381786
times:  163.3896198272705
    Fold 4 loss: 2.389944
times:  163.38984417915344
RandomForestClassifier start ...
times:  21.81128978729248
    Fold 0 loss: 2.392088


  warn("Warm-start fitting without increasing n_estimators does not "


times:  29.53079390525818
    Fold 1 loss: 2.382568


  warn("Warm-start fitting without increasing n_estimators does not "


times:  37.55006980895996
    Fold 2 loss: 2.380660


  warn("Warm-start fitting without increasing n_estimators does not "


times:  45.56214451789856
    Fold 3 loss: 2.382015


  warn("Warm-start fitting without increasing n_estimators does not "


times:  53.47763967514038
    Fold 4 loss: 2.389935
times:  53.477816343307495
XGBClassifier start ...
times:  123.18657445907593
    Fold 0 loss: 2.295664
times:  245.95054006576538
    Fold 1 loss: 2.288511
times:  368.4545660018921
    Fold 2 loss: 2.292495
times:  491.32444643974304
    Fold 3 loss: 2.294728
times:  614.5949122905731
    Fold 4 loss: 2.304465
times:  614.5951476097107
LGBMClassifier start ...
times:  73.75813722610474
    Fold 0 loss: 2.297780
times:  147.70374274253845
    Fold 1 loss: 2.288606
times:  223.45040249824524
    Fold 2 loss: 2.294831
times:  302.347368478775
    Fold 3 loss: 2.298991
times:  384.03445863723755
    Fold 4 loss: 2.308026
times:  384.03471183776855
CPU times: user 36min 47s, sys: 1min 16s, total: 38min 4s
Wall time: 20min 15s


In [38]:
len(lgb_oof_test)

112071

# stack 模型融合

In [39]:
params = {
    "n_estimators" : 2000,
    "objective": "multi:softprob",
    "num_class": 12,
    "booster" : "gbtree",
    "eval_metric": "mlogloss",
    "eta": 0.3,
    "max_depth": 3,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 0,
    }

In [40]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, 
                          xgb_oof_train, lgb_oof_train), axis=1)
x_test = np.concatenate((  et_oof_test, rf_oof_test, 
                          xgb_oof_test, lgb_oof_test), axis=1)

In [183]:
x_test.shape

(112071, 72)

In [27]:
import xgboost

In [41]:
dtrain = xgboost.DMatrix(x_train, y_train)
dtest = xgboost.DMatrix(x_test)

In [42]:
watchlist = [(dtrain, 'train')]

In [43]:
xgb_model = xgboost.train(params, dtrain, 100, early_stopping_rounds = 50, evals=watchlist, verbose_eval=True)

[0]	train-mlogloss:2.40739
Will train until train-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:2.36294
[2]	train-mlogloss:2.33043
[3]	train-mlogloss:2.30706
[4]	train-mlogloss:2.28811
[5]	train-mlogloss:2.27468
[6]	train-mlogloss:2.26405
[7]	train-mlogloss:2.25504
[8]	train-mlogloss:2.24781
[9]	train-mlogloss:2.24103
[10]	train-mlogloss:2.23617
[11]	train-mlogloss:2.23146
[12]	train-mlogloss:2.22736
[13]	train-mlogloss:2.22346
[14]	train-mlogloss:2.2198
[15]	train-mlogloss:2.2164
[16]	train-mlogloss:2.21326
[17]	train-mlogloss:2.20995
[18]	train-mlogloss:2.20748
[19]	train-mlogloss:2.20451
[20]	train-mlogloss:2.20174
[21]	train-mlogloss:2.19945
[22]	train-mlogloss:2.1974
[23]	train-mlogloss:2.19463
[24]	train-mlogloss:2.19238
[25]	train-mlogloss:2.19026
[26]	train-mlogloss:2.18847
[27]	train-mlogloss:2.18651
[28]	train-mlogloss:2.1846
[29]	train-mlogloss:2.18234
[30]	train-mlogloss:2.18039
[31]	train-mlogloss:2.17848
[32]	train-mlogloss:2.17639
[33]	train-mlogloss:2.17469


In [45]:
test_pred = xgb_model.predict(dtest)

In [46]:
len(test_pred)

112071

# 生成提交文件

In [51]:
test = pd.read_csv(os.path.join(datadir,"gender_age_test.csv.zip"), index_col='device_id')

In [49]:
submit = pd.DataFrame(test_pred, columns=list(y_encoder.classes_))

In [52]:
submit["device_id"] = test.index
submit = submit.set_index("device_id")

In [53]:
submit = submit[['F23-','F24-26','F27-28','F29-32','F33-42','F43+','M22-','M23-26','M27-28','M29-31','M32-38','M39+']]

In [54]:
len(submit)

112071

In [55]:
submit.to_csv('submission.csv', index=True,index_label='device_id')

In [56]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>