# TalkingData Mobile User Demographics

https://www.kaggle.com/c/talkingdata-mobile-user-demographics/overview

Private Score : 2.32616  
Public Score : 2.32153

In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
import random
import time
random.seed(2020)

# 数据预处理

In [3]:
datadir = './talkingdata/'

In [4]:
events = pd.read_csv(os.path.join(datadir,'events.csv.zip'), dtype={'device_id': np.str})

In [5]:
events.head(2)

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


In [6]:
events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')

In [7]:
events_count = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')

In [8]:
events_count.head(2)

Unnamed: 0,device_id,counts
0,29182687948017175,256
1,-6401643145415154744,73


In [9]:
phone_brand_device = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv.zip'), dtype={'device_id': np.str})
phone_brand_device.drop_duplicates('device_id', keep='first', inplace=True)

In [10]:
def label_column(table, *col):
    maplst =[]
    for c in col:
        labels = list(table[c].value_counts().index)
        mappings = {labels[i]:i for i in range(len(labels))}
        table = table.replace({c: mappings})
        maplst.append(mappings)
    return table, maplst

In [11]:
phone_brand_device.head(2)

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2


In [12]:
%%time
phone_brand_device, mappings = label_column(phone_brand_device, 'phone_brand','device_model')

CPU times: user 16.8 s, sys: 378 ms, total: 17.2 s
Wall time: 17.6 s


In [13]:
phone_brand_device.head(2)

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,0,11
1,1277779817574759137,0,29


In [14]:
train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv.zip'), dtype={'device_id': np.str})

In [15]:
train, g_map = label_column(train,'group')

In [16]:
train.head(2)

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,1
1,-2897161552818060146,M,35,1


In [207]:
device_info = pd.read_csv('device_info.csv', dtype={'device_id': np.str})
device_info.drop('Unnamed: 0', axis = 1, inplace = True)
device_info = device_info[['device_id','age_mean','age_median','age_mode','M_mean','F_mean']]

In [None]:
train = train.drop(['age'], axis=1)
train = train.drop(['gender'], axis=1)

In [19]:
train = pd.merge(train, phone_brand_device, how='left', on='device_id')
train = pd.merge(train, events_count, how='left', on='device_id')
train = pd.merge(train, device_info, how='left', on='device_id')
train.fillna(-1, inplace=True)

In [59]:
train.head(3)

Unnamed: 0,phone_brand,device_model,counts,age_mean,age_median,age_mode,M_mean,F_mean
0,0,29,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,29,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0,29,1.0,31.156081,29.316327,26.673469,63.666168,36.333832


In [58]:
train.drop('device_id',1,inplace = True)

In [214]:
test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv.zip'), dtype={'device_id': np.str})
test = pd.merge(test, phone_brand_device, how='left', on='device_id')
test = pd.merge(test, events_count, how='left', on='device_id')
test = pd.merge(test, device_info, on='device_id', how='left')
test.fillna(-1, inplace=True)

In [25]:
features = list(test.columns.values)

In [26]:
features.remove('device_id')

In [27]:
features

['phone_brand',
 'device_model',
 'counts',
 'age_mean',
 'age_median',
 'age_mode',
 'M_mean',
 'F_mean']

# 深度学习模型

In [29]:
import tensorflow as tf

In [30]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
import tensorflow.keras.backend as K

In [48]:
class FeaturesEmbedding(Layer):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = Embedding(np.sum(np.sum(field_dims)), embed_dim)
        self.offsets = tf.constant(np.expand_dims(np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.float32),0))
    
    def build(self, input_shape):    
        super().build(input_shape)
        
    def call(self, x):
        """
        :param x: [11,2,3] Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + self.offsets
        return self.embedding(x)

In [104]:
def nn_modle(features_schema,sparse_features_schema,emb_dim = 5,sparse_units = [150,100,80,1], numeric_units = [150,100,80], units = [60,40,20] ,class_num = 12,dropout = 0.2):
    col = sum(features_schema)
    inputs = tf.keras.Input(shape=[col])
    x_1,x_2 = tf.split(inputs,features_schema,1)
    emb_x1 = FeaturesEmbedding(sparse_features_schema, emb_dim)(x_1)
    for u in sparse_units:
        x1 = BatchNormalization()(emb_x1)
        x1 = Dense(u, activation='relu')(x1)
        x1 = Dropout(dropout)(x1)
    x1 = tf.squeeze(x1, -1)

    for u in numeric_units:
        x_2 = BatchNormalization()(x_2)
        x_2 = Dense(u, activation='relu')(x_2)
        x_2 = Dropout(dropout)(x_2)
    x = tf.concat([x1,x_2],-1)
    
    for u in units:
        x = BatchNormalization()(x)
        x = Dense(u, activation='relu')(x)
        x = Dropout(dropout)(x)
    outputs = Dense(class_num, activation = 'softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [70]:
train.dtypes

phone_brand       int64
device_model      int64
counts          float64
age_mean        float64
age_median      float64
age_mode        float64
M_mean          float64
F_mean          float64
dtype: object

In [None]:
target = train.pop('group')

In [60]:
train_data = tf.data.Dataset.from_tensor_slices((train.values, target.values))
train_data = train_data.shuffle(1024).batch(32)

In [61]:
train_data

<BatchDataset shapes: ((None, 8), (None,)), types: (tf.float64, tf.int64)>

In [67]:
features_schema = [2,6]
sparse_features_schema = [len(x) for x in mappings]

In [105]:
kera_model = nn_modle(features_schema,sparse_features_schema)

In [106]:
kera_model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_split_7 (TensorFlow [(None, 2), (None, 6 0           input_8[0][0]                    
__________________________________________________________________________________________________
batch_normalization_37 (BatchNo (None, 6)            24          tf_op_layer_split_7[0][1]        
__________________________________________________________________________________________________
dense_42 (Dense)                (None, 150)          1050        batch_normalization_37[0][0]     
______________________________________________________________________________________

In [107]:
kera_model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy']) 

In [108]:
kera_model.fit(train_data, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fd7beb10890>

In [109]:
kera_model.save_weights('kera_model_weights.h5')

In [146]:
kera_pred = kera_model.predict(train_data)

In [147]:
kera_pred

array([[0.1560308 , 0.15686326, 0.10753277, ..., 0.03725632, 0.04308367,
        0.02958808],
       [0.19426873, 0.13401966, 0.07715599, ..., 0.02643442, 0.04772189,
        0.0299214 ],
       [0.17595544, 0.16216291, 0.09856718, ..., 0.02884315, 0.03918599,
        0.02488044],
       ...,
       [0.15301627, 0.14927465, 0.10379579, ..., 0.03987873, 0.04773144,
        0.03364967],
       [0.0827481 , 0.11233792, 0.10025129, ..., 0.0850806 , 0.06809048,
        0.06244756],
       [0.01935786, 0.24917814, 0.41720453, ..., 0.09369992, 0.00234167,
        0.00379758]], dtype=float32)

# XGBoost

In [110]:
features = list(test.columns.values)

In [112]:
features.remove('device_id')

In [113]:
features

['phone_brand',
 'device_model',
 'counts',
 'age_mean',
 'age_median',
 'age_mode',
 'M_mean',
 'F_mean']

In [114]:
params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster" : "gbtree",
    "eval_metric": "mlogloss",
    "eta": 0.3,
    "max_depth": 3,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 0,
    }

In [137]:
num_boost_round = 200

In [None]:
train['target'] = target

In [139]:
X_train, X_valid = train_test_split(train, test_size=0.1, random_state=0)

In [140]:
y_train = X_train['target']
y_valid = X_valid['target']
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

In [133]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [141]:
xgb_model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, verbose_eval=True)

[0]	train-mlogloss:2.43146	eval-mlogloss:2.43113
[1]	train-mlogloss:2.39733	eval-mlogloss:2.39676
[2]	train-mlogloss:2.37589	eval-mlogloss:2.37459
[3]	train-mlogloss:2.35818	eval-mlogloss:2.35674
[4]	train-mlogloss:2.34555	eval-mlogloss:2.34376
[5]	train-mlogloss:2.33588	eval-mlogloss:2.33376
[6]	train-mlogloss:2.32754	eval-mlogloss:2.32558
[7]	train-mlogloss:2.32128	eval-mlogloss:2.31936
[8]	train-mlogloss:2.31633	eval-mlogloss:2.31447
[9]	train-mlogloss:2.312	eval-mlogloss:2.30987
[10]	train-mlogloss:2.30819	eval-mlogloss:2.3062
[11]	train-mlogloss:2.30514	eval-mlogloss:2.30328
[12]	train-mlogloss:2.30206	eval-mlogloss:2.30039
[13]	train-mlogloss:2.2996	eval-mlogloss:2.29818
[14]	train-mlogloss:2.29754	eval-mlogloss:2.2962
[15]	train-mlogloss:2.29557	eval-mlogloss:2.29442
[16]	train-mlogloss:2.29408	eval-mlogloss:2.29303
[17]	train-mlogloss:2.29234	eval-mlogloss:2.29138
[18]	train-mlogloss:2.29079	eval-mlogloss:2.28992
[19]	train-mlogloss:2.28939	eval-mlogloss:2.28863
[20]	train-mlog

[164]	train-mlogloss:2.19613	eval-mlogloss:2.22096
[165]	train-mlogloss:2.19569	eval-mlogloss:2.22054
[166]	train-mlogloss:2.19525	eval-mlogloss:2.22034
[167]	train-mlogloss:2.19474	eval-mlogloss:2.21998
[168]	train-mlogloss:2.19419	eval-mlogloss:2.21979
[169]	train-mlogloss:2.1938	eval-mlogloss:2.21941
[170]	train-mlogloss:2.19333	eval-mlogloss:2.21901
[171]	train-mlogloss:2.19272	eval-mlogloss:2.21875
[172]	train-mlogloss:2.1922	eval-mlogloss:2.21843
[173]	train-mlogloss:2.19168	eval-mlogloss:2.21811
[174]	train-mlogloss:2.19124	eval-mlogloss:2.21772
[175]	train-mlogloss:2.19067	eval-mlogloss:2.21737
[176]	train-mlogloss:2.19024	eval-mlogloss:2.21705
[177]	train-mlogloss:2.18982	eval-mlogloss:2.21685
[178]	train-mlogloss:2.18932	eval-mlogloss:2.21645
[179]	train-mlogloss:2.18878	eval-mlogloss:2.21622
[180]	train-mlogloss:2.18824	eval-mlogloss:2.21589
[181]	train-mlogloss:2.18783	eval-mlogloss:2.2155
[182]	train-mlogloss:2.18728	eval-mlogloss:2.21532
[183]	train-mlogloss:2.18683	eval-

In [148]:
xgb_pred = xgb_model.predict(xgb.DMatrix(train[features]))

In [149]:
xgb_pred

array([[0.16382563, 0.12918161, 0.09303972, ..., 0.05826699, 0.06530543,
        0.0411852 ],
       [0.16382563, 0.12918161, 0.09303972, ..., 0.05826699, 0.06530543,
        0.0411852 ],
       [0.05535437, 0.08692884, 0.08729146, ..., 0.12650701, 0.1452902 ,
        0.02584402],
       ...,
       [0.12111522, 0.12626387, 0.10864934, ..., 0.05907835, 0.05936706,
        0.03945715],
       [0.20668621, 0.21970426, 0.06484413, ..., 0.03413047, 0.01756136,
        0.02316338],
       [0.38456577, 0.08229715, 0.02550577, ..., 0.00239092, 0.03114078,
        0.02531103]], dtype=float32)

# 融合模型

In [150]:
from sklearn.linear_model import LogisticRegression

In [152]:
lr_train = np.concatenate([kera_pred,xgb_pred],axis = 1)

In [153]:
lr_train

array([[0.1560308 , 0.15686326, 0.10753277, ..., 0.05826699, 0.06530543,
        0.0411852 ],
       [0.19426873, 0.13401966, 0.07715599, ..., 0.05826699, 0.06530543,
        0.0411852 ],
       [0.17595544, 0.16216291, 0.09856718, ..., 0.12650701, 0.1452902 ,
        0.02584402],
       ...,
       [0.15301627, 0.14927465, 0.10379579, ..., 0.05907835, 0.05936706,
        0.03945715],
       [0.0827481 , 0.11233792, 0.10025129, ..., 0.03413047, 0.01756136,
        0.02316338],
       [0.01935786, 0.24917814, 0.41720453, ..., 0.00239092, 0.03114078,
        0.02531103]], dtype=float32)

In [154]:
clf = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
clf.fit(lr_train, target)

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [157]:
clf.score(lr_train, target)

0.21776079316933683

In [142]:
from sklearn.ensemble import RandomForestClassifier

# 生成提交文件

In [217]:
test_kera_pred = kera_model.predict(test[features])
test_xgb_pred = xgb_model.predict(xgb.DMatrix(test[features]))
test_lr_data = np.concatenate([test_kera_pred,test_xgb_pred],axis = 1)

In [218]:
test_pred = clf.predict_proba(test_lr_data)
test_pred

array([[0.05814623, 0.34967505, 0.21094363, ..., 0.03975259, 0.02151034,
        0.01876983],
       [0.05678468, 0.26099491, 0.32795892, ..., 0.04377008, 0.02016195,
        0.01769925],
       [0.06006075, 0.09993612, 0.24621911, ..., 0.07537703, 0.04467262,
        0.04225589],
       ...,
       [0.14958088, 0.10468227, 0.08690363, ..., 0.05162138, 0.05899057,
        0.04431569],
       [0.14958088, 0.10468227, 0.08690363, ..., 0.05162138, 0.05899057,
        0.04431569],
       [0.15612113, 0.10546995, 0.08866825, ..., 0.05010767, 0.06123755,
        0.04481921]])

In [232]:
submit = pd.DataFrame(test_pred, columns=list(g_map[0].keys()))

In [233]:
submit["device_id"] = test["device_id"]
submit = submit.set_index("device_id")

In [234]:
submit = submit[['F23-','F24-26','F27-28','F29-32','F33-42','F43+','M22-','M23-26','M27-28','M29-31','M32-38','M39+']]

In [235]:
submit.to_csv('submission.csv', index=True,index_label='device_id')