In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
import random
import time
random.seed(2020)

In [5]:
datadir = './drive/My Drive/talkingdata'

In [6]:
train = pd.read_csv(os.path.join(datadir, "gender_age_train.csv.zip"), index_col='device_id')
test = pd.read_csv(os.path.join(datadir,"gender_age_test.csv.zip"), index_col='device_id')
phone_data = pd.read_csv(os.path.join(datadir,"phone_brand_device_model.csv.zip")).drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,"events.csv.zip"), index_col='event_id')
app_events = pd.read_csv(os.path.join(datadir,"app_events.csv.zip"), usecols=['event_id','app_id','is_active'], dtype={'is_active': bool}, index_col='event_id')
app_labels = pd.read_csv(os.path.join(datadir,"app_labels.csv.zip"))
label_cate = pd.read_csv(os.path.join(datadir,"label_categories.csv.zip"))


  mask |= (ar1 == a)


In [7]:
train['train_index'], test['test_index'] = np.arange(train.shape[0]), np.arange(test.shape[0])
train = train.drop(['gender','age'], axis=1)

In [8]:
y_encoder = LabelEncoder().fit(train.group)
train['group'] = y_encoder.transform(train.group)
yclasses_ = len(y_encoder.classes_)
print('total number of target classes: ', yclasses_, y_encoder.classes_)

total number of target classes:  12 ['F23-' 'F24-26' 'F27-28' 'F29-32' 'F33-42' 'F43+' 'M22-' 'M23-26'
 'M27-28' 'M29-31' 'M32-38' 'M39+']


In [9]:
#Join App and label

In [10]:
app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())]
app_encoder = LabelEncoder().fit(app_events.app_id)
label_encoder = LabelEncoder().fit(app_labels.label_id)
app_events['app'] = app_encoder.transform(app_events.app_id)
app_labels['app'] = app_encoder.transform(app_labels.app_id)
app_labels['label'] = label_encoder.transform(app_labels.label_id)
app_id_to_label = app_labels.join(label_cate, how='left', on='label_id', lsuffix='k')[['app_id','app','label','category']]
joined_app_events = app_events.join(events)
joined_app_events = joined_app_events.groupby(['device_id','app'])['app'].agg(['size'])
joined_app_events = ((joined_app_events.join(train['train_index'], how='left')).join(test['test_index'], how='left')).reset_index()

In [11]:
joined_labels = joined_app_events.merge(app_labels[['app','label']])[['device_id','label','app']]
joined_labels = joined_labels.groupby(['device_id','label'])['app'].agg(['size'])
joined_labels = ((joined_labels.join(train['train_index'], how='left')).join(test['test_index'], how='left')).reset_index()


In [12]:
joined_labels.head(2)

Unnamed: 0,device_id,label,size,train_index,test_index
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,


In [13]:
""" 1 - Brand """
brand_encoder = LabelEncoder().fit(phone_data.phone_brand)
phone_data['converted_brand_only'] = brand_encoder.transform(phone_data.phone_brand)

train['brand_only'] = phone_data['converted_brand_only']
test['brand_only'] = phone_data['converted_brand_only']

sparse_train_brand_only = csr_matrix((np.ones(train.shape[0]), (train.train_index, train.brand_only)))
sparse_test_brand_only = csr_matrix((np.ones(test.shape[0]), (test.test_index, test.brand_only)))

In [14]:
train.head(3)

Unnamed: 0_level_0,group,train_index,brand_only
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,10,0,51
-2897161552818060146,10,1,51
-8260683887967679142,10,2,51


In [15]:
""" 2 - Event time """
events['hour'] = pd.to_datetime(events['timestamp']).dt.hour
stampping = events.groupby(['device_id','hour'])['timestamp'].agg(['size'])
stampping = ((stampping.join(train['train_index'], how='left')).join(test['test_index'], how='left')).reset_index()
d = stampping.dropna(subset=['train_index'])
sparse_event_time_train = csr_matrix((np.ones(d.shape[0]), (d.train_index, d.hour)), shape=(train.shape[0], 24))
d = stampping.dropna(subset=['test_index'])
sparse_event_time_test = csr_matrix((np.ones(d.shape[0]), (d.test_index,d.hour)), shape=(test.shape[0], 24))

In [16]:
events['timestamp'].head(3)

event_id
1    2016-05-01 00:55:25
2    2016-05-01 00:54:12
3    2016-05-01 00:08:05
Name: timestamp, dtype: object

In [17]:
phone_data.head(3)

Unnamed: 0_level_0,phone_brand,device_model,converted_brand_only
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15


In [18]:
model = phone_data.phone_brand.str.cat(phone_data.device_model)

In [19]:
model.head(3)

device_id
-8890648629457979026           小米红米
 1277779817574759137         小米MI 2
 5137427614288105724    三星Galaxy S4
Name: phone_brand, dtype: object

In [20]:
""" 3 - Device model """
model = phone_data.phone_brand.str.cat(phone_data.device_model)

phone_encoder = LabelEncoder().fit(model)
phone_data['converted_brand'] = phone_encoder.transform(model)

train['brand'] = phone_data['converted_brand']
test['brand'] = phone_data['converted_brand']

In [21]:
sparse_train_brand = csr_matrix((np.ones(train.shape[0]), (train.train_index, train.brand)))
sparse_test_brand = csr_matrix((np.ones(test.shape[0]), (test.test_index, test.brand)))

In [22]:
""" 4 - Bag of Apps """
d = joined_app_events.dropna(subset=['train_index'])
sparse_train_appusage = csr_matrix((np.ones(d.shape[0]), (d.train_index, d.app)), shape=(train.shape[0],len(app_encoder.classes_)))
d = joined_app_events.dropna(subset=['test_index'])
sparse_test_appusage = csr_matrix((np.ones(d.shape[0]), (d.test_index,d.app)), shape=(test.shape[0],len(app_encoder.classes_)))


In [23]:
joined_app_events

Unnamed: 0,device_id,app,size,train_index,test_index
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,
...,...,...,...,...,...
2369020,9222539910510672930,17358,1,,82667.0
2369021,9222539910510672930,17587,1,,82667.0
2369022,9222539910510672930,18039,1,,82667.0
2369023,9222539910510672930,18686,1,,82667.0


In [24]:
""" 5 - Bag of Label """
d = joined_labels.dropna(subset=['train_index'])
sparse_train_label = csr_matrix((np.ones(d.shape[0]), (d.train_index,d.label)), shape=(train.shape[0], len(label_encoder.classes_)))
d = joined_labels.dropna(subset=['test_index'])
sparse_test_label = csr_matrix((np.ones(d.shape[0]), (d.test_index,d.label)),shape=(test.shape[0], len(label_encoder.classes_)))


In [25]:
Xtrain = hstack((sparse_train_brand_only, sparse_train_brand,  sparse_train_appusage, sparse_train_label,sparse_event_time_train), format='csr')
Xtest =  hstack((sparse_test_brand_only, sparse_test_brand, sparse_test_appusage, sparse_test_label, sparse_event_time_test), format='csr')


In [26]:
params = {
    "n_estimators" : 2000,
    "objective": "multi:softprob",
    "num_class": 12,
    "booster" : "gbtree",
    "eval_metric": "mlogloss",
    "eta": 0.3,
    "max_depth": 3,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 0,
    }

In [27]:
import xgboost as xgb

In [29]:
dtrain = xgb.DMatrix(Xtrain, train['group'])

In [30]:
watchlist = [(dtrain, 'train')]

In [32]:
xgb_model = xgb.train(params, dtrain, 100, early_stopping_rounds = 50, evals=watchlist, verbose_eval=True)

[0]	train-mlogloss:2.44971
Will train until train-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:2.4236
[2]	train-mlogloss:2.40438
[3]	train-mlogloss:2.38968
[4]	train-mlogloss:2.37771
[5]	train-mlogloss:2.3676
[6]	train-mlogloss:2.35913
[7]	train-mlogloss:2.35162
[8]	train-mlogloss:2.34514
[9]	train-mlogloss:2.33965
[10]	train-mlogloss:2.33467
[11]	train-mlogloss:2.33026
[12]	train-mlogloss:2.32595
[13]	train-mlogloss:2.32204
[14]	train-mlogloss:2.31835
[15]	train-mlogloss:2.31483
[16]	train-mlogloss:2.31144
[17]	train-mlogloss:2.30864
[18]	train-mlogloss:2.3057
[19]	train-mlogloss:2.30266
[20]	train-mlogloss:2.30001
[21]	train-mlogloss:2.2975
[22]	train-mlogloss:2.29511
[23]	train-mlogloss:2.29252
[24]	train-mlogloss:2.29013
[25]	train-mlogloss:2.28773
[26]	train-mlogloss:2.28561
[27]	train-mlogloss:2.28361
[28]	train-mlogloss:2.28162
[29]	train-mlogloss:2.27968
[30]	train-mlogloss:2.27772
[31]	train-mlogloss:2.27556
[32]	train-mlogloss:2.27364
[33]	train-mlogloss:2.2716
[

In [33]:
xgb_pred = xgb_model.predict(xgb.DMatrix(Xtest))

In [34]:
submit = pd.DataFrame(xgb_pred, columns=list(y_encoder.classes_))

In [38]:
test.index

Int64Index([ 1002079943728939269, -1547860181818787117,  7374582448058474277,
            -6220210354783429585, -5893464122623104785, -7560708697029818408,
              289797889702373958,  -402874006399730161,  5751283639860028129,
             -848943298935149395,
            ...
            -5345741084184024273,  8859796566213040200,  7583643168012624381,
              238294472934704817,  7208587612007205081,  4280900819321920929,
              818534825520551359, -8956851351560395765,  6097318236795836256,
              622421180514002079],
           dtype='int64', name='device_id', length=112071)

In [39]:
submit["device_id"] = test.index
submit = submit.set_index("device_id")

In [40]:
submit.head(3)

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.004736,0.015556,0.014993,0.020207,0.032152,0.032246,0.012041,0.063997,0.099256,0.189261,0.157068,0.358488
-1547860181818787117,0.010079,0.012855,0.020628,0.052011,0.073585,0.098416,0.019084,0.062634,0.061581,0.201847,0.150561,0.236718
7374582448058474277,0.060708,0.068033,0.032142,0.162179,0.129255,0.064611,0.0278,0.039294,0.053127,0.102959,0.148988,0.110904


In [None]:
#Private Score 2.28671