## TalkingData Mobile User Demographics (Kaggle Competition)
1. Data preprocessing
2. Benchmark models: random forest and naive bayes
3. Leave one out encoding
4. Hierarchical data of multiple levels 
5. XGBoost
6. Keras

In [1]:
import time
import random

# numpy, scipy, and pandas
import numpy as np
import pandas as pd
from scipy import sparse

# scikit-learn for machine learning
from sklearn import preprocessing, metrics, grid_search, cross_validation#, pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# xgboost and keras
import xgboost as xgb
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, Nadam
from keras.layers.advanced_activations import SReLU
from keras.layers.core import Activation
from keras.utils import np_utils

Using Theano backend.


#### Part 1. Data Preprocessing

In [2]:
# load Data

print("# Load Data of Phone Brand and Device Model")
phone_brand = pd.read_csv("../input/phone_brand_device_model.csv", dtype={'device_id': np.str})
phone_brand.drop_duplicates('device_id', keep='first', inplace=True)

print("# Load Training Data")
train_data = pd.read_csv("../input/gender_age_train.csv", dtype={'device_id': np.str})

print("# Load Testing Data")
test_data = pd.read_csv("../input/gender_age_test.csv", dtype={'device_id': np.str})

full_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)
train_size = len(train_data)
full_data = pd.merge(full_data, phone_brand, how='left', on='device_id', left_index=True)

print ("# Data Loaded.")
full_data.info()

# Load Data of Phone Brand and Device Model
# Load Training Data
# Load Testing Data
# Data Loaded.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 186716 entries, 56800 to 106263
Data columns (total 6 columns):
age             74645 non-null float64
device_id       186716 non-null object
gender          74645 non-null object
group           74645 non-null object
phone_brand     186716 non-null object
device_model    186716 non-null object
dtypes: float64(1), object(5)
memory usage: 10.0+ MB


In [21]:
# label/encode target
LBL = preprocessing.LabelEncoder()
Y = LBL.fit_transform(full_data['group'][:train_size])
Y_labels = np_utils.to_categorical(Y)

target_names = LBL.classes_
print ("target group names:", target_names)
device_id = full_data[train_size:]["device_id"].values

('target group names:', array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object))


#### Part 2. Benchmark models: random forest and naive bayes

In [4]:
# one hot encoding
full_ohe = pd.get_dummies(full_data[['phone_brand', 'device_model']], sparse=True)
full_ohe = sparse.csr_matrix(full_ohe)

# lable encoding
full_le = pd.DataFrame()
full_le['phone_brand'] = LBL.fit_transform(full_data['phone_brand'])
full_le['device_model'] = LBL.fit_transform(full_data['device_model'])

print full_ohe.shape, full_le.shape

(186716, 1730) (186716, 2)


In [5]:
# random forest with label encoding
model = grid_search.GridSearchCV(RandomForestClassifier(n_estimators=100), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)
model.fit(full_le[:train_size], Y)
print ("grid scores:", model.grid_scores_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-4.169452 -   3.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    3.3s


[CV] ...................................... , score=-4.053912 -   3.3s
[CV]  ................................................................
[CV] ...................................... , score=-3.956223 -   3.3s
[CV]  ................................................................
[CV] ...................................... , score=-3.818557 -   3.8s
('grid scores:', [mean: -3.99956, std: 0.12890, params: {}])


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   13.6s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.7s finished


In [6]:
# random forest with one hot encoding
model = grid_search.GridSearchCV(RandomForestClassifier(n_estimators=100), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)   
model.fit(full_ohe[:train_size], Y)
print ("grid scores:", model.grid_scores_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-3.917806 -  43.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   43.0s


[CV] ...................................... , score=-3.836170 -  43.3s
[CV]  ................................................................
[CV] ...................................... , score=-3.776902 -  45.3s
[CV]  ................................................................
[CV] ...................................... , score=-3.626046 -  42.7s
('grid scores:', [mean: -3.78925, std: 0.10667, params: {}])


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min finished


In [7]:
# naive bayes with label encoding
model = grid_search.GridSearchCV(GaussianNB(), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)   
model.fit(full_le[:train_size], Y)
print ("grid scores:", model.grid_scores_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-2.420795 -   0.1s
[CV]  ................................................................
[CV] ...................................... , score=-2.421838 -   0.1s
[CV]  ................................................................
[CV] ...................................... , score=-2.426279 -   0.1s
[CV]  ................................................................
[CV] ...................................... , score=-2.426900 -   0.1s
('grid scores:', [mean: -2.42395, std: 0.00267, params: {}])


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s finished


#### Part 3. Leave one out encoding

In [8]:
# define a function for leave one out encoding
def loo_encode(data,cat_col,target_col,train_size,random_rate=0.05):
    print ("leave one out encoding %s on %s" % (cat_col, target_col))
    aggr = data[:train_size].groupby(cat_col)[target_col].agg([np.mean,np.size,np.sum]).reset_index()
    data = pd.merge(data, aggr, how='left', on=cat_col)
    data['loo'] = data['mean']
    data['loo'][:train_size] = data[:train_size].apply(lambda row: 0 if row['size']<=1 
        else (row['sum']-row[target_col])/(row['size']-1)*random.uniform(1-random_rate, 1+random_rate), axis=1).values
 
    return data['loo'].fillna(0).values

In [9]:
# label/encode target "gender" 
full_data['gender'] = full_data['gender'].apply(lambda x:1 if x=='F' else 0)

# concatenate "phone_brand" and "device_model" to create a new categorical feature
full_data['brand_model'] = full_data['phone_brand'] + full_data['device_model']
cat_cols = ['phone_brand', 'device_model', 'brand_model']

In [10]:
# leave one out encoding for 3 categorical features on 2 targets
loo_cols = []
for c in cat_cols:
    for t in ['age','gender']:
        loo_col=c+'_'+t+'_loo'
        full_data[loo_col]=loo_encode(full_data[[c,t]],c,t,train_size,random_rate=0.05)
        loo_cols.append(loo_col)

leave one out encoding phone_brand on age


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


leave one out encoding phone_brand on gender
leave one out encoding device_model on age
leave one out encoding device_model on gender
leave one out encoding brand_model on age
leave one out encoding brand_model on gender


#### Part 4. Hierarchical data of multiple levels

In [11]:
# use XGBclassifier as baseline
X_train, X_val, y_train, y_val = train_test_split(full_data[loo_cols].values[:train_size], 
                                                  Y, 
                                                  train_size=.80, 
                                                  random_state=1234)

clf = xgb.XGBClassifier()
clf.fit(X_train,y_train)
pred_val=clf.predict_proba(X_val)
print ("mlogloss: %f" % (metrics.log_loss(y_val, pred_val)))

mlogloss: 2.398795


In [12]:
# load data: events, app_events, app_labels, and label_categories

start = time.time()
events = pd.read_csv("../input/events.csv", dtype={'device_id': np.str})
print ("Events loaded in %f seconds" %(time.time() - start))

start = time.time()
app_ev = pd.read_csv("../input/app_events.csv", dtype={'device_id': np.str})
print ("App Events loaded in %f seconds" %(time.time() - start))

start = time.time()
app_lab = pd.read_csv("../input/app_labels.csv", dtype={'device_id': np.str})
print ("App Labels loaded in %f seconds" %(time.time() - start))

start = time.time()
lab_cat = pd.read_csv("../input/label_categories.csv", dtype={'device_id': np.str})
print ("Label Categories loaded in %f seconds" %(time.time() - start))


Events loaded in 5.439940 seconds
App Events loaded in 16.932818 seconds
App Labels loaded in 0.176817 seconds
Label Categories loaded in 0.005235 seconds


In [13]:
# aggregate apps labels and categories by device
device_app = pd.merge(events[['device_id','event_id']], app_ev[['event_id','app_id']], 
                      on='event_id')[['device_id','app_id']].drop_duplicates()
device_label = pd.merge(device_app, app_lab, 
                        on='app_id')[['device_id','label_id']].drop_duplicates()
device_category = pd.merge(device_label, lab_cat, 
                           on='label_id')[['device_id','category']].drop_duplicates()
print ("device apps labels and categories aggregated in %f seconds" %(time.time() - start))


device apps labels and categories aggregated in 29.790449 seconds


In [14]:
# concatenate applications, labels, and categories to a big text column for each device
device_category = device_category.groupby("device_id")["category"].apply(list)
device_label = device_label.groupby("device_id")["label_id"].apply(list)
device_app = device_app.groupby("device_id")["app_id"].apply(list)
del app_ev,events, lab_cat, app_lab
print device_category.shape, device_label.shape, device_app.shape


(60822,) (60822,) (60822,)


In [15]:
# group categories/labels/apps by device id and merge them into one big list
full_data["category"] = full_data["device_id"].map(device_category).apply(
    lambda x:' '.join(c for c in x) if x==x else '') 
full_data["label"] = full_data["device_id"].map(device_label).apply(
    lambda x:' '.join(str(c) for c in x) if x==x else '') 
full_data["app"] = full_data["device_id"].map(device_app).apply(
    lambda x:' '.join(str(c) for c in x) if x==x else '') 

full_data['device_model'] = full_data['device_model'].apply(lambda x:x.replace(' ','')) 
full_data['category'] = full_data['category'].apply(lambda x:x.replace(' ','')) 

In [24]:
# count frequecies of each key word (brand, model, and app id), then convert the results to a sparse matrix
counter = CountVectorizer(min_df=1)
matrix = full_data[["phone_brand", "device_model", "app"]].astype(np.str).apply(
    lambda x: " ".join(s for s in x), axis=1)
matrix = counter.fit_transform(matrix)
num_of_feature = matrix.shape[1]

#### Part 5. XGBoost

In [17]:
# XGB baseline - brand, model, and application
X_train, X_val, y_train, y_val = train_test_split(matrix[:train_size], Y, train_size=.80, random_state=1234)

clf = xgb.XGBClassifier()
clf.fit(X_train,y_train)
pred_val = clf.predict_proba(X_val)
print ("mlogloss: %f" % (metrics.log_loss(y_val, pred_val)))

mlogloss: 2.333892


In [18]:
# use a trick called early stopping to find out the optimal number of iterations for XGB
clf = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.3)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mlogloss', early_stopping_rounds=20)


[0]	validation_0-mlogloss:2.45477
Will train until validation_0-mlogloss hasn't improved in 20 rounds.
[1]	validation_0-mlogloss:2.43366
[2]	validation_0-mlogloss:2.41792
[3]	validation_0-mlogloss:2.40539
[4]	validation_0-mlogloss:2.39627
[5]	validation_0-mlogloss:2.38913
[6]	validation_0-mlogloss:2.38314
[7]	validation_0-mlogloss:2.3784
[8]	validation_0-mlogloss:2.37424
[9]	validation_0-mlogloss:2.37131
[10]	validation_0-mlogloss:2.36721
[11]	validation_0-mlogloss:2.36405
[12]	validation_0-mlogloss:2.36181
[13]	validation_0-mlogloss:2.35941
[14]	validation_0-mlogloss:2.35756
[15]	validation_0-mlogloss:2.35547
[16]	validation_0-mlogloss:2.3535
[17]	validation_0-mlogloss:2.35169
[18]	validation_0-mlogloss:2.35011
[19]	validation_0-mlogloss:2.34878
[20]	validation_0-mlogloss:2.34719
[21]	validation_0-mlogloss:2.34572
[22]	validation_0-mlogloss:2.34481
[23]	validation_0-mlogloss:2.34363
[24]	validation_0-mlogloss:2.34261
[25]	validation_0-mlogloss:2.34171
[26]	validation_0-mlogloss:2.3406

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [19]:
# best_iteration = clf.best_iteration_
# best_score = clf.best_score_
best_iteration = 325
best_score = 2.29486

print (best_iteration, best_score)

(325, 2.29486)


In [20]:
# create submission
clf = xgb.XGBClassifier(n_estimators=best_iteration, learning_rate=0.3)
clf.fit(matrix[:train_size], Y)
pred = clf.predict_proba(matrix[train_size:])

result = pd.DataFrame(pred, columns=target_names)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('brand_model_app_xgb.csv', index=True, index_label='device_id')

#### Part 6. Keras

In [22]:
# convert sparse matrix to dense (in batches)

# generator for training
def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator 
    #https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

# generator for predicting            
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [23]:
# split data for validation
train = matrix[:train_size, :]
test = matrix[train_size:, :]
# num_class = 12
X_train, X_val, y_train, y_val = train_test_split(train, Y_labels, train_size=.80, random_state=1234)

In [None]:
# create MLP model with Keras

def create_model(input_dim):
    model = Sequential()
    #     Input Layer
    model.add(Dense(512, 
                    input_dim=input_dim,
                    activation='relu'))
    model.add(Dropout(0.2))
    
    #     Hidden Layer
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    #     Output Layer
    model.add(Dense(12, activation='softmax'))

    #     Optimizer
    nadam = Nadam(lr=1e-4)

    # Compile Model
    model.compile(loss='categorical_crossentropy',
                  optimizer=nadam)
    return model

model = create_model(num_of_feature)
early_stop = EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=30,
                         samples_per_epoch=train_size,
                         vali  dation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop]
                         )

In [None]:
for i in range(5):
    print ("Training model %d" % (i+1))
    model=create_model(num_of_feature)
    fit= model.fit_generator(generator=batch_generator(train, Y_labels, 128, True),
                             nb_epoch=<epoch of best model>,
                             samples_per_epoch=train.shape[0]
                             )
    preds=preds+model.predict_generator(generator=batch_generatorp(test, 128, False), val_samples=test.shape[0])
    
preds = preds/60
submission = pd.DataFrame(preds, columns=label_group.classes_)
submission["device_id"] = device_id
submission = submission.set_index("device_id")
submission.to_csv('submission.csv', index=True, index_label='device_id')