## TalkingData Mobile User Demographics (Kaggle Competition)
1. Data preprocessing
2. Benchmark models: random forest and naive bayes
3. Hierarchical data of multiple levels
4. XGBoost
5. Keras

In [1]:
import time
import random

# numpy, scipy, and pandas
import numpy as np
import pandas as pd
from scipy import sparse

# scikit-learn for machine learning
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier

#### Data Preprocessing

In [2]:
# load Data

print("# Load Data of Phone Brand and Device Model")
phone_brand = pd.read_csv("../input/phone_brand_device_model.csv", dtype={'device_id': np.str})
phone_brand.drop_duplicates('device_id', keep='first', inplace=True)

print("# Load Training Data")
train_data = pd.read_csv("../input/gender_age_train.csv", dtype={'device_id': np.str})

print("# Load Testing Data")
test_data = pd.read_csv("../input/gender_age_test.csv", dtype={'device_id': np.str})

full_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)
train_size = len(train_data)
full_data = pd.merge(full_data, phone_brand, how='left', on='device_id', left_index=True)

print ("# Data Loaded.")
full_data.info()

# Load Data of Phone Brand and Device Model
# Load Training Data
# Load Testing Data
# Data Loaded.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 186716 entries, 56800 to 106263
Data columns (total 6 columns):
age             74645 non-null float64
device_id       186716 non-null object
gender          74645 non-null object
group           74645 non-null object
phone_brand     186716 non-null object
device_model    186716 non-null object
dtypes: float64(1), object(5)
memory usage: 10.0+ MB


In [3]:
# label/encode target   
LBL = preprocessing.LabelEncoder()
Y = LBL.fit_transform(full_data['group'][:train_size])
target_names = LBL.classes_
print ("target group names:", target_names)

('target group names:', array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object))


In [4]:
# one-hot-encoding
full_ohe=pd.get_dummies(full_data[['phone_brand', 'device_model']], sparse=True)
full_ohe=sparse.csr_matrix(full_ohe)

# lable encoding
full_le = pd.DataFrame()
full_le['phone_brand']=LBL.fit_transform(full_data['phone_brand'])
full_le['device_model']=LBL.fit_transform(full_data['device_model'])

print full_ohe.shape, full_le.shape

(186716, 1730) (186716, 2)


#### Benchmark models: random forest and naive bayes

In [5]:
# random forest with label encoding
model = grid_search.GridSearchCV(RandomForestClassifier(n_estimators=100), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)   
model.fit(full_le[:train_size], Y)
print ("best params:", model.best_params_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-4.174543 -   6.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    6.3s


[CV] ...................................... , score=-4.073983 -   3.7s
[CV]  ................................................................
[CV] ...................................... , score=-3.968685 -   3.8s
[CV]  ................................................................
[CV] ...................................... , score=-3.829474 -   3.7s
('best params:', {})


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   17.6s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   17.6s finished


In [6]:
# random forest with one-hot-encoding
model = grid_search.GridSearchCV(RandomForestClassifier(n_estimators=100), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)   
model.fit(full_ohe[:train_size], Y)
print ("best params:", model.best_params_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-3.896093 -  45.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   45.5s


[CV] ...................................... , score=-3.853874 -  43.1s
[CV]  ................................................................
[CV] ...................................... , score=-3.784981 -  42.8s
[CV]  ................................................................
[CV] ...................................... , score=-3.659586 -  43.5s
('best params:', {})


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min finished


In [7]:
# naive bayes with label encoding
model = grid_search.GridSearchCV(GaussianNB(), 
                                 param_grid={}, 
                                 scoring='log_loss',
                                 n_jobs=1,
                                 iid=True,
                                 cv=4, 
                                 refit=False,
                                 verbose=10)   
model.fit(full_le[:train_size], Y)
print ("best params:", model.best_params_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV] ...................................... , score=-2.420795 -   0.1s
[CV]  ................................................................
[CV] ...................................... , score=-2.421838 -   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.1s


[CV] ...................................... , score=-2.426279 -   0.1s
[CV]  ................................................................
[CV] ...................................... , score=-2.426900 -   0.1s
('best params:', {})


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s finished
