## TalkingData Mobile User Demographics (Kaggle Competition)
1. Data preprocessing
2. Benchmark models: random forest and naive bayes
3. Leave one out encoding
4. Hierarchical data of multiple levels 
5. XGBoost
6. Keras

In [None]:
import time
import random

# numpy, scipy, and pandas
import numpy as np
import pandas as pd
from scipy import sparse

# scikit-learn for machine learning
from sklearn import preprocessing, metrics, grid_search, cross_validation#, pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# xgboost and keras
import xgboost as xgb
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, Nadam
from keras.layers.advanced_activations import SReLU
from keras.layers.core import Activation
from keras.utils import np_utils

#### Part 1. Data Preprocessing

In [None]:
# Load the device-level tables. device_id is read as a string in every file
# so the identifiers compare equal across tables when merging.

print("# Load Data of Phone Brand and Device Model")
phone_brand = pd.read_csv("../input/phone_brand_device_model.csv", dtype={'device_id': str})
# keep a single brand/model row per device so the merge below cannot
# multiply rows of full_data
phone_brand.drop_duplicates('device_id', keep='first', inplace=True)

print("# Load Training Data")
train_data = pd.read_csv("../input/gender_age_train.csv", dtype={'device_id': str})

print("# Load Testing Data")
test_data = pd.read_csv("../input/gender_age_test.csv", dtype={'device_id': str})

# Stack train on top of test: the first `train_size` rows of full_data are
# the training rows, everything after them is the test set.
full_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)
train_size = len(train_data)
# A left merge already preserves the row order of the left frame, which is
# all the positional slicing by `train_size` needs. `left_index=True` is not
# passed: current pandas rejects it in combination with `on=` (MergeError).
full_data = pd.merge(full_data, phone_brand, how='left', on='device_id')

print ("# Data Loaded.")
full_data.info()

In [None]:
# Encode the "group" label of the training rows (the first train_size rows)
# as integer classes Y, plus a one-hot version Y_labels.
LBL = LabelEncoder()
Y = LBL.fit_transform(full_data['group'].iloc[:train_size])
# keep the class names now; they become the submission column headers
target_names = LBL.classes_
Y_labels = np_utils.to_categorical(Y)

print ("target group names:", target_names)
# device ids of the rows after the training rows, used to index submissions
device_id = full_data["device_id"].values[train_size:]

#### Part 2. Benchmark models: random forest and naive bayes

In [None]:
# one hot encoding of brand and model, stored as a scipy CSR matrix
full_ohe = pd.get_dummies(full_data[['phone_brand', 'device_model']], sparse=True)
full_ohe = sparse.csr_matrix(full_ohe)

# label encoding
# A fresh encoder is used per column so that LBL, which was fitted on the
# target groups, is not silently refitted on brand/model values.
full_le = pd.DataFrame()
full_le['phone_brand'] = LabelEncoder().fit_transform(full_data['phone_brand'])
full_le['device_model'] = LabelEncoder().fit_transform(full_data['device_model'])

# formatted explicitly so the line is valid (and prints the same text) on
# both Python 2 and Python 3
print("%s %s" % (full_ohe.shape, full_le.shape))

In [None]:
# Benchmark: random forest on the label-encoded brand/model columns.
# The empty param_grid means GridSearchCV is used purely as a 4-fold
# cross-validation harness scored by log loss; nothing is refitted.
model = grid_search.GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100),
    param_grid={},
    scoring='log_loss',
    cv=4,
    iid=True,
    refit=False,
    n_jobs=1,
    verbose=10,
).fit(full_le[:train_size], Y)
print ("grid scores:", model.grid_scores_)

In [None]:
# Benchmark: random forest on the one-hot encoded brand/model matrix.
# The empty param_grid means GridSearchCV is used purely as a 4-fold
# cross-validation harness scored by log loss; nothing is refitted.
model = grid_search.GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100),
    param_grid={},
    scoring='log_loss',
    cv=4,
    iid=True,
    refit=False,
    n_jobs=1,
    verbose=10,
).fit(full_ohe[:train_size], Y)
print ("grid scores:", model.grid_scores_)

In [None]:
# Benchmark: Gaussian naive Bayes on the label-encoded brand/model columns.
# The empty param_grid means GridSearchCV is used purely as a 4-fold
# cross-validation harness scored by log loss; nothing is refitted.
model = grid_search.GridSearchCV(
    estimator=GaussianNB(),
    param_grid={},
    scoring='log_loss',
    cv=4,
    iid=True,
    refit=False,
    n_jobs=1,
    verbose=10,
).fit(full_le[:train_size], Y)
print ("grid scores:", model.grid_scores_)

#### Part 3. Leave one out encoding

In [None]:
# define a function for leave one out encoding
def loo_encode(data,cat_col,target_col,train_size,random_rate=0.05):
    """Leave-one-out encode ``cat_col`` against ``target_col``.

    The first ``train_size`` rows of ``data`` are the training rows; the
    per-category statistics are computed from those rows only.

    * Training rows get the mean target of the *other* training rows in the
      same category, multiplied by a random factor drawn uniformly from
      ``[1 - random_rate, 1 + random_rate]``. A category seen only once in
      training is encoded as 0.
    * All remaining rows get the plain training mean of their category.
    * Rows whose category has no training statistics are encoded as 0.

    Parameters:
        data: DataFrame containing at least ``cat_col`` and ``target_col``.
            It is not modified.
        cat_col: name of the categorical column to encode.
        target_col: name of the numeric target column.
        train_size: number of leading rows that are training rows.
        random_rate: half-width of the multiplicative noise interval.

    Returns:
        1-D float numpy array with one encoded value per row of ``data``.
    """
    print ("leave one out encoding %s on %s" % (cat_col, target_col))
    aggr = (data[:train_size]
            .groupby(cat_col)[target_col]
            .agg(['mean', 'size', 'sum'])
            .reset_index())
    # how='left' keeps the row order of `data`, so positional slicing by
    # train_size is still valid on the merged frame
    merged = pd.merge(data, aggr, how='left', on=cat_col)

    # Work on a private float array instead of assigning through
    # merged['loo'][:train_size], which is a chained assignment and may
    # write to a temporary copy.
    loo = merged['mean'].values.astype(float)

    low, high = 1 - random_rate, 1 + random_rate
    train_rows = zip(merged['sum'].values[:train_size],
                     merged['size'].values[:train_size],
                     merged[target_col].values[:train_size])
    # float() on the divisor guards against floor division when both the
    # sum and the target are integers under Python 2
    loo[:train_size] = [
        0 if size <= 1
        else (total - target) / float(size - 1) * random.uniform(low, high)
        for total, size, target in train_rows
    ]

    loo[np.isnan(loo)] = 0
    return loo

In [None]:
# Binary gender target: 1 where gender is 'F', 0 for every other value
# (including rows where gender is missing).
full_data['gender'] = (full_data['gender'] == 'F').astype(np.int64)

# The categorical features to encode: brand, model, and their concatenation
# as a third, combined category.
cat_cols = ['phone_brand', 'device_model', 'brand_model']
full_data['brand_model'] = full_data['phone_brand'] + full_data['device_model']

In [None]:
# Build one leave-one-out feature per (categorical column, target) pair and
# remember the generated column names in loo_cols.
loo_cols = []
for c in cat_cols:
    for t in ('age', 'gender'):
        loo_col = '%s_%s_loo' % (c, t)
        full_data[loo_col] = loo_encode(full_data[[c, t]], c, t,
                                        train_size, random_rate=0.05)
        loo_cols.append(loo_col)

#### Part 4. Hierarchical data of multiple levels

In [None]:
# XGBoost baseline on the leave-one-out features: fit on 80% of the training
# rows and report the multi-class log loss on the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    full_data[loo_cols].values[:train_size], Y,
    random_state=1234, train_size=.80)

clf = xgb.XGBClassifier().fit(X_train, y_train)
pred_val = clf.predict_proba(X_val)
print ("mlogloss: %f" % log_loss(y_val, pred_val))

In [None]:
# load data: events, app_events, app_labels, and label_categories
# Each load is timed separately. `str` replaces the `np.str` alias, which was
# only ever a synonym for the builtin and no longer exists in current NumPy.

start = time.time()
events = pd.read_csv("../input/events.csv", dtype={'device_id': str})
print ("Events loaded in %f seconds" %(time.time() - start))

start = time.time()
app_ev = pd.read_csv("../input/app_events.csv", dtype={'device_id': str})
print ("App Events loaded in %f seconds" %(time.time() - start))

start = time.time()
app_lab = pd.read_csv("../input/app_labels.csv", dtype={'device_id': str})
print ("App Labels loaded in %f seconds" %(time.time() - start))

start = time.time()
lab_cat = pd.read_csv("../input/label_categories.csv", dtype={'device_id': str})
print ("Label Categories loaded in %f seconds" %(time.time() - start))


In [None]:
# aggregate apps labels and categories by device
# Chain of inner joins: device -> app (via events), app -> label,
# label -> category. Each step keeps only the distinct pairs.
# The timer is restarted here so the message below reports this cell's own
# duration instead of the time elapsed since the last CSV load.
start = time.time()
device_app = pd.merge(events[['device_id','event_id']], app_ev[['event_id','app_id']], 
                      on='event_id')[['device_id','app_id']].drop_duplicates()
device_label = pd.merge(device_app, app_lab, 
                        on='app_id')[['device_id','label_id']].drop_duplicates()
device_category = pd.merge(device_label, lab_cat, 
                           on='label_id')[['device_id','category']].drop_duplicates()
print ("device apps labels and categories aggregated in %f seconds" %(time.time() - start))


In [None]:
# collect the applications, labels, and categories of each device into one
# list per device (a Series indexed by device_id)
device_category = device_category.groupby("device_id")["category"].apply(list)
device_label = device_label.groupby("device_id")["label_id"].apply(list)
device_app = device_app.groupby("device_id")["app_id"].apply(list)
# the raw tables are no longer needed; drop the references to free memory
del app_ev,events, lab_cat, app_lab
# formatted explicitly so the line is valid (and prints the same text) on
# both Python 2 and Python 3
print("%s %s %s" % (device_category.shape, device_label.shape, device_app.shape))


In [None]:
# Look up each device's categories/labels/apps and join them into one
# space-separated string per device. A device without an entry maps to NaN,
# which the `x==x` test catches (NaN != NaN) and turns into ''.
#
# Spaces inside an individual category name are removed *before* joining so
# every category stays one token. Stripping spaces from the joined string
# afterwards would also delete the separators and fuse all of a device's
# categories into a single token.
full_data["category"] = full_data["device_id"].map(device_category).apply(
    lambda x:' '.join(str(c).replace(' ','') for c in x) if x==x else '') 
full_data["label"] = full_data["device_id"].map(device_label).apply(
    lambda x:' '.join(str(c) for c in x) if x==x else '') 
full_data["app"] = full_data["device_id"].map(device_app).apply(
    lambda x:' '.join(str(c) for c in x) if x==x else '') 

# make each device model a single token as well
full_data['device_model'] = full_data['device_model'].apply(lambda x:x.replace(' ','')) 

In [None]:
# count frequencies of each key word (brand, model, and app id), then convert the results to a sparse matrix
counter = CountVectorizer(min_df=1)
# One text document per row: brand, model and the app-id string joined by
# spaces. `str` replaces the `np.str` alias, which was only a synonym for the
# builtin and no longer exists in current NumPy.
matrix = full_data[["phone_brand", "device_model", "app"]].astype(str).apply(
    lambda x: " ".join(x), axis=1)
matrix = counter.fit_transform(matrix)
# vocabulary size; used later as the input width of the neural network
num_of_feature = matrix.shape[1]

#### Part 5. XGBoost

In [None]:
# XGB baseline on the count matrix (brand, model, and application tokens):
# fit on 80% of the training rows, report log loss on the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    matrix[:train_size], Y,
    random_state=1234, train_size=.80)

clf = xgb.XGBClassifier().fit(X_train, y_train)
pred_val = clf.predict_proba(X_val)
print ("mlogloss: %f" % log_loss(y_val, pred_val))

In [None]:
# Early stopping: allow up to 1000 boosting rounds, evaluate mlogloss on the
# validation split after each round, and stop once it has not improved for
# 20 consecutive rounds. The stopping round gives the number of iterations.
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=1000)
clf.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mlogloss',
    early_stopping_rounds=20,
)


In [None]:
# Iteration count and score are pinned by hand rather than read back from
# the fitted classifier (the lookups are kept below for reference):
# best_iteration = clf.best_iteration_
# best_score = clf.best_score_
best_iteration, best_score = 325, 2.29486

print (best_iteration, best_score)

In [None]:
# create submission: refit on all training rows with the chosen number of
# boosting rounds, predict class probabilities for the remaining rows, and
# write one row per device with one column per target group.
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=best_iteration)
clf.fit(matrix[:train_size], Y)
pred = clf.predict_proba(matrix[train_size:])

result = pd.DataFrame(pred, columns=target_names,
                      index=pd.Index(device_id, name="device_id"))
result.to_csv('brand_model_app_xgb.csv', index=True, index_label='device_id')

#### Part 6. Keras

In [None]:
# convert sparse matrix to dense (in batches)

# generator for training
def batch_generator(X, y, batch_size, shuffle):
    """Yield dense ``(X_batch, y_batch)`` mini-batches forever.

    Based on chenglong's code for fitting from a generator:
    https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices

    Parameters:
        X: sparse matrix supporting row indexing and ``.toarray()``.
        y: array of targets indexable by an integer index array.
        batch_size: number of rows per batch; the last batch of a pass may
            be smaller.
        shuffle: if true, the row order is shuffled before the first pass
            and again after every complete pass.

    Yields:
        Tuples ``(X_batch, y_batch)`` where ``X_batch`` is a dense array.
        After the last batch of a pass the generator starts over.
    """
    num_samples = X.shape[0]
    # Integer ceiling division. np.ceil(a / b) silently floors first when
    # `/` is integer division (Python 2), which drops the final partial
    # batch and yields 0 batches whenever batch_size > num_samples.
    number_of_batches = max(1, -(-num_samples // batch_size))
    sample_index = np.arange(num_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    counter = 0
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        # only the current batch is densified, keeping memory bounded
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if counter == number_of_batches:
            # a full pass is complete: optionally reshuffle and start over
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

# generator for predicting
def batch_generatorp(X, batch_size, shuffle):
    """Yield dense mini-batches of ``X`` in row order, forever.

    Parameters:
        X: sparse matrix supporting row indexing and ``.toarray()``.
        batch_size: number of rows per batch; the last batch of a pass may
            be smaller.
        shuffle: accepted for signature symmetry with ``batch_generator``
            but ignored; prediction batches are always yielded in row order.

    Yields:
        Dense arrays holding consecutive rows of ``X``. After the last batch
        of a pass the generator restarts from the first row.
    """
    num_samples = X.shape[0]
    # Number of batches per pass (integer ceiling division). The earlier
    # expression divided the sample count by the batch count, so the counter
    # never matched it and only empty batches followed the first pass.
    number_of_batches = max(1, -(-num_samples // batch_size))
    sample_index = np.arange(num_samples)
    counter = 0
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter == number_of_batches:
            counter = 0

In [None]:
# Separate the count matrix back into its training and test rows, then hold
# out 20% of the training rows (with their one-hot labels) for validation.
train, test = matrix[:train_size], matrix[train_size:]
X_train, X_val, y_train, y_val = train_test_split(
    train, Y_labels,
    random_state=1234, train_size=.80)

In [None]:
# create MLP model with Keras

def create_model(input_dim):
    """Build and compile the MLP classifier.

    Architecture: Dense(512, relu) -> Dropout(0.2) -> Dense(256, relu) ->
    Dropout(0.2) -> Dense(12, softmax), compiled with categorical
    cross-entropy loss and a Nadam optimizer (learning rate 1e-4).

    Parameters:
        input_dim: number of input features per sample.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        # input layer
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.2),
        # hidden layer
        Dense(256, activation='relu'),
        Dropout(0.2),
        # output layer: one probability per class
        Dense(12, activation='softmax'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(optimizer=Nadam(lr=1e-4),
                    loss='categorical_crossentropy')
    return network

# Train on the 80% split, monitoring loss on the held-out 20%. With
# patience=0 training stops at the first epoch whose val_loss does not
# improve on the previous best.
model = create_model(num_of_feature)
early_stop = EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
# samples_per_epoch is the size of the split the generator iterates over
# (X_train), not the full training set: with train_size every "epoch" ran
# past the end of X_train and wrapped into a second pass.
fit = model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                          nb_epoch=30,
                          samples_per_epoch=X_train.shape[0],
                          validation_data=(X_val.toarray(), y_val),
                          callbacks=[early_stop])

In [None]:
# create submission: train several independently initialised networks on the
# full training set and average their predicted class probabilities

n_models = 5
# Epoch count for the final fits: the epoch with the lowest validation loss
# in the early-stopped run above (`fit`). It is read before the loop and the
# loop does not rebind `fit`, so re-running this cell gives the same value.
best_epoch = int(np.argmin(fit.history['val_loss'])) + 1

# running sum of predictions, one row per test device, one column per group
preds = np.zeros((test.shape[0], len(target_names)))
for i in range(n_models):
    print ("Training model %d" % (i+1))
    model = create_model(num_of_feature)
    model.fit_generator(generator=batch_generator(train, Y_labels, 128, True),
                        nb_epoch=best_epoch,
                        samples_per_epoch=train.shape[0])

    preds = preds + model.predict_generator(generator=batch_generatorp(test, 128, False),
                                            val_samples=test.shape[0])

# average over exactly the number of models that were summed
preds = preds / n_models
# target_names holds the group names saved when the target was encoded
submission = pd.DataFrame(preds, columns=target_names)
submission["device_id"] = device_id
submission = submission.set_index("device_id")
submission.to_csv('brand_model_app_keras.csv', index=True, index_label='device_id')