#### Week 3 - Model Tuning

This week we are going to learn how to use Keras to create Multilayer Perceptron (MLP) models and how to tune them.

We will use the datasets we created last week which, however, may need to be massaged a bit to work for Keras.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import time
import random
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD,Nadam
from keras.layers.advanced_activations import SReLU
from keras.layers.core import Activation
from keras.utils import np_utils



In [None]:
# Load Data
# Reads the phone brand table and the train/test device tables, stacks train on
# top of test, and attaches brand/model information to every device row.

print("# Load Phone Brand")
# device_id is forced to string dtype. The builtin `str` is used instead of the
# `np.str` alias, which no longer exists in current NumPy releases.
phone_brand = pd.read_csv("../input/phone_brand_device_model.csv",
                          dtype={'device_id': str})
# Keep one brand/model row per device so the left merge below cannot duplicate rows.
phone_brand = phone_brand.drop_duplicates('device_id', keep='first')


print("# Load Train and Test")
train_data = pd.read_csv("../input/gender_age_train.csv",
                         dtype={'device_id': str})

test_data = pd.read_csv("../input/gender_age_test.csv",
                        dtype={'device_id': str})


# The first `train_size` rows of full_data are the training rows, the rest are test rows.
full_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)
train_size = len(train_data)
# Merge on the key column only. Combining `on=` with `left_index=True` raises a
# MergeError on current pandas, and on older versions it replaced the clean
# 0..n-1 index with index values taken from phone_brand.
full_data = pd.merge(full_data, phone_brand, how='left', on='device_id')

print ("Data Loaded.")

In [None]:
# Assign every column of full_data a role: id, target, categorical or numeric.
data_types = full_data.dtypes

# ID
id_col = 'device_id'
# Target
target_col = 'group'

# Categorical columns: every object-typed column except the target, the gender
# column and the id.
cat_cols = list(data_types[data_types == 'object'].index)
for excluded in ('group', 'gender', 'device_id'):
    cat_cols.remove(excluded)

# Numeric columns: int64 columns first, then float64 columns, without 'age'.
num_cols = [name
            for kind in ('int64', 'float64')
            for name in data_types[data_types == kind].index]
num_cols.remove('age')


for caption, value in (("ID column:", id_col),
                       ("Target column:", target_col),
                       ("Categorical column:", cat_cols),
                       ("Numeric column:", num_cols)):
    print (caption, value)

In [None]:
# Encode the target of the training rows as integer class ids, then one-hot
# encode those ids for the categorical cross-entropy loss used later.
LBL = preprocessing.LabelEncoder()

train_targets = full_data[target_col].iloc[:train_size]
Y = LBL.fit_transform(train_targets)
Y_labels = np_utils.to_categorical(Y)

target_names = LBL.classes_
print ("target group names:", target_names)

# Binary gender flag: 1 for 'F', 0 for every other value (including missing).
full_data['gender'] = full_data['gender'].map(lambda g: int(g == 'F'))

# Device ids of the rows after the training rows, kept for the submission file.
device_id = full_data["device_id"].iloc[train_size:].values


**Aggregate transactional data onto higher granularity**

We will load events, app_events, app_labels and label_categories separately, then aggregate them by device.

In [None]:
# Load the four event/app tables, timing each read.
# device_id is forced to string dtype wherever the column exists. The builtin
# `str` replaces the `np.str` alias, which has been removed from current NumPy.
start = time.time()
app_ev = pd.read_csv("../input/app_events.csv", dtype={'device_id': str})
print ("App Events loaded in %f seconds" %(time.time() - start))

start = time.time()
events = pd.read_csv("../input/events.csv", dtype={'device_id': str})
print ("Events loaded in %f seconds" %(time.time() - start))

start = time.time()
app_lab = pd.read_csv("../input/app_labels.csv", dtype={'device_id': str})
print ("App Labels loaded in %f seconds" %(time.time() - start))

start = time.time()
lab_cat = pd.read_csv("../input/label_categories.csv", dtype={'device_id': str})
print ("Label Categories loaded in %f seconds" %(time.time() - start))


**Concatenate applications, labels and label categories to a big text column for each device**

In [None]:
# Build three de-duplicated link tables keyed by device:
#   device -> app, device -> label, device -> category.
# Restart the timer here; otherwise the elapsed time printed below would be
# measured from the start of the previous cell's last CSV read.
start = time.time()

# events links device_id to event_id; app_events links event_id to app_id.
device_events = events[['device_id', 'event_id']]
event_apps = app_ev[['event_id', 'app_id']]
device_app = pd.merge(device_events, event_apps,
                      on='event_id')[['device_id', 'app_id']].drop_duplicates()

# app_labels links app_id to label_id.
device_label = pd.merge(device_app, app_lab,
                        on='app_id')[['device_id', 'label_id']].drop_duplicates()

# label_categories links label_id to category.
device_category = pd.merge(device_label, lab_cat,
                           on='label_id')[['device_id', 'category']].drop_duplicates()
print ("device apps labels and categories aggregated in %f seconds" %(time.time() - start))


**Group categories/labels/apps by device id and merge them into one big list**

In [None]:
# Collapse each link table into one Series indexed by device_id whose values are
# the list of categories / label ids / app ids seen for that device.
device_category = device_category.groupby("device_id")["category"].apply(list)
device_label = device_label.groupby("device_id")["label_id"].apply(list)
device_app = device_app.groupby("device_id")["app_id"].apply(list)
# The raw tables are no longer needed; release their memory.
del app_ev,events, lab_cat, app_lab
# Formatted through a single string so the call parses on Python 3 and prints the
# same text on Python 2 and 3 (the original used a Python 2 print statement).
print ("%s %s %s" % (device_category.shape, device_label.shape, device_app.shape))


In [None]:
# Turn the per-device lists into space-separated text columns on full_data.
# A device_id that is absent from the grouped Series maps to NaN; `x == x` is
# False only for NaN, so those rows get an empty string.

# Spaces inside each category name are removed BEFORE joining, so every category
# stays a separate token. Stripping spaces after the join (as the original did)
# also removed the separators and fused all categories into one long token.
full_data["category"] = full_data["device_id"].map(device_category).apply(
    lambda x: ' '.join(str(c).replace(' ', '') for c in x) if x == x else '')
full_data["label"] = full_data["device_id"].map(device_label).apply(
    lambda x: ' '.join(str(c) for c in x) if x == x else '')
full_data["app"] = full_data["device_id"].map(device_app).apply(
    lambda x: ' '.join(str(c) for c in x) if x == x else '')

# Remove spaces from the device model so each model is a single token.
full_data['device_model'] = full_data['device_model'].apply(lambda x: x.replace(' ', ''))

**Count the frequencies of each keyword (brand, model, app id and label id), then convert the results to a sparse matrix**

In [None]:
# Build one text document per device from brand, model, app ids and label ids,
# then count token occurrences into a sparse document-term matrix.
counter = CountVectorizer(min_df=1)
# `astype(str)` uses the builtin; the `np.str` alias has been removed from
# current NumPy releases.
documents = full_data[["phone_brand", "device_model", "app", "label"]].astype(str).apply(
    lambda row: " ".join(row), axis=1)
matrix = counter.fit_transform(documents)
# Number of distinct tokens = input dimension of the network.
num_of_feature = matrix.shape[1]

#### Create MLP model with Keras

To create a data model we need to define:

1. Input Layer
    * input_dim = number of features
    * activation = relu
2. Hidden Layers  
    * number of units: let's get started with 512 and tune it later. Typically more units tend to give better performance and take longer time to train. However, the performance/ # of units curve is like a "U": at one point more units wouldn't gain more performance
    * activation = relu
3. Output Layer
    * number of units = number of classes, i.e. 12
    * activation = softmax, which is used for multi-class classification
4. Optimizer 
    * We will use Nadam to start - you can also try SGD or others.


In [None]:
def create_model(input_dim, hidden_units=128, num_classes=12, learning_rate=1e-4,
                 input_units=512):
    """Build and compile a fully connected softmax classifier.

    The defaults reproduce the original hard-coded network
    (512 -> 128 -> 12, Nadam with lr=1e-4), so ``create_model(input_dim)``
    behaves as before; the extra parameters make the sizes tunable without
    redefining the function.

    Args:
        input_dim: number of input features.
        hidden_units: units in the hidden ReLU layer.
        num_classes: units in the softmax output layer.
        learning_rate: learning rate passed to the Nadam optimizer.
        input_units: units in the first ReLU layer (the one receiving the input).

    Returns:
        A compiled Sequential model using categorical cross-entropy loss.
    """
    model = Sequential()
    # Input layer
    model.add(Dense(input_units,
                    input_dim=input_dim,
                    activation='relu'))

    # Hidden layer
    model.add(Dense(hidden_units, activation='relu'))

    # Output layer: one unit per class
    model.add(Dense(num_classes, activation='softmax'))

    # Optimizer
    nadam = Nadam(lr=learning_rate)

    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=nadam)
    return model


**Convert Sparse Matrix to Dense, in batches**

In [None]:
# generator for training
def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense (X_batch, y_batch) pairs from a sparse matrix.

    Based on chenglong's code for fitting from a generator
    (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)

    Args:
        X: matrix supporting row indexing with an index array and ``.toarray()``
            (e.g. a scipy CSR matrix).
        y: array indexable by an index array; row i belongs to row i of X.
        batch_size: maximum number of rows per batch.
        shuffle: if true, rows are shuffled before the first pass and again
            after every full pass.

    Yields:
        (X_batch, y_batch), where X_batch is a dense array. The last batch of
        each pass holds the remaining rows and may be smaller than batch_size.
    """
    n_samples = X.shape[0]
    # Integer ceiling division. The original `np.ceil(n / batch_size)` floors
    # under Python 2 integer division, which dropped the final partial batch,
    # and produced a float that the integer counter was compared against.
    number_of_batches = -(-n_samples // batch_size)
    counter = 0
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        # End of a pass over the data: reshuffle if requested and start over.
        if counter >= number_of_batches:
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

# generator for predicting
def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense feature batches from a sparse matrix, in row order.

    Args:
        X: matrix supporting row indexing with an index array and ``.toarray()``
            (e.g. a scipy CSR matrix).
        batch_size: maximum number of rows per batch.
        shuffle: accepted for signature compatibility with the training
            generator; it is not used, rows are always yielded in order.

    Yields:
        Dense arrays of consecutive rows. After the last (possibly smaller)
        batch the generator starts again from the first row.
    """
    n_samples = X.shape[0]
    # Number of batches in one pass (integer ceiling division). The original
    # computed n / ceil(n / batch_size), which is roughly the batch size, so the
    # counter never reset and empty batches were yielded after the first pass.
    number_of_batches = -(-n_samples // batch_size)
    counter = 0
    sample_index = np.arange(n_samples)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter >= number_of_batches:
            counter = 0

**Split data for validation**

In [None]:
# The first `train_size` rows of the feature matrix are used as training data,
# the remaining rows as test data.
train, test = matrix[:train_size, :], matrix[train_size:, :]
num_class = 12
# Hold out 20% of the training rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    Y_labels,
    train_size=.80,
    random_state=1234,
)

**Train and validate the model we defined using early stopping**

In [None]:
# Early stopping on the validation loss. With patience=0 training is expected to
# stop at the first epoch whose validation loss does not improve.
early_stop = EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')

model = create_model(num_of_feature)
# samples_per_epoch must equal the number of rows the generator iterates over.
# That is X_train (the 80% split), not `train_size` (all training rows), which
# made every "epoch" wrap around and see part of the data twice.
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=30,
                         samples_per_epoch=X_train.shape[0],
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop]
                         )

**Add dropout**
* Starting with drop rate 0.2, but you can try different rates to see which is the best one

In [None]:
def create_model(input_dim, hidden_units=128, dropout_rate=0.2, num_classes=12,
                 learning_rate=1e-4, input_units=512):
    """Build and compile a fully connected softmax classifier with dropout.

    The defaults reproduce the original hard-coded network
    (512 -> dropout 0.2 -> 128 -> dropout 0.2 -> 12, Nadam with lr=1e-4), so
    ``create_model(input_dim)`` behaves as before; the extra parameters allow
    trying other dropout rates and layer sizes without redefining the function.

    Args:
        input_dim: number of input features.
        hidden_units: units in the hidden ReLU layer.
        dropout_rate: drop rate applied after each ReLU layer.
        num_classes: units in the softmax output layer.
        learning_rate: learning rate passed to the Nadam optimizer.
        input_units: units in the first ReLU layer (the one receiving the input).

    Returns:
        A compiled Sequential model using categorical cross-entropy loss.
    """
    model = Sequential()
    # Input layer
    model.add(Dense(input_units,
                    input_dim=input_dim,
                    activation='relu'))
    model.add(Dropout(dropout_rate))

    # Hidden layer
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_rate))

    # Output layer: one unit per class
    model.add(Dense(num_classes, activation='softmax'))

    # Optimizer
    nadam = Nadam(lr=learning_rate)

    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=nadam)
    return model

# Train the dropout model on the 80% split and validate on the held-out 20%.
model = create_model(num_of_feature)
# samples_per_epoch must equal the number of rows the generator iterates over.
# That is X_train, not `train_size` (all training rows), which made every
# "epoch" wrap around and see part of the data twice.
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=30,
                         samples_per_epoch=X_train.shape[0],
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop]
                         )

**Tune number of units of hidden layer**
* Let's try 256 (1/2 of input units), 341 (2/3 of input units) and 512 (same as input units)

In [None]:
def create_model(input_dim, hidden_units=256, dropout_rate=0.2, num_classes=12,
                 learning_rate=1e-4, input_units=512):
    """Build and compile a fully connected softmax classifier with dropout.

    The defaults reproduce the original hard-coded network
    (512 -> dropout 0.2 -> 256 -> dropout 0.2 -> 12, Nadam with lr=1e-4), so
    ``create_model(input_dim)`` behaves as before. Pass ``hidden_units`` to try
    other hidden layer sizes without redefining the function.

    Args:
        input_dim: number of input features.
        hidden_units: units in the hidden ReLU layer.
        dropout_rate: drop rate applied after each ReLU layer.
        num_classes: units in the softmax output layer.
        learning_rate: learning rate passed to the Nadam optimizer.
        input_units: units in the first ReLU layer (the one receiving the input).

    Returns:
        A compiled Sequential model using categorical cross-entropy loss.
    """
    model = Sequential()
    # Input layer
    model.add(Dense(input_units,
                    input_dim=input_dim,
                    activation='relu'))
    model.add(Dropout(dropout_rate))

    # Hidden layer
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_rate))

    # Output layer: one unit per class
    model.add(Dense(num_classes, activation='softmax'))

    # Optimizer
    nadam = Nadam(lr=learning_rate)

    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=nadam)
    return model

# Train the model with the tuned hidden layer on the 80% split and validate on
# the held-out 20%.
model = create_model(num_of_feature)
# samples_per_epoch must equal the number of rows the generator iterates over.
# That is X_train, not `train_size` (all training rows), which made every
# "epoch" wrap around and see part of the data twice.
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=30,
                         samples_per_epoch=X_train.shape[0],
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop]
                         )

**Use the tuned model to create submission**

We will train 5 models using the same settings and create a submission by simply averaging the predictions. This will give us better results.

In [None]:
# Train several identically configured models on ALL training rows and average
# their predicted class probabilities into one submission.
n_models = 5

# Number of epochs to train each final model. The original cell left this as a
# placeholder. Here it is taken from the most recent validation run (`fit`) as
# the epoch with the lowest validation loss; replace it with a fixed number to
# override.
best_epoch = int(np.argmin(fit.history['val_loss'])) + 1

# Running sum of predictions: one row per test device, one column per class.
preds = np.zeros((test.shape[0], len(LBL.classes_)))

for i in range(n_models):
    print ("Training model %d" % (i+1))
    model=create_model(num_of_feature)
    # Stored under a separate name so `fit` keeps the validation history and
    # this cell can be re-run.
    final_fit = model.fit_generator(generator=batch_generator(train, Y_labels, 128, True),
                                    nb_epoch=best_epoch,
                                    samples_per_epoch=train.shape[0]
                                    )
    preds=preds+model.predict_generator(generator=batch_generatorp(test, 128, False), val_samples=test.shape[0])

# Average over the number of models actually trained (the original divided by 60).
preds = preds/n_models
# Columns are named after the encoder's classes, in the encoder's order.
submission = pd.DataFrame(preds, columns=LBL.classes_)
submission["device_id"] = device_id
submission = submission.set_index("device_id")
submission.to_csv('submission.csv', index=True, index_label='device_id')

#### Homework
1. Try different activations and optimizers and see the differences in performance.
2. Change the learning rate to see how it affects the training time and accuracy.
3. Change the batch size to see how it affects the training time and accuracy.
4. If possible, try two hidden layers.
