# Setup

In [None]:
import matplotlib
matplotlib.use('Agg')

In [None]:
%matplotlib inline

In [None]:
# import tensorflow as tf
import matplotlib.pyplot as plt

# from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
# from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.utils import np_utils

import csv

# from copy import copy

In [None]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 30) # 27 columns of data in training set

In [None]:
def make_features(data):
    """Add parsed timestamp columns to ``data`` and return it.

    Three columns are written onto the frame that was passed in (no copy
    is made): ``datetime`` from ``date_time``, ``srch_ci_datetime`` from
    ``srch_ci`` and ``srch_co_datetime`` from ``srch_co``.  Each one is the
    result of ``pd.to_datetime`` applied to its source column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``date_time``, ``srch_ci`` and ``srch_co`` columns.

    Returns
    -------
    pandas.DataFrame
        The very same object as ``data``, now carrying the three new columns.
    """
    # (new column, source column) pairs, in the order they are written.
    parsed_columns = (
        ('datetime', 'date_time'),
        ('srch_ci_datetime', 'srch_ci'),
        ('srch_co_datetime', 'srch_co'),
    )
    for parsed_name, raw_name in parsed_columns:
        data[parsed_name] = pd.to_datetime(data[raw_name])

    # Not derived here: one-hot encodings and calendar parts
    # (year, month, day, hour, day of week, ...).
    return data

In [None]:
def make_PCA(X, n_comp):
    """Fit a PCA with ``n_comp`` components on ``X`` and return the estimator.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``PCA.fit``.
    n_comp : int
        Forwarded as ``n_components`` to the ``PCA`` constructor.

    Returns
    -------
    PCA
        The fitted estimator; call ``.transform`` on it to project data.
    """
    # ``fit`` hands back the estimator itself, so construct-and-fit chains.
    return PCA(n_components=n_comp).fit(X)

def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1,
                init_mode='glorot_uniform',
                batch_norm=True):
    """Assemble an uncompiled, fully connected softmax classifier.

    The network is one input block followed by ``layers`` further hidden
    blocks, i.e. ``layers + 1`` hidden blocks in total for a non-negative
    ``layers``.  Every block is
    Dense(hn) -> ReLU -> [BatchNormalization] -> Dropout(dp),
    and the stack ends with Dense(output_dim) -> softmax.

    Parameters
    ----------
    input_dim : int
        Width of the input, given to the first Dense layer only.
    output_dim : int
        Number of units in the final Dense layer.
    hn : int
        Units in every hidden Dense layer.
    dp : float
        Rate passed to every Dropout layer.
    layers : int
        Number of hidden blocks added *after* the input block.
    init_mode : str
        Value passed as ``init`` to every Dense layer.
    batch_norm : bool
        When true, a BatchNormalization layer follows each ReLU.

    Returns
    -------
    Sequential
        The assembled model (``compile`` has not been called).
    """
    # Only the very first Dense layer is told the input width.
    dense_options = [{'input_dim': input_dim}] + [{}] * layers

    net = Sequential()
    for extra in dense_options:
        net.add(Dense(hn, init=init_mode, **extra))
        net.add(Activation('relu'))
        if batch_norm:
            net.add(BatchNormalization())
        net.add(Dropout(dp))

    # Classification head.
    net.add(Dense(output_dim, init=init_mode))
    net.add(Activation('softmax'))
    return net


def save_model_weights(model, name):
    """Best-effort save of ``model``'s weights to the file ``name``.

    Calls ``model.save_weights(name, overwrite=True)``.  If that raises an
    ordinary exception, the failure (including its reason) is printed and
    execution continues.

    Parameters
    ----------
    model : object
        Anything exposing ``save_weights(name, overwrite=...)``.
    name : str
        Destination path handed to ``save_weights``.

    Returns
    -------
    None
    """
    try:
        model.save_weights(name, overwrite=True)
    except Exception as err:
        # Previously a bare ``except:``, which also trapped KeyboardInterrupt
        # and SystemExit and gave no hint about what went wrong.
        print("failed to save classifier weights: %r" % (err,))

def load_model_weights(model, name):
    """Best-effort load of weights from the file ``name`` into ``model``.

    Calls ``model.load_weights(name)``.  If that raises an ordinary
    exception (for example because the file does not exist yet), the
    failure and its reason are printed and the model is left as it was.

    Parameters
    ----------
    model : object
        Anything exposing ``load_weights(name)``.
    name : str
        Path handed to ``load_weights``.

    Returns
    -------
    None
    """
    try:
        model.load_weights(name)
    except Exception as err:
        # Previously a bare ``except:``, which also trapped KeyboardInterrupt
        # and SystemExit and gave no hint about what went wrong.
        print("Can't load weights! (%r)" % (err,))


def run_model(X, y, model, batch_size, nb_epoch, lr, load_name='expedia.h5', save_name='expedia.h5',
              validation_split=0.1):
    """Compile, optionally warm-start, train and checkpoint ``model``.

    Steps, in order:
      1. compile with categorical cross-entropy and ``Adam(lr=lr)``;
      2. try to load weights from ``load_name`` (best effort);
      3. fit on ``X`` / ``y``;
      4. try to save weights to ``save_name`` (best effort).

    Parameters
    ----------
    X, y
        Training inputs and targets, passed straight to ``model.fit``.
    model
        Model to train; it is compiled in place.
    batch_size : int
        Forwarded to ``model.fit``.
    nb_epoch : int
        Forwarded to ``model.fit``.
    lr : float
        Learning rate for the Adam optimizer.
    load_name : str
        Weights file to warm-start from.
    save_name : str
        Weights file written after training.
    validation_split : float
        Forwarded to ``model.fit``; defaults to the previously hard-coded 0.1.

    Returns
    -------
    The same ``model`` object that was passed in.
    """
    optimizer = Adam(lr=lr)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    # Weights are loaded after compile and before fit, so training resumes
    # from the stored state whenever the file can be read.
    load_model_weights(model, load_name)

    # NOTE(review): ``nb_epoch`` / ``show_accuracy`` are legacy Keras ``fit``
    # arguments -- confirm they match the installed Keras version.
    model.fit(X,
              y,
              nb_epoch=nb_epoch,
              batch_size=batch_size,
              validation_split=validation_split,
              show_accuracy=True,
              verbose=True)

    save_model_weights(model, save_name)
    return model

In [None]:
# When True, the feature matrix is projected with PCA before training.
use_PCA = False
# Toggle for writing the predictions file.
# NOTE(review): confirm the export cell actually checks this flag.
save_preds = True

# Import data

In [None]:
# All inputs are read from the relative ``data/`` directory.
# NOTE(review): sample_sub and destinations_data are not referenced again in
# the visible cells -- confirm whether they are still needed.
sample_sub = pd.read_csv('data/sample_submission.csv')
# Only the first 25,000 training rows are read (nrows).
train_data_raw = pd.read_csv('data/train.csv', nrows=25000) # 37,670,294 total lines
# test_data_raw = pd.read_csv('data/test.csv')   # 2,528,244 total lines
# Only the first 1,000 test rows are read (nrows); the line above reads the full file.
test_data_raw = pd.read_csv('data/test.csv', nrows=1000)   # 2,528,244 total lines
destinations_data = pd.read_csv('data/destinations.csv')

# Make features

In [None]:
# train_data = make_features(train_data_raw)

In [None]:
# Plain alias, not a copy: train_data and train_data_raw are the same DataFrame,
# so in-place changes to one are visible through the other.
train_data = train_data_raw
# train_data.drop(['date_time', 'srch_ci', 'srch_co', 'orig_destination_distance', 'is_booking', 'cnt'], axis=1)

In [None]:
# dummies = pd.get_dummies(train_data, columns=['site_name', 'posa_continent', 'user_location_country',
#        'user_location_region', 'user_location_city', 'user_id', 'is_mobile', 'is_package',
#        'channel','srch_destination_id', 'srch_destination_type_id',
#        'hotel_continent', 'hotel_country', 'hotel_market'])

In [None]:
# features = dummies.columns

In [None]:
# Columns fed to the network, in input order.
features = [
    'site_name',
    'posa_continent',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'user_id',
    'is_mobile',
    'is_package',
    'channel',
    'srch_adults_cnt',
    'srch_children_cnt',
    'srch_rm_cnt',
    'srch_destination_id',
    'srch_destination_type_id',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
]

# The test matrix uses the same columns; kept as its own list object.
test_features = list(features)

# Columns left out: 'srch_ci', 'srch_co', 'orig_destination_distance', 'is_booking', 'cnt'

In [None]:
# pd.isnull(test_data).sum()

In [None]:
# Label-based column selection. ``.loc`` replaces the ``.ix`` indexer, which
# is deprecated and absent from newer pandas releases; for these string
# column labels the two select the same data.
X_all = train_data.loc[:, features]
y_all = train_data.loc[:, 'hotel_cluster']
# X_test = test_data.loc[:, features]

In [None]:
# # 
# pd.tools.plotting.scatter_matrix(X_all, alpha=0.2)

In [None]:
# checking correlation of features
# Draws the pairwise correlation matrix of the input columns as an image,
# with a colour bar for the scale.
plt.matshow(X_all.corr())
plt.colorbar()

In [None]:
# Integer class ids: each hotel_cluster value is replaced by its category code.
y = y_all.astype('category').cat.codes

# ``.values`` yields the same ndarray as ``as_matrix()`` and is also available
# in pandas releases that no longer ship ``as_matrix``.
X = X_all.values
if use_PCA:
    # Fit the projection on the training matrix and keep ``pca`` around so the
    # same transform can be applied to other data later.
    pca = make_PCA(X, 15)
    X = pca.transform(X)

In [None]:
# 50/50 hold-out split with a fixed seed.
# NOTE(review): this splits the raw X_all / y_all rather than the prepared
# X / y, and none of the four outputs is referenced again in the visible
# cells -- confirm whether this split is still needed.
X_train, X_cv, y_train, y_cv = train_test_split(X_all, y_all, train_size=.5, random_state=1)

# NN Model

In [None]:
# One-hot encode the class codes, one column per distinct class.
# ``.values`` is used instead of ``as_matrix()``, which newer pandas removed.
y_OH = np_utils.to_categorical(y.values, y.nunique())

In [None]:
# Network sizes are taken from the prepared arrays: number of feature columns
# in X and number of one-hot columns in y_OH.
input_dim = X.shape[1]
output_dim = y_OH.shape[1]

In [None]:
# 512 units per hidden layer, dropout 0.5, Glorot-normal initialisation.
# ``layers=3`` adds three hidden blocks on top of the input block that
# build_model always creates, i.e. four hidden blocks in total.
model = build_model(input_dim, output_dim, hn=512, dp=0.5, layers=3, init_mode='glorot_normal')

In [None]:
# Positional arguments: batch_size=128, nb_epoch=50, lr=1e-3.
# run_model first tries to load weights from load_name and saves to save_name
# after fitting; both point at the same file here, so re-running this cell
# continues from the previously saved weights when that file is readable.
model = run_model(X, y_OH, model, 128, 50, 1e-3, load_name='expedia-512x3.h5', save_name='expedia-512x3.h5')

In [None]:
# Build the test matrix straight from the loaded test frame; the previous
# version referenced an ``X_test`` that is never defined, so this cell raised
# NameError on a fresh kernel.
X_final_test = test_data_raw[test_features].values
if use_PCA:
    # With use_PCA on, the network was trained on PCA-projected inputs, so the
    # test matrix must go through the same fitted projection.
    X_final_test = pca.transform(X_final_test)
pred = model.predict_proba(X_final_test, batch_size=64, verbose=1)

# Distinct hotel_cluster values seen in training, taken from the one-hot column
# headers; used to translate prediction column positions back to cluster ids.
labels = list(pd.get_dummies(train_data['hotel_cluster']).columns)

In [None]:
# Take the five most probable classes per row, most probable first.
# argsort is ascending, so the last five columns are the top five and are then
# reversed.  Column positions are mapped back to real hotel_cluster ids through
# ``labels``; position and id only coincide when every cluster id 0..N-1 is
# present in the training sample.
top5_positions = np.argsort(pred, axis=1)[:, -5:][:, ::-1]
top_pred_hotel_cluster = [' '.join(str(labels[pos]) for pos in row) for row in top5_positions]

In [None]:
# One row per test example, holding the space-separated predictions in a single
# 'hotel_cluster' column; the default integer index is kept.
results = pd.DataFrame(top_pred_hotel_cluster, columns=['hotel_cluster'])

In [None]:
# Honour the notebook-level ``save_preds`` switch, which was previously
# declared but never consulted.  The frame's index is written out as the
# 'id' column.
if save_preds:
    results.to_csv('expedia-nn-2016-04-16-s3.csv', index_label='id')