# Setup

In [1]:
# Select the non-interactive Agg backend before pyplot is imported (pyplot is
# imported in a later cell).
# NOTE(review): the next cell runs `%matplotlib inline`, which presumably
# selects a different backend again — confirm this call is still wanted.
import matplotlib
matplotlib.use('Agg')

In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 30) # 27 columns of data in training set

# import tensorflow as tf
import matplotlib.pyplot as plt

# from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
# from sklearn import preprocessing

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# from sklearn.grid_search import GridSearchCV

In [18]:
def make_features(data):
    '''
    Build the modelling frame by expanding the timestamp columns.

    Each of the 'date_time', 'srch_ci' and 'srch_co' columns is broken into
    year, month, day, hour, minute, day-of-year and day-of-week columns
    (see extract_datetimes); the three original columns are then dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three timestamp columns.

    Returns
    -------
    pandas.DataFrame
        A new frame. `data` itself is left unmodified, so the function can be
        re-run on the same raw frame and always gives the same result.
    '''
    datetime_fields = ['date_time', 'srch_ci', 'srch_co']

    # extract_datetimes writes into the frame it receives, so work on a private
    # copy; otherwise the caller's frame would silently gain 21 columns and
    # have its timestamp columns converted to datetime64.
    expanded = data.copy()
    for field in datetime_fields:
        extract_datetimes(expanded, field)

    # No imputation is performed here: missing values (including the NaN
    # components of unparseable timestamps) are passed through unchanged.
    return expanded.drop(datetime_fields, axis=1)

def extract_datetimes(data, field):
    '''
    Expand one timestamp column of `data` into calendar components, in place.

    `data[field]` is parsed with pandas.to_datetime (values that cannot be
    parsed become NaT) and written back, then seven columns named
    '<field>_year', '<field>_month', '<field>_day', '<field>_hour',
    '<field>_minute', '<field>_dayofyear' and '<field>_dayofweek' are added.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify.
    field : str
        Name of the column holding the timestamps.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    parsed = pd.to_datetime(data[field], errors='coerce')
    data[field] = parsed
    for part in ('year', 'month', 'day', 'hour', 'minute', 'dayofyear', 'dayofweek'):
        data[field + '_' + part] = getattr(parsed.dt, part)

def make_PCA(X, n_comp):
    '''
    Fit a PCA with `n_comp` components on `X` and return the fitted estimator.
    '''
    return PCA(n_components=n_comp).fit(X)

In [11]:
# Leftover interactive help lookup (`?` displays the object's docstring);
# it is not needed by any later cell.
pd.DataFrame?

In [5]:
# Run-time switches for the notebook.
use_PCA = False  # when True, X is projected onto PCA components before modelling
save_preds = True  # presumably controls whether prediction CSVs are written — confirm

# Import data

In [19]:
# Rows to read from each of the two large CSVs. Set to None to read the whole
# file (train.csv has 37,670,294 lines, test.csv has 2,528,244).
N_ROWS = 10000

sample_sub = pd.read_csv('data/sample_submission.csv')
train_data = make_features(pd.read_csv('data/train.csv', nrows=N_ROWS))
test_data = make_features(pd.read_csv('data/test.csv', nrows=N_ROWS))
destinations_data = pd.read_csv('data/destinations.csv')

# Make features

In [21]:
# Summarise the engineered test frame: dtype and non-null count per column.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 40 columns):
id                           10000 non-null int64
site_name                    10000 non-null int64
posa_continent               10000 non-null int64
user_location_country        10000 non-null int64
user_location_region         10000 non-null int64
user_location_city           10000 non-null int64
orig_destination_distance    5893 non-null float64
user_id                      10000 non-null int64
is_mobile                    10000 non-null int64
is_package                   10000 non-null int64
channel                      10000 non-null int64
srch_adults_cnt              10000 non-null int64
srch_children_cnt            10000 non-null int64
srch_rm_cnt                  10000 non-null int64
srch_destination_id          10000 non-null int64
srch_destination_type_id     10000 non-null int64
hotel_continent              10000 non-null int64
hotel_country                10000 non-nu

In [None]:
# train_data_raw['orig_destination_distance'][pd.isnull(train_data_raw['orig_destination_distance'])] = train_data_raw['orig_destination_distance'].mean()

In [None]:
# Destination id searched for in the first training row (index label 0).
train_data.loc[0, 'srch_destination_id']

In [None]:
# Show the destinations.csv row(s) for the destination searched in training row 3.
dest_id = train_data['srch_destination_id'][3]
destinations_data[destinations_data['srch_destination_id'] == dest_id]

In [None]:
# dummies = pd.get_dummies(train_data, columns=['site_name', 'posa_continent', 'user_location_country',
#        'user_location_region', 'user_location_city', 'user_id', 'is_mobile', 'is_package',
#        'channel','srch_destination_id', 'srch_destination_type_id',
#        'hotel_continent', 'hotel_country', 'hotel_market'])

In [None]:
# Columns fed to the models.
#
# 'hotel_cluster' is the prediction target and therefore must NOT be listed
# here: including it leaks the label into the training matrix, and the column
# does not exist in the test set at all.
base_features = [
    'site_name', 'posa_continent', 'user_location_country',
    'user_location_region', 'user_location_city',
    'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
    'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
    'srch_destination_id', 'srch_destination_type_id',
    'hotel_continent', 'hotel_country', 'hotel_market',
]

# Calendar components produced by make_features for each timestamp column.
datetime_features = [
    stamp + '_' + part
    for stamp in ('date_time', 'srch_ci', 'srch_co')
    for part in ('year', 'month', 'day', 'hour', 'minute', 'dayofyear', 'dayofweek')
]

features = base_features + datetime_features

# Reduced column set: no distance and no date-derived columns.
test_features = [
    'site_name', 'posa_continent', 'user_location_country',
    'user_location_region', 'user_location_city',
    'user_id', 'is_mobile', 'is_package',
    'channel', 'srch_adults_cnt', 'srch_children_cnt',
    'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
    'hotel_continent', 'hotel_country', 'hotel_market',
]

# 'srch_ci', 'srch_co', 'orig_destination_distance', 'is_booking', 'cnt',

In [None]:
# pd.isnull(test_data).sum()

In [None]:
# Training feature matrix, training target, and the test-set feature matrix.
# `.loc` is used for label-based selection; the `.ix` indexer it replaces is
# deprecated and no longer exists in pandas 1.0+.
X_all = train_data.loc[:, features]
y_all = train_data.loc[:, 'hotel_cluster']
X_test = test_data.loc[:, features]

In [None]:
# # 
# pd.tools.plotting.scatter_matrix(X_all, alpha=0.2)

In [None]:
# Heat-map of the pairwise feature correlations, with a colour scale.
feature_corr = X_all.corr()
plt.matshow(feature_corr)
plt.colorbar()

In [None]:
# Encode the cluster labels as integer category codes.
y = y_all.astype('category').cat.codes

# Plain ndarray of the features. `.values` replaces `as_matrix()`, which is
# deprecated and no longer exists in pandas 1.0+.
X = X_all.values
if use_PCA:
    # Optionally project the features onto 15 principal components.
    pca = make_PCA(X, 15)
    X = pca.transform(X)

In [None]:
# Use every training row for fitting (no hold-out).
# NOTE(review): the next cell rebinds X_train / y_train to a split of the same
# data and is the only place X_cv / y_cv are defined, so this cell only has an
# effect if that split cell is skipped.
X_train = X_all
y_train = y_all

In [None]:
# Split the rows into a training part (70%) and a validation part; the fixed
# seed makes the split repeatable.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_all,
    y_all,
    train_size=.7,
    random_state=1
)

# Random Forest Model

In [None]:
# Random forest of 50 entropy-split trees, each capped at 12 leaves.
# random_state pins the forest's randomness so that scores and predictions are
# reproducible from one run to the next (same seed as the train/CV split).
expedia_rfc = RandomForestClassifier(n_estimators=50,
                                     max_leaf_nodes=12,
                                     criterion='entropy',
                                     random_state=1)

In [None]:
# Fit the forest on the training split and report how long the fit took.
%time expedia_rfc = expedia_rfc.fit(X_train, y_train)

In [None]:
%%time

# Score the fitted forest on the rows it was trained on and on the held-out rows.
score_train = expedia_rfc.score(X_train, y_train)
score_cv = expedia_rfc.score(X_cv, y_cv)

# Scores noted from earlier runs, by test/train split:
# 20/80 split Training Score: 0.944199898638 , CV Score: 0.217073344343
# 50/50 split Training Score: 0.894782517584 , CV Score: 0.242728773988
# 80/20 split Training Score: 0.943824063687 , CV Score: 0.219235806617
print ('Training Score:', score_train, ', CV Score:', score_cv) 

In [None]:
# Pair every feature name with its importance and list them, most important first.
feature_importance = zip(features, expedia_rfc.feature_importances_)
ranked = sorted(feature_importance, key=lambda pair: pair[1], reverse=True)
for name_and_score in ranked:
    print(name_and_score)

In [None]:
# Per-class probabilities for every test row (one column per entry of
# expedia_rfc.classes_, as used when the submission frame is built).
prob_prediction = expedia_rfc.predict_proba(X_test)

In [None]:
# Wrap the probability matrix in a frame: rows follow the test frame's index,
# columns are the class labels known to the forest.
submission = pd.DataFrame(
    data=prob_prediction,
    index=X_test.index,
    columns=expedia_rfc.classes_,
)

In [None]:
# Write the random-forest probabilities to disk, but only when the notebook's
# save_preds switch is on (the switch was previously defined but never checked).
if save_preds:
    submission.to_csv('expedia-rf-2016-04-18-s1.csv', index_label='Id')

In [None]:
# One-hot encode the integer class codes, one column per distinct class.
# `.values` replaces `as_matrix()`, which no longer exists in pandas 1.0+.
# NOTE(review): `np_utils` is not imported in any cell visible in this
# notebook — it must be brought into scope before this cell can run.
y_OH = np_utils.to_categorical(y.values, y.nunique())

In [None]:
# Network sizes derived from the data: one input per feature column of X and
# one output per one-hot column of y_OH.
input_dim = X.shape[1]
output_dim = y_OH.shape[1]

In [None]:
# NOTE(review): `build_model` is not defined in any cell visible in this
# notebook; the arguments presumably mean 3 hidden layers of 512 units with
# dropout 0.5 and Glorot-normal initialisation — confirm against its definition.
model = build_model(input_dim, output_dim, hn=512, dp=0.5, layers=3, init_mode='glorot_normal')

In [None]:
# NOTE(review): `run_model` is not defined in any cell visible in this
# notebook; the positional values 128, 50 and 1e-3 are presumably batch size,
# epoch count and learning rate — confirm against its definition. The same
# 'expedia-512x3.h5' file is passed as both load_name and save_name.
model = run_model(X, y_OH, model, 128, 50, 1e-3, load_name='expedia-512x3.h5', save_name='expedia-512x3.h5')

In [None]:
# Test matrix for the network. `.values` replaces `as_matrix()`, which no
# longer exists in pandas 1.0+.
# NOTE(review): this selects the `test_features` columns, whereas input_dim
# was taken from X, which is built from `features` — confirm that the column
# count matches what the model expects. If use_PCA is on, the same
# pca.transform would also have to be applied here.
X_final_test = X_test[test_features].values
#     X_final_test = pca.transform(X_final_test)
pred = model.predict_proba(X_final_test, batch_size=64, verbose=1)

# Distinct hotel_cluster values seen in the training data, in column order of
# the dummy encoding.
labels = list(pd.get_dummies(train_data['hotel_cluster']).columns)

In [None]:
# Take the five most probable classes per row, most probable FIRST.
# np.argsort sorts ascending, so the columns are reversed before slicing; the
# previous `[:, -5:]` listed the top five from least to most probable.
top5_codes = np.argsort(pred, axis=1)[:, ::-1][:, :5]

# Column positions of `pred` are class codes, not cluster ids; translate them
# through `labels` so the output is correct even when some clusters are absent
# from the training sample.
# NOTE(review): assumes column i of `pred` corresponds to labels[i] — confirm
# against how the model's targets were encoded.
cluster_ids = np.asarray(labels)
top_pred_hotel_cluster = [' '.join(str(hotel) for hotel in row)
                          for row in cluster_ids[top5_codes]]

In [None]:
# One-column frame holding the space-separated predictions for each test row.
results = pd.DataFrame({'hotel_cluster': top_pred_hotel_cluster})

In [None]:
# Write the network's predictions to disk, but only when the notebook's
# save_preds switch is on (the switch was previously defined but never checked).
if save_preds:
    results.to_csv('expedia-nn-2016-04-16-s3.csv', index_label='id')