# Setup

In [1]:
import matplotlib
matplotlib.use('Agg')

In [2]:
%matplotlib notebook

In [3]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 30) # 27 columns of data in training set

# import tensorflow as tf
import matplotlib.pyplot as plt

# from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
# from sklearn import preprocessing

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# from sklearn.grid_search import GridSearchCV

In [4]:
def make_features(data):    
    '''
    Extract date-time features from dataframe 'data'.
    Converts date_time, srch_ci, and srch_co fields into
    components (year, month, day, etc) and drops the 
    original field.
    '''
    extract_datetimes(data, 'date_time')
    extract_datetimes(data, 'srch_ci')
    extract_datetimes(data, 'srch_co')
    
    data = data.drop(['date_time', 'srch_ci', 'srch_co'], axis=1)
    
    has_null = ['orig_destination_distance', 'srch_ci_year', 'srch_ci_month', 
                'srch_ci_day', 'srch_ci_hour', 'srch_ci_minute', 
                'srch_ci_dayofyear', 'srch_ci_dayofweek', 'srch_co_year', 
                'srch_co_month', 'srch_co_day', 'srch_co_hour', 'srch_co_minute',
                'srch_co_dayofyear', 'srch_co_dayofweek']
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
    data[has_null] = imp.fit_transform(data[has_null])
    
    return data

def extract_datetimes(data, field):
    data[field] = pd.to_datetime(data[field],errors='coerce')
    data[field+'_year'] = data[field].dt.year
    data[field+'_month'] = data[field].dt.month
    data[field+'_day'] = data[field].dt.day
    data[field+'_hour'] = data[field].dt.hour
    data[field+'_minute'] = data[field].dt.minute
    data[field+'_dayofyear'] = data[field].dt.dayofyear
    data[field+'_dayofweek'] = data[field].dt.dayofweek

def make_PCA(X, n_comp):
    pca = PCA(n_components=n_comp)
    pca.fit(X)
    return pca

In [5]:
use_PCA = True
save_preds = True

# Make features

In [53]:
sample_sub = pd.read_csv('data/sample_submission.csv')
train_data = make_features(pd.read_csv('data/train.csv', nrows=10)) # 37,670,294 total lines
test_data = make_features(pd.read_csv('data/test.csv', nrows=10))   # 2,528,244 total lines
destinations_data = pd.read_csv('data/destinations.csv', index_col=0)

In [32]:
train_data #['srch_destination_id'].apply()

Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,...,date_time_dayofweek,srch_ci_year,srch_ci_month,srch_ci_day,srch_ci_hour,srch_ci_minute,srch_ci_dayofyear,srch_ci_dayofweek,srch_co_year,srch_co_month,srch_co_day,srch_co_hour,srch_co_minute,srch_co_dayofyear,srch_co_dayofweek
0,2,3,66,348,48862,2234.2641,12,0,1,9,2,0,1,8250,1,...,0,2014.0,8.0,27.0,0.0,0.0,239.0,2.0,2014.0,8.0,31.0,0.0,0.0,243.0,6.0
1,2,3,66,348,48862,2234.2641,12,0,1,9,2,0,1,8250,1,...,0,2014.0,8.0,29.0,0.0,0.0,241.0,4.0,2014.0,9.0,2.0,0.0,0.0,245.0,1.0
2,2,3,66,348,48862,2234.2641,12,0,0,9,2,0,1,8250,1,...,0,2014.0,8.0,29.0,0.0,0.0,241.0,4.0,2014.0,9.0,2.0,0.0,0.0,245.0,1.0
3,2,3,66,442,35390,913.1932,93,0,0,3,2,0,1,14984,1,...,5,2014.0,11.0,23.0,0.0,0.0,327.0,6.0,2014.0,11.0,28.0,0.0,0.0,332.0,4.0
4,2,3,66,442,35390,913.6259,93,0,0,3,2,0,1,14984,1,...,5,2014.0,11.0,23.0,0.0,0.0,327.0,6.0,2014.0,11.0,28.0,0.0,0.0,332.0,4.0
5,2,3,66,442,35390,911.5142,93,0,0,3,2,0,1,14984,1,...,5,2014.0,11.0,23.0,0.0,0.0,327.0,6.0,2014.0,11.0,28.0,0.0,0.0,332.0,4.0
6,2,3,66,189,10067,1573.520933,501,0,0,2,2,0,1,8267,1,...,2,2014.0,8.0,1.0,0.0,0.0,213.0,4.0,2014.0,8.0,2.0,0.0,0.0,214.0,5.0
7,2,3,66,189,10067,1573.520933,501,0,1,2,2,0,1,8267,1,...,2,2014.0,8.0,1.0,0.0,0.0,213.0,4.0,2014.0,8.0,2.0,0.0,0.0,214.0,5.0
8,2,3,66,189,10067,1573.520933,501,0,0,2,2,0,1,8267,1,...,2,2014.0,8.0,1.0,0.0,0.0,213.0,4.0,2014.0,8.0,2.0,0.0,0.0,214.0,5.0
9,2,3,66,189,10067,1573.520933,501,0,0,2,2,0,1,8267,1,...,2,2014.0,8.0,1.0,0.0,0.0,213.0,4.0,2014.0,8.0,2.0,0.0,0.0,214.0,5.0


In [58]:
destinations_data

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,...,d135,d136,d137,d138,d139,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
srch_destination_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,-2.181690,-2.181690,-2.181690,-2.082564,-2.181690,-2.165028,-2.181690,-2.181690,-2.031597,-2.181690,-2.181690,-2.181690,-2.181690,-2.181690,-2.181690,...,-2.181690,-2.181690,-2.148982,-2.181690,-2.181690,-2.165028,-2.181690,-2.165028,-2.181690,-2.181690,-2.165028,-2.181690,-2.181690,-2.181690,-2.181690
2,-2.183490,-2.224164,-2.224164,-2.189562,-2.105819,-2.075407,-2.224164,-2.118483,-2.140393,-2.224164,-2.209855,-2.224164,-2.110723,-2.186008,-2.224164,...,-2.224164,-2.224164,-2.214572,-2.186008,-2.191569,-2.224164,-2.224164,-2.196379,-2.224164,-2.192009,-2.224164,-2.224164,-2.224164,-2.224164,-2.057548
3,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.115485,-2.177409,-2.177409,-2.177409,-2.177409,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,...,-2.177409,-2.177409,-2.115485,-2.177409,-2.161081,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409
4,-2.189562,-2.187783,-2.194008,-2.171153,-2.152303,-2.056618,-2.194008,-2.194008,-2.145911,-2.194008,-2.089094,-2.194008,-2.155205,-2.070995,-2.194008,...,-2.194008,-2.194008,-2.151470,-2.194008,-2.163242,-2.187356,-2.194008,-2.191779,-2.194008,-2.194008,-2.185161,-2.194008,-2.194008,-2.194008,-2.188037
5,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.155473,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,...,-2.174489,-2.174489,-2.155473,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489,-2.174489
6,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.137590,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,...,-2.174610,-2.174610,-2.137590,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610,-2.174610
7,-2.221932,-2.226591,-2.226591,-2.226591,-2.095756,-2.019335,-2.207045,-2.217996,-2.224797,-2.199692,-2.193044,-2.226591,-2.226591,-2.098555,-2.226591,...,-2.226591,-2.131885,-2.207019,-2.226591,-2.058776,-2.221932,-2.226591,-2.094537,-2.226591,-2.226591,-2.226591,-2.226591,-2.226591,-2.226591,-2.226591
8,-2.201047,-2.201047,-2.201047,-2.150858,-2.150858,-2.030768,-2.194575,-2.195658,-2.201047,-2.182411,-2.201047,-2.201047,-2.201047,-2.102984,-2.201047,...,-2.201047,-2.074210,-2.151259,-2.201047,-2.106235,-2.201047,-2.201047,-2.201047,-2.201047,-2.201047,-2.201047,-2.201047,-2.201047,-2.201047,-2.144392
9,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.141488,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,...,-2.175979,-2.175979,-2.175979,-2.175979,-2.136317,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979,-2.175979


In [54]:
destinations_data[destinations_data['srch_destination_id']==8250]

KeyError: 'srch_destination_id'

In [24]:
d = {'user_id':1, 'user':'user1', 'group_id':3, 'group_name':'ordinary users'}
m = {'user_id':'uid', 'group_id':'gid', 'group_name':'group'}

In [25]:
b = dict((m.get(k, k), v) for (k, v) in d.items())

In [30]:
m.get?

In [44]:
train_data['srch_destination_id'].get(0)

8250

In [72]:
destinations_data.ix[8250]

d1     -2.283946
d2     -2.346645
d3     -2.400912
d4     -2.367190
d5     -2.273226
d6     -1.722868
d7     -2.392862
d8     -2.381387
d9     -2.239271
d10    -2.259266
d11    -2.305731
d12    -2.400912
d13    -2.369545
d14    -1.712277
d15    -2.399467
d16    -1.901553
d17    -2.065535
d18    -2.384672
d19    -2.399649
d20    -1.140066
d21    -2.354545
d22    -2.400596
d23    -2.400912
d24    -2.365183
d25    -2.397435
d26    -2.365331
d27    -2.397981
d28    -2.363630
d29    -1.675819
d30    -2.194497
          ...   
d120   -2.392041
d121   -1.925548
d122   -1.868743
d123   -2.400912
d124   -2.389131
d125   -2.390027
d126   -2.363872
d127   -2.396035
d128   -2.007101
d129   -2.400837
d130   -2.120344
d131   -2.275190
d132   -2.203799
d133   -2.373795
d134   -2.365093
d135   -2.400912
d136   -2.370761
d137   -2.343093
d138   -2.320423
d139   -1.732640
d140   -2.384553
d141   -2.345528
d142   -2.396591
d143   -2.399953
d144   -2.388116
d145   -2.394294
d146   -2.400667
d147   -2.3987

In [None]:
# train_data['srch_destination_id'][0]

In [None]:
# destinations_data[destinations_data['srch_destination_id']==train_data['srch_destination_id'][3]]

In [None]:
features = ['site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_destination_id', 'srch_destination_type_id',
       'hotel_continent', 'hotel_country', 'hotel_market',
       'date_time_year', 'date_time_month', 'date_time_day', 'date_time_hour',
       'date_time_minute', 'date_time_dayofyear', 'date_time_dayofweek',
       'srch_ci_year', 'srch_ci_month', 'srch_ci_day', 'srch_ci_hour',
       'srch_ci_minute', 'srch_ci_dayofyear', 'srch_ci_dayofweek',
       'srch_co_year', 'srch_co_month', 'srch_co_day', 'srch_co_hour',
       'srch_co_minute', 'srch_co_dayofyear', 'srch_co_dayofweek']

# 'srch_ci', 'srch_co', 'orig_destination_distance', 'is_booking', 'cnt',

In [None]:
X_all = train_data.ix[:,features]
y_all = train_data.ix[:,'hotel_cluster']
X_test = test_data.ix[:,features]

In [None]:
# # checking correlation of features
# plt.matshow(X_all.corr())
# plt.colorbar()

In [None]:
if use_PCA:
    X = X_all.as_matrix()
    pca = make_PCA(X, 15)
    X = pca.transform(X)

In [None]:
# generate training and cross-validation features
X_train, X_cv, y_train, y_cv = train_test_split(X, y_all, train_size=.99, random_state=1)

# Train: Random Forest Model

In [None]:
expedia_rfc = RandomForestClassifier(n_estimators=100, 
                                     max_leaf_nodes=6, 
                                     criterion='entropy')

In [None]:
%time expedia_rfc = expedia_rfc.fit(X_train, y_train)

In [None]:
%%time

score_train = expedia_rfc.score(X_train, y_train)
score_cv = expedia_rfc.score(X_cv, y_cv)

# train/cv
print ('Training Score:', score_train, ', CV Score:', score_cv) 

In [None]:
feature_importance = zip(features, expedia_rfc.feature_importances_)
for x in sorted(feature_importance, key=lambda x: -x[1]):
    print (x)

# Predict

In [None]:
if use_PCA:
    X_test = pca.transform(X_test.as_matrix())

In [1]:
# break the test set into n = num_split sets to predict on
split_size = int(X_test.shape[0]*.1)
num_splits = 10
n_test = X_test.shape[0]
top_pred_hotel_cluster = []

NameError: name 'X_test' is not defined

In [None]:
%%time

for i in range(num_splits):
    prob_prediction = expedia_rfc.predict_proba(X_test[int(i*n_test/num_splits):int((i+1)*n_test/num_splits)])
    top_pred_hotel_cluster.append([' '.join([str(hotel) for hotel in row]) for row in np.argsort(prob_prediction)[:,-5:]])

In [None]:
%%time 
prob_prediction = expedia_rfc.predict_proba(X_test[:split_size])
top_pred_hotel_cluster.append([' '.join([str(hotel) for hotel in row]) for row in np.argsort(prob_prediction)[:,-5:]])

In [None]:
a = np.array(top_pred_hotel_cluster)

In [None]:
submission = pd.DataFrame(np.concatenate((a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],a[8],a[9]), axis=0), columns=['hotel_cluster'])

In [None]:
submission.to_csv('expedia-rf-2016-04-23-s1.csv', index_label='Id')