# Expedia Hotel Recommendations

# Setup

In [None]:
# Select the non-interactive Agg backend before pyplot is imported.
# NOTE(review): the next cell runs `%matplotlib notebook`, which requests the
# interactive notebook backend -- confirm which of the two is intended.
import matplotlib
matplotlib.use('Agg')

In [None]:
# IPython line magic: render matplotlib figures with the "notebook" backend.
%matplotlib notebook

In [None]:
import numpy as np
import pandas as pd

# pd.set_option('display.max_columns', 30) # 27 columns of data in training set

import matplotlib.pyplot as plt

# from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
# from sklearn import preprocessing

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# from sklearn.grid_search import GridSearchCV

In [None]:
def make_features(data, destinations_data):
    '''
    Build the model feature table from raw search records.

    Steps, in order:
      1. For each of 'date_time', 'srch_ci' and 'srch_co' that is present,
         derive date-part columns with extract_datetimes and drop the
         original column.
      2. Left-join destinations_data on 'srch_destination_id'.
      3. Replace missing values, in every column that has any, with that
         column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records; must contain 'srch_destination_id'. Note that
        extract_datetimes writes its derived columns into the frame it is
        given, so the caller's frame can be modified.
    destinations_data : pandas.DataFrame
        Per-destination features keyed by 'srch_destination_id'.

    Returns
    -------
    pandas.DataFrame
        The merged, imputed feature table. Columns that are entirely null
        have no mean to fill with and are returned unchanged.
    '''
    # Extract date-time features
    for field in ('date_time', 'srch_ci', 'srch_co'):
        if field in data.columns:
            extract_datetimes(data, field)
            data = data.drop(field, axis=1)

    # merge in srch_destination_id d1-d149 fields
    data = pd.merge(data, destinations_data, on='srch_destination_id', how='left')

    # Only impute columns with nulls.
    has_null = data.columns[data.isnull().any()]
    if len(has_null) > 0:
        # Mean imputation done in pandas: unlike an imputer that discards
        # all-null columns, this always returns the same set of columns, so
        # the assignment below cannot fail on a shape mismatch.
        column_means = data[has_null].mean().dropna()
        data[has_null] = data[has_null].fillna(column_means)
    return data

def extract_datetimes(data, field):
    '''
    Parse column `field` of `data` as datetimes and add date-part columns.

    The column is overwritten in place with its parsed values (entries that
    cannot be parsed become NaT), and '<field>_year', '<field>_dayofyear'
    and '<field>_dayofweek' are appended to `data`. Nothing is returned.
    Month, day, hour and minute parts are currently not generated.
    '''
    parsed = pd.to_datetime(data[field], errors='coerce')
    data[field] = parsed
    for part in ('year', 'dayofyear', 'dayofweek'):
        data[field + '_' + part] = getattr(parsed.dt, part)

def make_PCA(X, n_comp):
    '''
    Fit a PCA with n_comp components on X and return the fitted PCA object.
    '''
    return PCA(n_components=n_comp).fit(X)

def apk(actual, predicted, k=5):
    """
    Average precision at k for a single query.

    Parameters
    ----------
    actual : list
        The relevant items (order is irrelevant).
    predicted : list
        The guesses, best first (order matters). Only the first k are used.
    k : int, optional
        Maximum number of guesses that are scored.

    Returns
    -------
    score : float
        Sum of precision-at-rank over every rank holding a first-time
        correct guess, divided by min(len(actual), k); 0.0 when `actual`
        is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, guess in enumerate(top_k, start=1):
        # Ignore misses, and repeats of a guess already made at a better rank.
        if guess not in actual or guess in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """
    Mean average precision at k over a batch of queries.

    Parameters
    ----------
    actual : list
        One list of relevant items per query (order inside each list is
        irrelevant).
    predicted : list
        One list of guesses per query, best first (order inside each list
        matters).
    k : int, optional
        Maximum number of guesses scored per query.

    Returns
    -------
    score : float
        Mean of apk(a, p, k) over the paired entries of `actual` and
        `predicted`.
    """
    per_query = [apk(truth, guesses, k) for truth, guesses in zip(actual, predicted)]
    return np.mean(per_query)

def mapk_score(X, y, estimator, num_splits=10, k=5):
    """
    Score MAP@k of `estimator` on features X against targets y.

    Predictions are made chunk by chunk to limit peak memory.

    Parameters
    ----------
    X : array-like or DataFrame, shape (n_samples, n_features)
        Rows to score; sliced positionally with X[start:stop].
    y : pandas.Series
        True label per row of X (read through y.values).
    estimator : fitted classifier
        Must provide predict_proba and classes_.
    num_splits : int, optional
        Number of consecutive chunks X is predicted in.
    k : int, optional
        Number of ranked guesses kept per row.

    Returns
    -------
    score : float
        mapk of the true labels against the k most probable classes of
        each row, most probable first.
    """
    n_rows = X.shape[0]
    # predict_proba columns are ordered like classes_, so a column position
    # is only equal to a label when the labels happen to be 0..n-1.
    class_labels = np.asarray(estimator.classes_)
    top_pred_hotel_cluster = []

    for i in range(num_splits):
        start = i * n_rows // num_splits
        stop = (i + 1) * n_rows // num_splits
        if start == stop:
            continue  # more splits than rows: this chunk is empty
        # Slice the X that was passed in, not a notebook-level global.
        prob_prediction = estimator.predict_proba(X[start:stop])
        # argsort is ascending; reverse it so the most probable class comes
        # first, which is what an order-sensitive metric expects.
        ranked = np.argsort(prob_prediction, axis=1)[:, ::-1][:, :k]
        top_pred_hotel_cluster.extend(class_labels[ranked].tolist())

    return mapk([[label] for label in y.values.tolist()], top_pred_hotel_cluster, k)

In [None]:
# Notebook-wide switches.
# use_PCA: when True, later cells project the features through a fitted PCA.
use_PCA = False
# NOTE(review): save_preds is not read anywhere in the visible notebook.
save_preds = True

# Make features

In [None]:
# Columns read from train.csv. 'date_time', 'hotel_market' and
# 'hotel_continent' are currently excluded.
cols = [
    'site_name',
    'posa_continent',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'orig_destination_distance',
    'user_id',
    'is_mobile',
    'is_package',
    'channel',
    'srch_ci',
    'srch_co',
    'srch_adults_cnt',
    'srch_children_cnt',
    'srch_rm_cnt',
    'srch_destination_id',
    'srch_destination_type_id',
    'is_booking',
    'cnt',
    'hotel_country',
    'hotel_cluster',
]

# Columns read from test.csv: the same list without the three fields that
# are skipped for the test file.
cols_test = [name for name in cols
             if name not in ('is_booking', 'cnt', 'hotel_cluster')]

In [None]:
# sample_sub = pd.read_csv('data/sample_submission.csv')
# Per-destination features; make_features joins them on srch_destination_id.
destinations_data = pd.read_csv('data/destinations.csv')
# Only the first 1,000,000 rows of the training file are loaded.
train_data = make_features(pd.read_csv('data/train.csv', usecols=cols, nrows=1000000),
                           destinations_data) # 37,670,294 total lines
# NOTE(review): nrows=5 loads just five test rows, so the submission-size check
# further down cannot pass -- presumably a debugging leftover; confirm.
test_data = make_features(pd.read_csv('data/test.csv', usecols=cols_test, nrows=5), 
                          destinations_data)   # 2,528,244 total lines
# test_data = pd.read_csv('data/test_maked.csv') #full csv, pre-made

In [None]:
# ('posa_continent', 0.0)
# ('user_location_country', 0.0)
# ('srch_adults_cnt', 0.0)
# ('srch_destination_id', 0.0)
# ('srch_destination_type_id', 0.0)
# ('hotel_continent', 0.0)
# ('hotel_country', 0.0)
# ('hotel_market', 0.0)
# ('date_time_hour', 0.0)
# ('date_time_dayofyear', 0.0)

In [None]:
# take features from columns in test data, ignoring some fields uniquely in train
# NOTE(review): [1:] also drops the first test column -- confirm that is intended.
features = test_data.columns.tolist()[1:]

# Label-based selection with .loc; the older .ix indexer has been removed
# from pandas.
X_all = train_data.loc[:, features]
y_all = train_data.loc[:, 'hotel_cluster']
X_test = test_data.loc[:, features]

In [None]:
# # checking correlation of features
# plt.matshow(X_all.corr())
# plt.colorbar()

In [None]:
if use_PCA:
    # .values gives the same ndarray DataFrame.as_matrix() used to return;
    # as_matrix() has been removed from pandas.
    X = X_all.values
    # Fit a 50-component PCA on the training features, then project them.
    pca = make_PCA(X, 50)
    X = pca.transform(X)
else:
    X = X_all

In [None]:
# generate training and cross-validation features
# 90% of the rows go to training, the rest to the held-out set; random_state
# fixes the shuffle so the split is repeatable.
X_train, X_cv, y_train, y_cv = train_test_split(X, y_all, train_size=.9, random_state=1)

# Train: Random Forest Model

In [None]:
from sklearn.externals import joblib

In [None]:
# Random forest of 1200 trees, each capped at 10 leaf nodes; every other
# hyper-parameter is left at the library default.
expedia_rfc = RandomForestClassifier(n_estimators=1200, 
                                     max_leaf_nodes=10)

In [None]:
# Fit on the training split; the %time line magic reports how long it took.
%time expedia_rfc = expedia_rfc.fit(X_train, y_train)

In [None]:
# Persist the fitted forest; the "Predict" section reloads it from this path.
# NOTE(review): presumably the 'model/' directory must already exist -- confirm.
joblib.dump(expedia_rfc, 'model/expedia_rfc.pkl') 

In [None]:
%%time 

# Number of chunks mapk_score predicts in (smaller chunks, lower peak memory).
num_splits=500

# MAP@5 of the fitted forest for the training split and the held-out split.
score_train = mapk_score(X_train, y_train, expedia_rfc, num_splits=num_splits)
score_cv = mapk_score(X_cv, y_cv, expedia_rfc, num_splits=num_splits)

# train/cv
print ('Training Score:', score_train, '\nCV Score:', score_cv) 

In [None]:
# Print every (feature, importance) pair, largest importance first.
feature_importance = zip(features, expedia_rfc.feature_importances_)
for x in sorted(feature_importance, key=lambda pair: pair[1], reverse=True):
    print(x)

# Predict

In [None]:
# Reload the forest from the path the training section saved it to.
# NOTE(review): joblib.load unpickles the file -- only load files you trust.
expedia_rfc = joblib.load('model/expedia_rfc.pkl')

In [None]:
if use_PCA:
    # Project the test features with the PCA fitted on the training data.
    # .values replaces DataFrame.as_matrix(), which has been removed from pandas.
    X_test = pca.transform(X_test.values)

In [None]:
%%time

# break the test set into n = num_split sets to predict on
num_splits = 10
n_test = X_test.shape[0]
top_pred_hotel_cluster = []

for i in range(num_splits):
    prob_prediction = expedia_rfc.predict_proba(X_test[int(i*n_test/num_splits):int((i+1)*n_test/num_splits)])
    top_pred_hotel_cluster.extend([' '.join([str(hotel) for hotel in row]) for row in np.argsort(prob_prediction)[:,-5:]])

In [None]:
# One row per predicted test record; each value is the space-separated string
# of predicted clusters built in the prediction loop.
submission = pd.DataFrame(top_pred_hotel_cluster, columns=['hotel_cluster'])

In [None]:
# Write the CSV only when the row count equals the expected 2528243;
# otherwise just report the mismatch.
if len(submission) != 2528243:
    print('submission size does not match correct value')
else:
    submission.to_csv('expedia-rf-2016-05-01-s1.csv', index_label='Id')

# Plot learning curves

In [None]:
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.datasets import load_digits
from sklearn import cross_validation
from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw training and cross-validation learning curves on a new figure.

    Parameters
    ----------
    estimator : object
        Estimator handed to learning_curve.

    title : string
        Title of the chart.

    X : array-like, shape (n_samples, n_features)
        Feature matrix handed to learning_curve.

    y : array-like
        Targets handed to learning_curve.

    ylim : tuple (ymin, ymax), optional
        When given, sets the y-axis limits of the plot.

    cv : optional
        Cross-validation setting, forwarded unchanged to learning_curve.

    n_jobs : integer, optional
        Parallelism setting, forwarded unchanged to learning_curve
        (default 1).

    train_sizes : array-like, optional
        Training-set sizes, forwarded unchanged to learning_curve
        (default: five evenly spaced values from 0.1 to 1.0).

    Returns
    -------
    The matplotlib.pyplot module, so the caller can show or save the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, held_out_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, std, colour, legend label), with the
    # statistics taken across axis 1 of each score matrix.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(held_out_scores, axis=1), np.std(held_out_scores, axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines,
    # keeping the same drawing order as before.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
%%time 

estimator = expedia_lr

# from sklearn.ensemble import GradientBoostingClassifier
# estimator = GradientBoostingClassifier()
title = "Learning Curves (Logistic Regression)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(X.shape[0], n_iter=5,
                                   test_size=0.3, random_state=0)

plot_learning_curve(estimator, title, X, y_all, n_jobs=2, cv=None)
plt.show()

# Exhaustive Grid Search

In [None]:
# Second split, for the grid search: 60% of the training rows become the
# development set and 40% the evaluation set (fixed random_state).
X_grid_train, X_grid_cv, y_grid_train, y_grid_cv = cross_validation.train_test_split(
    X_train, y_train, test_size=0.40, random_state=1)

In [None]:
# Hyper-parameter grid handed to GridSearchCV in the cells below.
# NOTE(review): recent scikit-learn releases require an integer
# min_samples_split to be at least 2 -- confirm 1 is accepted by the
# installed version.
param_grid = [
    {'n_estimators': [200], 'min_samples_split': [1, 2]}
]
# Metric names iterated by the per-metric tuning cell.
scores = ['precision', 'recall']
# , 'max_features': [2, 3, 5]

In [None]:
%%time
clf = GridSearchCV(RandomForestClassifier(), param_grid, error_score=0, n_jobs=1)
clf.fit(X_grid_train, y_grid_train)

print(clf.best_score_, clf.best_params_)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))

In [None]:
# Names this cell needs that no earlier cell imports.
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

# Repeat the grid search once per metric in `scores`, optimising the
# weighted variant of that metric.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # param_grid lists random-forest parameters, so the estimator searched
    # must be a RandomForestClassifier for the grid to be valid.
    clf = GridSearchCV(RandomForestClassifier(), param_grid,
                       scoring='%s_weighted' % score)
    clf.fit(X_grid_train, y_grid_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # Named fold_scores so the `scores` list driving the outer loop is not rebound.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_grid_cv, clf.predict(X_grid_cv)
    print(classification_report(y_true, y_pred))
    print()