## This is data preprocess explained in "capstone_airbnb"

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from IPython.display import display
import gc
import xgboost as xgb
import operator

# Draw inline
%matplotlib inline

# Set figure aesthetics
sns.set_style("white", {'ytick.major.size': 10.0})
sns.set_context("poster", font_scale=1.1)

In [9]:
train_users_path = 'data/train_users_2.csv'
test_users_path = 'data/test_users.csv'

# Load the data into DataFrames

# train_users
train_users = pd.read_csv(train_users_path)

# original training data
display(train_users.head())

target = train_users['country_destination']
train_users = train_users.drop(['country_destination'], axis=1)

# test_users
test_users = pd.read_csv(test_users_path)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [10]:
# Uncomment to Load from Checkpoint
users = pd.read_pickle('users_checkpoint1')

# NDCG Scoring Function

In [11]:
country_data = 'data/countries.csv'

# Country Data
data_contries = pd.read_csv(country_data)

for idx in range(data_contries.shape[0]):
    if data_contries['lng_destination'][idx] < 0:
        data_contries['lng_destination'][idx] = 360+data_contries['lng_destination'][idx]

dictLat={};
dictLng={};
for idx in range(data_contries.shape[0]):
    dictLat[data_contries['country_destination'][idx]] = data_contries['lat_destination'][idx]
    dictLng[data_contries['country_destination'][idx]] = data_contries['lng_destination'][idx]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
def getGain(target, pred):
    if target in dictLat:
        tar_lat = dictLat[target]
        tar_lng = dictLng[target]
    else:
        return 0
    
    if pred in dictLat:
        pred_lat = dictLat[pred]
        pred_lng = dictLng[pred]
    else:
        return 0
    
    if abs(tar_lng-pred_lng) <= 20.0 and abs(tar_lat-pred_lat) <= 40.0:
        return 3
    elif abs(tar_lng-pred_lng) <= 40.0 and abs(tar_lat-pred_lat) <= 80.0:
        return 2
    elif abs(tar_lng-pred_lng) <= 60.0 and abs(tar_lat-pred_lat) <= 120.0:
        return 1
    else:
        return 0
    
def dcg_at_k(r, k, method=1):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k=5, method=1):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


def score_predictions(preds, truth, n_modes=5):
    """
    preds: pd.DataFrame
      one row for each observation, one column for each prediction.
      Columns are sorted from left to right descending in order of likelihood.
    truth: pd.Series
      one row for each obeservation.
    """
    assert(len(preds)==len(truth))
    r = pd.DataFrame(0, index=preds.index, columns=preds.columns, dtype=np.float64)
    #print truth
    for row in np.arange(preds.shape[0]):
        target=truth[row]
        for col in preds.columns:
            #print preds[col][row]
            r[col][row] = getGain(target,preds[col][row])
    
    score = pd.Series(r.apply(ndcg_at_k, axis=1, reduce=True), name='score')
    return score.mean()

In [13]:
"""Metrics to compute the model performance."""
# ref: https://www.kaggle.com/davidgasquez/airbnb-recruiting-new-user-bookings/ndcg-scorer
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.
    
    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.
        
    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    #lb.fit(range(len(predictions) + 1))  ## original
    #lb.fit(range(predictions.shape[1] + 1))
    #T = lb.transform(ground_truth)
    
    T = lb.fit_transform(ground_truth) 

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [14]:
preds = pd.DataFrame([['US','FR']])
truth = pd.Series(['FR'])

print score_predictions(preds, truth)

0.630929753571




# Random Forest

In [15]:
# Create numeric label for each of the 12 target labels
labels = target.values
le = LabelEncoder()
y = le.fit_transform(labels)
print y

[ 7  7 10 ...  7  7  7]


In [16]:
#Split data back into train and test sets
vals = users.values
piv_train = len(target)
X = vals[:piv_train]
X_test = vals[piv_train:]

# Create numeric label for each of the 12 target labels
labels = target.values
le = LabelEncoder()
y = le.fit_transform(labels)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search

clf = RandomForestClassifier(n_estimators=25, random_state=101)
parameters = {'min_samples_split': [2, 20],
              'max_depth': [6, 8]
             }

reg = grid_search.GridSearchCV(clf, parameters, scoring=ndcg_scorer, cv=3)

reg.fit(X, y)



GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=101, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 20], 'max_depth': [6, 8]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(ndcg_score, needs_proba=True, k=5), verbose=0)

In [18]:
est = reg.best_estimator_
est

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=101, verbose=0, warm_start=False)

In [19]:
y_pred = est.predict_proba(X)
if len(y) > 0:
    truth = pd.Series(le.inverse_transform(y))

In [None]:
y_pred = est.predict_proba(X)

truth = pd.Series(le.inverse_transform(y))
preds = []

for i in range(len(X)):
    preds += [le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()]

preds = pd.DataFrame(preds)
print "Score for Random Forrest: " + str(score_predictions(preds, truth))