### Introduction

In this competition Airbnb challenged Kagglers to predict in which country a new user will make his or her first booking. 
https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings

In [1]:
import pandas as pd
import os
import numpy as np

test = pd.read_csv('test_users.csv', header=0, parse_dates=[1,2,3])
train = pd.read_csv('train_users_2.csv', header=0, parse_dates=[1,2,3])
sessions = pd.read_csv("sessions.csv", encoding='utf8')

In [20]:
test.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,2014-07-01 00:00:06,NaT,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,2014-07-01 00:00:51,NaT,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,2014-07-01 00:01:48,NaT,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,2014-07-01 00:02:15,NaT,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,2014-07-01 00:03:05,NaT,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [3]:
def transform_user_features(train, test):
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction import DictVectorizer
    import scipy.sparse as sp
    #encoding country destinations in train dataset
    outcome = train['country_destination']
    labels = outcome.values
    le = LabelEncoder()
    y = le.fit_transform(labels)
    train = train.drop(['country_destination'], axis=1)

    #storing user ids in test set
    id_test = test['id']

    #appending test to train and dropping date first booking which is redundant
    data = pd.concat((train, test), axis=0, ignore_index=True)
    data = data.drop(['date_first_booking'], axis=1)

    #extracting features from date_account_created
    data['dac_year'] = data['date_account_created'].apply(lambda x: x.year)
    data['dac_month'] = data['date_account_created'].apply(lambda x: x.month)
    data['dac_weekday'] = data['date_account_created'].apply(lambda x: x.weekday())
    data = data.drop(['date_account_created'], axis=1)

    #extracting features from timestamp_first_active
    data['tfa_year'] = data['timestamp_first_active'].apply(lambda x: x.year)
    data['tfa_month'] = data['timestamp_first_active'].apply(lambda x: x.month)
    data['tfa_weekday'] = data['timestamp_first_active'].apply(lambda x: x.weekday())
    data = data.drop(['timestamp_first_active'], axis=1)

    #filling age nan with age median
    data.age = data['age'].fillna(data['age'].median())

    #group age column
    bins = list(np.arange(15, 85, 5))
    bins.insert(0,0)
    bins.append(int(max(data['age'])))
    group_names = ['<15', '15-20', '20-25', '25-30', '30-35', '35-40', '40-45', '45-50',
                   '50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '>80']
    data['age_bucket'] = pd.cut(data['age'], bins, labels=group_names)

    #cleaning gender column and filling nan in all dataframe with 'unknown'
    data['gender'] = data['gender'].replace('-unknown-','unknown')
    data.ix[:, data.columns != 'age_bucket'] = data.ix[:, data.columns != 'age_bucket'].fillna('unknown')

    #generating dummy variables in top of categorical columns
    dummified = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser','age_bucket']
    for f in dummified:
        dummies = pd.get_dummies(data[f], prefix=f)
        data = data.drop([f], axis=1)
        data = pd.concat((data, dummies), axis=1)

    return data[:train.shape[0]], data[train.shape[0]:], y, le

In [4]:
def transform_sessions_features(data, df_sessions):
    from sklearn.feature_extraction import DictVectorizer
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction import DictVectorizer
    import scipy.sparse as sp
    
    # Drop row with nan values from the "user_id" column as they're useless
    df_sessions = df_sessions.dropna(subset=["user_id"])

    # Frequency of devices - by user
    device_freq = df_sessions.groupby('user_id').device_type.value_counts()
    
    # Frequency of actions taken - by user
    action_freq = df_sessions.groupby('user_id').action.value_counts()

    # Total list of users
    users = data['id'].values
    def feature_dict(df):
        f_dict = dict(list(df.groupby(level='user_id')))
        res = {}
        for k, v in f_dict.items():
            v.index = v.index.droplevel('user_id')
            res[k] = v.to_dict()
        return res

    # Make a dictionary with the frequencies { 'user_id' : {"IPhone": 2, "Windows": 1}}
    action_dict = feature_dict(action_freq)
    device_dict = feature_dict(device_freq)

    # Transform to a list of dictionaries
    action_rows = [action_dict.get(k, {}) for k in users]
    device_rows = [device_dict.get(k, {}) for k in users]

    device_transf = DictVectorizer()
    tf = device_transf.fit_transform(device_rows)

    action_transf = DictVectorizer()
    tf2 = action_transf.fit_transform(action_rows)

    # Concatenate the two datasets
    # Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...]
    features = sp.hstack([tf, tf2])

    # We create a dataframe with the new features and we write it to disk
    df_sess_features = pd.DataFrame(features.todense())
    
    df_sess_features['id'] = users

    #left joining data and sessions on user_id
    final = pd.merge(data, df_sess_features, how='left', left_on='id', right_on='id')
    final.ix[:, final.columns != 'age_bucket'].fillna(-1, inplace=True)


    final.drop(['id'], axis=1, inplace=True)
    return final

In [5]:
train, test, y, le = transform_user_features(train, test)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


In [6]:
data = pd.concat((train, test), axis=0, ignore_index=True)
final = transform_sessions_features(data, df_sessions)

del data

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


In [7]:
X_train = final.ix[:train.shape[0]-1]
X_test = final.ix[train.shape[0]:]

In [8]:
#assert train.shape[0] == y.shape[0]

In [9]:
#assert X_train.shape[0] == train.shape[0]
#assert X_train.shape[0] == y.shape[0]
print(X_train.shape)
print(X_test.shape)
print(y.shape)

(213451, 549)
(62096, 549)
(213451,)


In [10]:
del final

In [11]:
from sklearn.ensemble import BaggingClassifier
def bagging_prediction(X_train, y_train, X_test, 
                       n_estimators=100, 
                       max_samples=0.1, 
                       max_features=1.0, 
                       random_state=None):

#    unimportant_features = np.load("unimportant_features.npy")
    bagg = BaggingClassifier(random_state=random_state, 
                             n_estimators=n_estimators, 
                             max_samples=max_samples, 
                             max_features=max_features)
    bagg.fit(X_train, y_train)
    return bagg.predict_proba(X_test)

In [18]:
probs = []
for i in range(3):
    p = bagging_prediction(X_train, y, 
                           X_test,
                           n_estimators=100,
                           random_state=i)
    probs.append(p)

# We take the average
avg_probs = sum(probs)/len(probs)

In [16]:
avg_probs

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.2       ,  0.04      ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.05333333,  0.00333333],
       [ 0.        ,  0.00666667,  0.00333333, ...,  0.        ,
         0.05333333,  0.01      ],
       ..., 
       [ 0.        ,  0.00666667,  0.00333333, ...,  0.        ,
         0.04666667,  0.01666667],
       [ 0.        ,  0.00333333,  0.        , ...,  0.        ,
         0.04666667,  0.01666667],
       [ 0.00333333,  0.01      ,  0.00666667, ...,  0.        ,
         0.3       ,  0.03333333]])

In [19]:
y_pred = avg_probs
ids = []  #list of ids
cts = []  #list of countries
id_test = pretest['id'].values
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)

This solution has score in the Kaggle Leaderboard of 0.87656 (corresponding to position 336 of 1462 participants).