In [1]:
import numpy as np
import pandas as pd
from math import isnan
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

## Load data

In [2]:
# Training set: set the target aside, then strip target and id from the features.
df_train = pd.read_csv('../data/train_users_2.csv')
n_train = len(df_train)
labels = df_train['country_destination'].values
df_train = df_train.drop(['id', 'country_destination'], axis=1)

# Test set: keep a handle on the ids before stacking.
df_test = pd.read_csv('../data/test_users.csv')
id_test = df_test['id']

# Stack train on top of test so both go through the same transformations;
# the first n_train rows of df_all are the training rows.
# NOTE(review): 'id' is dropped from df_train but not from df_test, so the
# stacked frame keeps an 'id' column that is NaN for every training row --
# confirm this is intended.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

## Apply pre-treatment

In [3]:
def pretreat(df, min_age=18, max_age=80):
    """Clean the raw user frame in place.

    Steps, in order:
      1. every '-unknown-' cell becomes NaN;
      2. ages outside [min_age, max_age] become NaN;
      3. 'timestamp_first_active' (YYYYmmddHHMMSS) and 'date_account_created'
         are parsed to datetimes;
      4. for a few high-cardinality categorical columns, every value outside
         the 10 most frequent is collapsed into the single label 'rare_item'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    min_age, max_age : numeric, optional
        Inclusive bounds of the ages that are kept (defaults: 18 and 80).

    Returns
    -------
    None
    """
    def group_rare_items(col, n_out=10, impute='rare_item'):
        """Give all rare items of a pandas Series the same name/value.

        The n_out most frequent values are kept as they are, missing values
        stay missing, and everything else is replaced by `impute`.
        """
        # Build the set of values to keep once, rather than re-slicing the
        # value counts for every element of the column.
        frequent = set(col.value_counts(normalize=True).index[:n_out])
        return col.apply(lambda x: x if pd.isnull(x) or x in frequent else impute)

    df.replace('-unknown-', np.nan, inplace=True)

    # Build the age filter from `df` itself (not from a notebook-level global)
    # so the function works on any frame it is handed.
    age = df['age']
    df['age'] = age.where(age.between(min_age, max_age))

    df['timestamp_first_active'] = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
    df['date_account_created'] = pd.to_datetime(df['date_account_created'])

    for c in ['first_browser', 'affiliate_provider', 'signup_flow', 'language']:
        df[c] = group_rare_items(df[c])

    
# Clean the stacked frame; pretreat mutates df_all in place and returns nothing.
pretreat(df_all)

  mask = arr == x


In [8]:
# Row counts per hour of day of 'timestamp_first_active', over train and test together.
# Needs the parsed datetime column, so run it before that column is dropped.
df_all['timestamp_first_active'].dt.hour.value_counts()

21    18111
19    18025
20    17749
18    17651
22    17613
23    17205
4     16088
0     15885
17    15725
5     15397
3     14738
1     14431
2     13910
6     12475
16    11997
7      8475
15     8025
8      5137
14     4825
9      3266
13     2821
10     2188
11     1928
12     1882
Name: timestamp_first_active, dtype: int64

## Feature engineering

In [146]:
# Whole days between the first recorded activity and the account creation date
# (account-creation date minus first-active timestamp).
df_all['delta_create_active'] = (
    df_all['date_account_created']
    .sub(df_all['timestamp_first_active'])
    .dt.days
)

def split_dates(df, col_name):
    """Add year, month and weekday columns derived from a date column.

    The new columns are named '<col_name>_year', '<col_name>_month' and
    '<col_name>_weekday' and are written onto `df` in place; nothing is
    returned.
    """
    parts = pd.to_datetime(df[col_name]).dt
    for part in ('year', 'month', 'weekday'):
        df[col_name + '_' + part] = getattr(parts, part)

# Adds date_account_created_year / _month / _weekday to df_all in place.
split_dates(df_all, 'date_account_created')

In [149]:
# Remove the raw date/time columns from the frame.
raw_date_cols = ['timestamp_first_active', 'date_account_created', 'date_first_booking']
df_all = df_all.drop(raw_date_cols, axis=1)

In [151]:
# One-hot encode the categorical features. Each feature is replaced by one
# indicator column per value, named '<feature>_<value>', plus a '<feature>_nan'
# column that flags missing values. The untouched columns stay first, followed
# by the indicator columns in the order of ohe_feats.
ohe_feats = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]
df_all = pd.get_dummies(df_all, columns=ohe_feats, dummy_na=True)

In [152]:
# Preview the first rows of df_all after one-hot encoding.
df_all.head()

Unnamed: 0,id,age,delta_create_active,date_account_created_year,date_account_created_month,date_account_created_weekday,gender_FEMALE,gender_MALE,gender_OTHER,gender_nan,...,first_browser_Android Browser,first_browser_Chrome,first_browser_Chrome Mobile,first_browser_Firefox,first_browser_IE,first_browser_Mobile Safari,first_browser_Opera,first_browser_Safari,first_browser_rare_item,first_browser_nan
0,gxn3p5htnn,,465,2010,6,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,820tgsjxq7,38.0,731,2011,5,2,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,4ft3gnwmtx,56.0,475,2010,9,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,bjjt8pjhuk,42.0,764,2011,12,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,87mebub9p4,41.0,279,2010,9,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
