In [1]:
import pandas as pd
import numpy as np
import math
import datetime

In [2]:
train = pd.read_csv('../data/airbnb/train_users.csv')
test = pd.read_csv('../data/airbnb/test_users.csv')

In [3]:
countries = pd.read_csv('../data/airbnb/countries.csv')
user_demo = pd.read_csv('../data/airbnb/age_gender_bkts.csv')
sessions = pd.read_csv('../data/airbnb/sessions.csv')

In [4]:
#changing into datetime
train.date_account_created = pd.to_datetime(train.date_account_created)
train.timestamp_first_active = pd.to_datetime(train.timestamp_first_active, format = "%Y%m%d%H%M%S")
train.date_first_booking = pd.to_datetime(train.date_first_booking)
test.timestamp_first_active = pd.to_datetime(test.timestamp_first_active, format = "%Y%m%d%H%M%S")
test.date_account_created = pd.to_datetime(test.date_account_created)

train_destination = train.iloc[:,-1]

In [5]:
unknowntrain = [i for i, j in enumerate(train.gender) if j == '-unknown-']
train.loc[unknowntrain, 'gender'] = 'NA'
unknowntest = [i for i, j in enumerate(test.gender) if j == '-unknown-']
test.loc[unknowntest, 'gender'] = 'NA'

#unknowntest = test.gender.index('-unknown-')
#test.loc[unknowntest, 'gender'] = 'NA'

In [6]:
np.unique(train.gender)

array(['FEMALE', 'MALE', 'NA', 'OTHER'], dtype=object)

In [7]:
#sessions grouping by user

#Group by user_id, aggregate by number of counts (counting device_type as it is never NA), 
#and total sum of elapsed time in seconds
group_sessions = sessions.groupby("user_id").agg({'device_type':'count', 'secs_elapsed':'sum'})
#rename columns
group_sessions.columns = ['sum_secs_elapsed', 'counts']
#group by variable turns into index, I'm reseting the index and putting user_id back as a column
group_sessions.reset_index(level=0, inplace=True)

In [8]:
group_sessions.head() # to be deleted

Unnamed: 0,user_id,sum_secs_elapsed,counts
0,00023iyk9l,867896.0,40
1,0010k6l0om,586543.0,63
2,001wyh0pz8,282965.0,90
3,0028jgx1x1,297010.0,31
4,002qnbzfs5,6487080.0,789


In [9]:
#bucket all ages into format that user_demo is in for age
def agebuckets(ages):
    ageless =  [i for i in range(5,101,5)] # 5, 10, 15, 20...95, 100
    buckets = ['%d-%d' %(i, i+4) for i in range(0,100,5)] # 0-4, 5-9, 10-14...90-94, 95-99
    newlist = []
    for i in range(len(ages)):
        if math.isnan(ages[i]):
            newlist.append('NA')
        elif ages[i] <ageless[0]:
            newlist.append(buckets[0])
        elif ages[i] < ageless[1]:
            newlist.append(buckets[1])
        elif ages[i] < ageless[2]:
            newlist.append(buckets[2])
        elif ages[i] < ageless[3]:
            newlist.append(buckets[3])
        elif ages[i] < ageless[4]:
            newlist.append(buckets[4])
        elif ages[i] < ageless[5]:
            newlist.append(buckets[5])
        elif ages[i] < ageless[6]:
            newlist.append(buckets[6])
        elif ages[i] < ageless[7]:
            newlist.append(buckets[7])
        elif ages[i] < ageless[8]:
            newlist.append(buckets[8])
        elif ages[i] < ageless[9]:
            newlist.append(buckets[9])
        elif ages[i] < ageless[10]:
            newlist.append(buckets[10])
        elif ages[i] < ageless[11]:
            newlist.append(buckets[11])
        elif ages[i] < ageless[12]:
            newlist.append(buckets[12]) 
        elif ages[i] < ageless[13]:
            newlist.append(buckets[13]) 
        elif ages[i] < ageless[14]:
            newlist.append(buckets[14])
        elif ages[i] < ageless[15]:
            newlist.append(buckets[15])
        elif ages[i] < ageless[16]:
            newlist.append(buckets[16])
        elif ages[i] < ageless[17]:
            newlist.append(buckets[17])
        elif ages[i] < ageless[18]:
            newlist.append(buckets[18])
        elif ages[i] < ageless[19]:
            newlist.append(buckets[19]) 
        else:
            newlist.append('100+')
    return newlist

In [10]:
train.age[0:20] #to be deleted

0      NaN
1     38.0
2     56.0
3     42.0
4     41.0
5      NaN
6     46.0
7     47.0
8     50.0
9     46.0
10    36.0
11    47.0
12     NaN
13    37.0
14    36.0
15    33.0
16     NaN
17    31.0
18     NaN
19    29.0
Name: age, dtype: float64

In [11]:
train.age = agebuckets(train.age)
test.age = agebuckets(test.age)

In [12]:
train.age[0:20] # to be deleted

0        NA
1     35-39
2     55-59
3     40-44
4     40-44
5        NA
6     45-49
7     45-49
8     50-54
9     45-49
10    35-39
11    45-49
12       NA
13    35-39
14    35-39
15    30-34
16       NA
17    30-34
18       NA
19    25-29
Name: age, dtype: object

In [13]:
def timedif(L1, L2):
    timediflist = []
    for i in range(len(L1)):
        try:
            if (L1[i]-L2[i]).days <= -1:#datetime.timedelta(days=0):
                timediflist.append('before')
            elif (L1[i]-L2[i]).days ==0: #datetime.timedelta(days=1):
                timediflist.append('same day')
            else:
                timediflist.append('greater 1 day')
        except:
            timediflist.append('NB')
            
    return timediflist

In [14]:
np.unique(timedif(train.date_first_booking, train.date_account_created)) #testing to be deleted

array(['NB', 'before', 'greater 1 day', 'same day'], 
      dtype='|S13')

In [None]:
np.unique(timedif(test.date_first_booking, test.date_account_created)) #testing to be deleted 

In [16]:
np.unique(timedif(train.date_first_booking, train.timestamp_first_active)) # testing to be deleted 

array(['NB', 'before', 'greater 1 day', 'same day'], 
      dtype='|S13')

In [17]:
np.unique(timedif(test.date_first_booking, test.timestamp_first_active)) # testing to be deleted 

array(['NB'], 
      dtype='|S2')

In [15]:
#adding time lag columns
train['lag_account_created'] = timedif(train.date_first_booking, train.date_account_created)
train['lag_first_active'] = timedif(train.date_first_booking, train.timestamp_first_active)
train['lag_account_created_first_active'] = timedif(train.date_account_created, train.timestamp_first_active)
test['lag_account_created_first_active'] = timedif(test.date_account_created, test.timestamp_first_active)

In [16]:
def bookings(L1, L2, L3, L4):
    timediflist = []
    for i in range(len(L1)):
        if L1[i] == 'same day' or L2[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'greater 1 day' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'greater 1 day' and L2[i] == 'before':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L4[i] == 'NDF':
            timediflist.append('NB')
        else:
            timediflist.append('NA')

            
    return timediflist

In [17]:
booking = bookings(train.lag_account_created, train.lag_first_active, train.lag_account_created_first_active, train.country_destination)

In [18]:
train['bookings'] = booking

In [19]:
#given the train data gender, age, and country_desination produce the corresponding population in thousands
population_in_thous = []
for i in range(train.shape[0]):
    if train.country_destination[i] == 'NDF':
        population_in_thous.append('NB')    
    elif train.gender[i] == 'NA' or train.age[i] == 'NA' or train.gender[i] == 'nan': 
        population_in_thous.append('NA')
    elif train.gender[i] == 'OTHER':
        population_in_thous.append(0)  
    elif train.country_destination[i] == 'other':
        gendersi = user_demo.loc[user_demo.gender == train.gender[i].lower(),:] 
        ages = gendersi.loc[gendersi.age_bucket == train.age[i], :]
        ages = list(map(lambda x: float(x), ages.population_in_thousands))
        population_in_thous.append(np.mean(ages))
    else:
        genders = user_demo.loc[user_demo.gender == train.gender[i].lower(),:] 
        dests = genders.loc[genders.country_destination == train.country_destination[i] ,:]    
        population_in_thous.append(float((dests.loc[dests.age_bucket == train.age[i], 'population_in_thousands'])))
        
population_in_thous[0:10]

['NB',
 'NB',
 11264.0,
 2458.8000000000002,
 'NA',
 'NA',
 10659.0,
 10659.0,
 11413.0,
 'NA']

In [20]:
#merging gender age bucket with train data
train['population_in_thousands'] = population_in_thous

In [21]:
#merging with grouped sessions and countries, **note most of training data is not in sessions. see below 
test = pd.merge(test, group_sessions, left_on='id', right_on ='user_id', how='left')
train = pd.merge(train, group_sessions, left_on='id', right_on ='user_id', how='left')
test = test.drop('user_id', 1)
train = train.drop('user_id', 1)
print train.iloc[0:5, 0:10] #to be deleted?
print train.iloc[0:5, 10:]  # to be deleted?

           id date_account_created timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28    2009-03-19 04:32:55                NaT   
1  820tgsjxq7           2011-05-25    2009-05-23 17:48:09                NaT   
2  4ft3gnwmtx           2010-09-28    2009-06-09 23:12:47         2010-08-02   
3  bjjt8pjhuk           2011-12-05    2009-10-31 06:01:29         2012-09-08   
4  87mebub9p4           2010-09-14    2009-12-08 06:11:05         2010-02-18   

   gender    age signup_method  signup_flow language affiliate_channel  
0      NA     NA      facebook            0       en            direct  
1    MALE  35-39      facebook            0       en               seo  
2  FEMALE  55-59         basic            3       en            direct  
3  FEMALE  40-44      facebook            0       en            direct  
4      NA  40-44         basic            0       en            direct  
  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0   

In [22]:
#NDFtrain = [i for i, j in enumerate(train.country_destination) if j == 'NDF']
#train.loc[NDFtrain, 'lat_destination'] = 'NB'
#train.loc[NDFtrain, 'lng_destination'] = 'NB'
#train.loc[NDFtrain, 'distance_km'] = 'NB'
#train.loc[NDFtrain, 'destination_km2'] = 'NB'
#train.loc[NDFtrain, 'destination_language '] = 'NB'
#t#rain.loc[NDFtrain, 'language_levenshtein_distance'] = 'NB'
print train.iloc[0:5, 0:10] #to be deleted?
print train.iloc[0:5, 10:]  # to be deleted?

           id date_account_created timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28    2009-03-19 04:32:55                NaT   
1  820tgsjxq7           2011-05-25    2009-05-23 17:48:09                NaT   
2  4ft3gnwmtx           2010-09-28    2009-06-09 23:12:47         2010-08-02   
3  bjjt8pjhuk           2011-12-05    2009-10-31 06:01:29         2012-09-08   
4  87mebub9p4           2010-09-14    2009-12-08 06:11:05         2010-02-18   

   gender    age signup_method  signup_flow language affiliate_channel  
0      NA     NA      facebook            0       en            direct  
1    MALE  35-39      facebook            0       en               seo  
2  FEMALE  55-59         basic            3       en            direct  
3  FEMALE  40-44      facebook            0       en            direct  
4      NA  40-44         basic            0       en            direct  
  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0   

In [23]:
print train.iloc[0:5,0:10]
print train.iloc[0:5,10:20]
print train.iloc[0:5,20:]

           id date_account_created timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28    2009-03-19 04:32:55                NaT   
1  820tgsjxq7           2011-05-25    2009-05-23 17:48:09                NaT   
2  4ft3gnwmtx           2010-09-28    2009-06-09 23:12:47         2010-08-02   
3  bjjt8pjhuk           2011-12-05    2009-10-31 06:01:29         2012-09-08   
4  87mebub9p4           2010-09-14    2009-12-08 06:11:05         2010-02-18   

   gender    age signup_method  signup_flow language affiliate_channel  
0      NA     NA      facebook            0       en            direct  
1    MALE  35-39      facebook            0       en               seo  
2  FEMALE  55-59         basic            3       en            direct  
3  FEMALE  40-44      facebook            0       en            direct  
4      NA  40-44         basic            0       en            direct  
  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0   

In [97]:
#delete all  but one time row now that we have lag times?
#remove either train['lag_account_created'] or train['lag_first_active'] to take into account leakage
#note country destination still in training

In [24]:
train.to_csv("../data/airbnb/train_starting.csv")
test.to_csv('../data/airbnb/test_starting.csv')

In [None]:
#appendix showing the missinginess of the training ids in the sessions csv

In [35]:
strgroupids = ' '.join(group_sessions.user_id) #making a huge string of all the users ids in group_sesssions

In [36]:
sum(map(lambda x: strgroupids.find(x) != -1, test.id))

61668

In [37]:
sum(map(lambda x: strgroupids.find(x) != -1, train.id))

73815

In [45]:
print 'test shape ', test.shape
print 'train shape', train.shape

print '# test ids in sessions/#test ids', 61668.0/62096
print '# train ids in sessions/#train ids', 73815.0/213451

test shape  (62096, 15)
train shape (213451, 16)
# test ids in sessions/#test ids 0.993107446534
# train ids in sessions/#train ids 0.345817072771


In [2]:
import pandas as pd
train = pd.read_csv('train_starting.csv')
test = pd.read_csv('test_starting.csv')

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-0.4a30.tar.gz (753kB)
[K    100% |████████████████████████████████| 757kB 930kB/s 
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25ldone
[?25h  Stored in directory: /Users/YannickMac/Library/Caches/pip/wheels/d3/43/37/f902e214730441ba23bfc73621fad90dd6634e7fb34090a804
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.4a30


In [8]:
lists = list(train.columns)

In [9]:
lists.remove('country_destination')

In [10]:
lists

['Unnamed: 0',
 'id',
 'date_account_created',
 'timestamp_first_active',
 'date_first_booking',
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser',
 'lag_account_created',
 'lag_first_active',
 'lag_account_created_first_active',
 'bookings',
 'population_in_thousands',
 'sum_secs_elapsed',
 'counts',
 'lat_destination',
 'lng_destination',
 'distance_km',
 'destination_km2',
 'destination_language ',
 'language_levenshtein_distance']

In [11]:
train_x = train.loc[:,lists]

In [13]:
train_y = train.loc[:, 'country_destination']

In [17]:
import xgboost as xgb
params = {}
params["objective"] = "multi:softmax"
params["eta"] = 0.005
params["min_child_weight"] = 6
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 1
params["silent"] = 1
params["max_depth"] = 9
params['eval_metric'] = 'ndcg@5'
params['nthread'] = 4
params['missing'] = "NA"
plst = list(params.items())

dtrain = xgb.DMatrix(train_x, label=train_y)

num_round = 5
model = xgb.train(plst, dtrain, num_round)


Exception AttributeError: "'DMatrix' object has no attribute 'handle'" in <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x116359a90>> ignored


ValueError: DataFrame.dtypes for data must be int, float or bool