In [240]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [241]:
%matplotlib inline

## Load and process country data

In [242]:
# Read the per-destination reference table and preview its first rows.
countries = pd.read_csv("countries.csv")
countries.head(n=5)

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06


In [243]:
def one_hot(df, cols, prefixes=None):
    """Replace each column in ``cols`` with one 0/1 indicator column per value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : iterable of str
        Names of the columns to expand, processed in the given order.
    prefixes : callable, optional
        ``prefixes(col, value)`` returns the name to give the indicator column
        for ``value`` of column ``col``. When omitted (or falsy), indicator
        columns are named after the raw values.

    Returns
    -------
    pandas.DataFrame
        ``df`` with every column in ``cols`` removed and its indicator columns
        appended on the right.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    for col in cols:
        # Pin the dtype: pandas >= 2.0 defaults to bool, which would turn the
        # indicators into True/False in the exported CSVs instead of 0/1.
        dummies = pd.get_dummies(df[col], dtype=np.uint8)
        if prefixes:
            dummies = dummies.rename(columns={x: prefixes(col, x) for x in list(dummies)})
        df = pd.concat((df, dummies), axis=1)
        df = df.drop(col, axis=1)
    return df

# Expand the destination language into one indicator column per language,
# each named "dest_<language>".
# NOTE(review): the column key carries a trailing space; presumably it matches
# the raw CSV header -- confirm before "fixing" it.
countries = one_hot(
    countries,
    ["destination_language "],
    prefixes=lambda _col, lang: "dest_%s" % lang,
)

## Load and process session data

In [244]:
# Load the raw session log and keep only events whose action type and device
# type are both known.
sessions = pd.read_csv("sessions.csv")
known_action = sessions["action_type"] != "-unknown-"
known_device = sessions["device_type"] != "-unknown-"
sessions = sessions[known_action & known_device]

# One indicator column per action type / device type; the free-text action
# columns are not used further.
sessions = one_hot(sessions, ["action_type", "device_type"]).drop(["action", "action_detail"], axis=1)
sessions.head()

Unnamed: 0,user_id,secs_elapsed,booking_request,booking_response,click,data,message_post,modify,partner_callback,submit,...,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,iPhone,iPodtouch
0,d1mm9tcy42,319.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,d1mm9tcy42,67753.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,d1mm9tcy42,301.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,d1mm9tcy42,22141.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,d1mm9tcy42,435.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [245]:
# Collapse the event log to one row per user: summing the indicator columns
# yields per-user event counts, and secs_elapsed becomes the total time.
user_sessions = sessions.groupby(sessions["user_id"]).sum().fillna(0)
user_sessions.head()

Unnamed: 0_level_0,secs_elapsed,booking_request,booking_response,click,data,message_post,modify,partner_callback,submit,view,...,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,iPhone,iPodtouch
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00023iyk9l,867896.0,1.0,0.0,4.0,9.0,1.0,0.0,1.0,0.0,21.0,...,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
0010k6l0om,463510.0,0.0,0.0,16.0,9.0,0.0,0.0,1.0,0.0,17.0,...,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001wyh0pz8,260784.0,0.0,0.0,66.0,2.0,0.0,0.0,0.0,3.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0028jgx1x1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002qnbzfs5,5714193.0,1.0,0.0,138.0,138.0,16.0,0.0,0.0,15.0,215.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600.0,0.0


## Load and process user data

In [267]:
# Load the training users and keep only those with a concrete destination:
# "NDF" and "other" rows are excluded.
users = pd.read_csv("train_users_2.csv")
no_concrete_destination = users["country_destination"].isin(["NDF", "other"])
users = users[~no_concrete_destination]
users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US


In [268]:
# drop rows with infrequent values
print(users.shape)
threshold = 100
for col in ["signup_method", "signup_flow", "language", "affiliate_channel", "affiliate_provider", "first_affiliate_tracked", "signup_app", "first_device_type", "first_browser"]:
    counts = users[col].value_counts()
    users = users[users[col].isin(counts.index[counts > threshold])]
print(users.shape)

(78814, 16)
(75973, 16)


In [269]:
# one-hot encoding time!
users = one_hot(users, ["language", "signup_method", "signup_flow", "affiliate_channel", "affiliate_provider", "first_affiliate_tracked", "signup_app", "first_device_type", "first_browser"], lambda col, x: "%s_%s"%(col, x))

In [270]:
# join on sessions
users = users.join(user_sessions, on="id").fillna(0)
users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,country_destination,language_de,language_en,language_es,...,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,iPhone,iPodtouch
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,US,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,US,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,0.0,US,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,US,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,US,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [271]:
# join on destination
missing_data = users.merge(countries, on="country_destination", suffixes=("", ""))
missing_data = missing_data.drop(["id", "date_account_created", "timestamp_first_active", "date_first_booking", "country_destination"], axis=1)
users_matrix = missing_data.as_matrix()

# save data with missing values to csv for julia code to import
missing_data = one_hot(missing_data, ["gender"])
missing_data = missing_data.drop(["-unknown-", "OTHER"], axis=1)

# store these to decode missing data results
# norm_data = missing_data[(missing_data["age"] > 15) & (missing_data["age"] < 105)]
# md_range = norm_data.max() - norm_data.min()
# md_mean = norm_data.mean()

# missing_data = ((missing_data - md_mean) / md_range).fillna(0)
missing_data.to_csv("missing_data.csv", sep=",", index=False)

In [272]:
# generate observations
obs = np.zeros((users_matrix.shape[0] * users_matrix.shape[1], 2), dtype=int)
idx = 0
for i in range(users_matrix.shape[0]):
    if users_matrix[i, 0] in ["MALE", "FEMALE"]:
        obs[idx] = [i, 0]
        idx += 1
    if users_matrix[i, 1] > 15 and users_matrix[i, 1] < 105:
        obs[idx] = [i, 1]
        idx += 1
    for j in range(2, users_matrix.shape[1]):
        obs[idx] = [i, j]
        idx += 1
obs = obs[:idx, :]
np.savetxt("obs.csv", obs, delimiter=",", fmt="%d")

In [273]:
# GLRMs aren't doing too hot, going to train regression models instead.
missing_data = users.merge(countries, on="country_destination", suffixes=("", ""))
missing_data = missing_data.drop(["id", "date_account_created", "timestamp_first_active", "date_first_booking", "country_destination"], axis=1)
missing_data = missing_data[(missing_data["age"] > 0) & (missing_data["gender"].str.contains("MALE"))]
missing_data = one_hot(missing_data, ["gender"])
missing_data.to_csv("clean_missing_data.csv", sep=",", index=False)

## At this point, switch over to Julia so we can use the low-rank models library to predict the missing ages and genders.

In [276]:
# Predicting missing values didn't work out
users = users[users["age"] > 0]
users = one_hot(users, ["gender"])
users = users.drop(["-unknown-", "OTHER", "timestamp_first_active", "id", "first_browser_-unknown-"], axis=1)

In [277]:
# break apart dates
for col in ["date_account_created", "date_first_booking"]:
    users[col] = pd.to_datetime(users[col])
    df = pd.concat((users[col].dt.year, users[col].dt.month, users[col].dt.day), axis=1)
    df.columns = ["%s_year"%col, "%s_month"%col, "%s_day"%col]
    users = pd.concat((users, df), axis=1)
is_weekend = pd.to_datetime(users["date_first_booking"]).dt.weekday >= 5
users = users.drop(["date_account_created", "date_first_booking"], axis=1)
users = pd.concat((users, is_weekend), axis=1)
users = users.rename(columns={"date_first_booking": "is_weekend"})
users.head()

Unnamed: 0,age,country_destination,language_de,language_en,language_es,language_fr,language_ko,language_zh,signup_method_basic,signup_method_facebook,...,iPodtouch,FEMALE,MALE,date_account_created_year,date_account_created_month,date_account_created_day,date_first_booking_year,date_first_booking_month,date_first_booking_day,is_weekend
2,56.0,US,0,1,0,0,0,0,1,0,...,0.0,1,0,2010,9,28,2010,8,2,False
4,41.0,US,0,1,0,0,0,0,1,0,...,0.0,0,0,2010,9,14,2010,2,18,False
6,46.0,US,0,1,0,0,0,0,1,0,...,0.0,1,0,2010,1,2,2010,1,5,False
7,47.0,US,0,1,0,0,0,0,1,0,...,0.0,1,0,2010,1,3,2010,1,13,False
8,50.0,US,0,1,0,0,0,0,1,0,...,0.0,1,0,2010,1,4,2010,7,29,False


In [278]:
# Write the finished feature table to disk without the row index.
users.to_csv("final_dataset.csv", index=False, sep=",")

In [279]:
# (rows, columns) of the users frame written to final_dataset.csv in the previous cell.
users.shape

(59603, 93)