In [1]:
import pandas as pd
import numpy as np

In [2]:
train_users = pd.read_csv("../data/raw/train_users_2.csv")
display(train_users.head())

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [3]:
test_users = pd.read_csv("../data/raw/test_users.csv")
display(test_users.head())

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

def clean_up(df):
    
    # remove date_first_booking feature
    df = df.drop(['date_first_booking'], axis = 1)
    
    # split date account created feature into seperate year, month and day columns
    df[["create_year", "create_month", "create_day"]] = df["date_account_created"].str.split("-", expand=True).astype('int64')
    df = df.drop(['date_account_created'], axis = 1)
    
    # locate outlier ages and replace with nan values
    df.loc[df['age'] > 90, 'age'] = np.nan
    df.loc[df['age'] < 16, 'age'] = np.nan
    
    # replace nan age values with mean ages
    df.loc[df['age'].isnull(), 'age'] = int(df['age'].mean())
    
    df.loc[df['first_affiliate_tracked'].isnull(), 'first_affiliate_tracked'] = 'untracked'
    return df

def setup_encode(df):
    
    df_cat = df.select_dtypes(include=[object])
    enc = OneHotEncoder(handle_unknown='ignore',sparse=False)
    enc_fit = enc.fit(df_cat)
    
    return enc_fit

def encode(df, encoding):
    
    df_cat = df.select_dtypes(include=[object])
    df_num = df.select_dtypes(exclude=[object])
    encoded_data = encoding.transform(df_cat)
    encoded_df = pd.DataFrame(encoded_data)
    
    df = pd.concat([df_num, encoded_df], axis=1)
    
    return df

In [5]:
Y_train = train_users['country_destination']
display(Y_train.head())
print(Y_train.shape)

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object

(213451,)


In [6]:
df_train = clean_up(train_users).drop(['id','country_destination'], axis = 1)
df_test = clean_up(test_users).drop(['id'], axis = 1)

In [7]:
encoded_train = encode(df_train, setup_encode(df_train))
encoded_test = encode(df_test, setup_encode(df_train))

In [8]:
X_train = encoded_train
display(X_train.head())
print(X_train.shape)

Unnamed: 0,timestamp_first_active,age,signup_flow,create_year,create_month,create_day,0,1,2,3,...,120,121,122,123,124,125,126,127,128,129
0,20090319043255,36.0,0,2010,6,28,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20090523174809,38.0,0,2011,5,25,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20090609231247,56.0,3,2010,9,28,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20091031060129,42.0,0,2011,12,5,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20091208061105,41.0,0,2010,9,14,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(213451, 136)


In [9]:
X_test = encoded_test
display(X_test.head())

Unnamed: 0,timestamp_first_active,age,signup_flow,create_year,create_month,create_day,0,1,2,3,...,120,121,122,123,124,125,126,127,128,129
0,20140701000006,35.0,0,2014,7,1,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20140701000051,34.0,0,2014,7,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20140701000148,34.0,0,2014,7,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20140701000215,34.0,0,2014,7,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20140701000305,34.0,0,2014,7,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
one_hot = enc = OneHotEncoder(handle_unknown='ignore',sparse=False)
Y_train = pd.DataFrame(one_hot.fit_transform(train_users['country_destination'].values.reshape(-1,1)))
display(Y_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
213446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
Y_train.columns = one_hot.categories_[0]

display(X_train)
display(Y_train)

Unnamed: 0,timestamp_first_active,age,signup_flow,create_year,create_month,create_day,0,1,2,3,...,120,121,122,123,124,125,126,127,128,129
0,20090319043255,36.0,0,2010,6,28,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20090523174809,38.0,0,2011,5,25,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20090609231247,56.0,3,2010,9,28,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20091031060129,42.0,0,2011,12,5,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20091208061105,41.0,0,2010,9,14,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,20140630235636,32.0,0,2014,6,30,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213447,20140630235719,36.0,0,2014,6,30,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213448,20140630235754,32.0,0,2014,6,30,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213449,20140630235822,36.0,25,2014,6,30,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
213446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
213449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.preprocessing import LabelEncoder# creating initial dataframe
labelencoder = LabelEncoder()# Assigning numerical values and storing in another column
# Y_train = labelencoder.fit_transform(train_users['country_destination'])
Y_train = train_users['country_destination']
display(Y_train)

0           NDF
1           NDF
2            US
3         other
4            US
          ...  
213446      NDF
213447      NDF
213448      NDF
213449      NDF
213450      NDF
Name: country_destination, Length: 213451, dtype: object

In [19]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, multi_class='multinomial')
clf.fit(X_train, Y_train)

# >>> clf.predict(X[:2, :])
# array([0, 0])
# >>> clf.predict_proba(X[:2, :]).shape
# (2, 3)
# >>> clf.score(X, y)

LogisticRegression(multi_class='multinomial', random_state=0)

In [20]:
preds = clf.predict(X_test)
display(preds)

array(['NDF', 'NDF', 'NDF', ..., 'NDF', 'NDF', 'NDF'], dtype=object)

In [15]:
predictions = labelencoder.inverse_transform(preds)

In [16]:
print(predictions)

['NDF' 'NDF' 'NDF' ... 'NDF' 'NDF' 'NDF']


In [21]:
from collections import Counter
c = Counter(preds)
print(c)

Counter({'NDF': 62096})
