In [67]:
# Imports
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV,KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_auc_score,auc,accuracy_score

import numpy as np
import string
import itertools

import numpy as np
from scipy.sparse import csr_matrix,hstack

import boto3
import pandas as pd
from sagemaker import get_execution_role

In [96]:
# data 

role = get_execution_role()
bucket='cat-in-middle'
train_key = 'train.csv'
test_key = 'test.csv'

data_location_train = 's3://{}/{}'.format(bucket, train_key)
data_location_test = 's3://{}/{}'.format(bucket, test_key)

train = pd.read_csv(data_location_train)
test = pd.read_csv(data_location_test)

In [97]:
# getting train test data id's
train_id = train['id']
test_id = test['id']
target = train['target']

# merging train test data sets into one data frame for feature engineering 
df = pd.concat([train.drop(labels=['id','target'],axis=1), test.drop(labels='id',axis=1)])

In [6]:
# binary features
df['bin_3'] = df['bin_3'].map({'T':1,'F':0})
df['bin_4'] = df['bin_4'].map({'Y':1,'N':0})


In [10]:
df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8


In [9]:
# ordinal feature

def orde_scal(num_ordnl):
    
    step = 1/num_ordnl
    seq = np.arange(start=0, stop=1, step=round(step,5),dtype=np.float64)
    
    return seq

In [15]:
ord_2_dict = {'Freezing':1,'Cold':2,'Warm':3,'Hot':4,'Boiling Hot':5,'Lava Hot':6}
ord_1_dict = {'Novice':1,'Contributor':2,'Expert':3,'Master':4,'Grandmaster':5}

In [11]:
ord_0_list = df['ord_0'].unique().tolist()
ord_0_list_len = len(ord_0_list)
ord_0_list.sort()

ord_1_list = df['ord_1'].unique().tolist()
ord_1_list_len = len(ord_1_list)

ord_2_list = df['ord_2'].unique().tolist()
ord_2_list_len = len(ord_2_list)

ord_3_list = df['ord_3'].unique().tolist()
ord_3_list_len = len(ord_3_list)

ord_4_list = df['ord_4'].unique().tolist()
ord_4_list_len = len(ord_4_list)

ord_5_list = df['ord_5'].unique().tolist()
ord_5_list_len = len(ord_5_list)

In [16]:
ord_0_uni_map = dict(zip(ord_0_list, orde_scal(ord_0_list_len)))

ord_1_uni_map = dict(zip(ord_1_dict, orde_scal(ord_1_list_len)))

ord_2_uni_map = dict(zip(ord_2_dict, orde_scal(ord_2_list_len)))

low_alfa = dict(zip(string.ascii_lowercase[0:15], orde_scal(15)))

upr_alfa = dict(zip(string.ascii_uppercase, orde_scal(26)))

ord_feat_lst = df['ord_5'].value_counts().index.values.tolist()

ord_feat_lst.sort()

upw = dict(zip(ord_feat_lst, orde_scal(192)))

In [17]:
%%time
df['ord_0'] = df['ord_0'].map(ord_0_uni_map)
df['ord_1'] = df['ord_1'].map(ord_1_uni_map)
df['ord_2'] = df['ord_2'].map(ord_2_uni_map)
df['ord_3'] = df['ord_3'].map(low_alfa)
df['ord_4'] = df['ord_4'].map(upr_alfa)
df['ord_5'] = df['ord_5'].map(upw)

CPU times: user 564 ms, sys: 8.55 ms, total: 572 ms
Wall time: 289 ms


In [18]:
df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,0.33333,0.8,0.16667,0.46669,0.11538,0.70856,2,2
1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,0.0,0.8,0.50001,0.0,0.0,0.48453,7,8
2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,0.0,0.4,0.83335,0.46669,0.65382,0.16151,7,2
3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,0.0,0.8,0.66668,0.53336,0.11538,0.69814,2,1
4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,0.0,0.8,0.0,0.0,0.65382,0.82318,7,8


In [19]:
# Cyclic feature encoding

In [20]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [21]:
df= encode(df,'day',7)
df= encode(df,'month',12)

df.drop(labels=['day','month'],axis=1,inplace=True)
df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day_sin,day_cos,month_sin,month_cos
0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,...,0.33333,0.8,0.16667,0.46669,0.11538,0.70856,0.9749279,-0.222521,0.866025,0.5
1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,...,0.0,0.8,0.50001,0.0,0.0,0.48453,-2.449294e-16,1.0,-0.866025,-0.5
2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,...,0.0,0.4,0.83335,0.46669,0.65382,0.16151,-2.449294e-16,1.0,0.866025,0.5
3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,...,0.0,0.8,0.66668,0.53336,0.11538,0.69814,0.9749279,-0.222521,0.5,0.866025
4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,...,0.0,0.8,0.0,0.0,0.65382,0.82318,-2.449294e-16,1.0,-0.866025,-0.5


In [22]:
# nominal features

In [39]:
%%time

# Label encoding

colms = ['nom_0','nom_1','nom_2','nom_3','nom_4','nom_5','nom_6','nom_7','nom_8','nom_9']

df[colms] = df[colms].apply(LabelEncoder().fit_transform)


CPU times: user 2.05 s, sys: 393 ms, total: 2.44 s
Wall time: 1.79 s


In [62]:
# One Hot encoding

nom_sparse = OneHotEncoder().fit_transform(df[colms])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [64]:
type(nom_sparse),nom_sparse.shape

(scipy.sparse.csr.csr_matrix, (500000, 16276))

In [56]:
colhs = df.columns.tolist()

In [58]:
colhs = ['bin_0','bin_1','bin_2','bin_3','bin_4','ord_0','ord_1','ord_2','ord_3','ord_4','ord_5','day_sin','day_cos','month_sin','month_cos']


In [59]:
df_csr = csr_matrix(df[colhs])

In [71]:
type(df_csr),df_csr.shape

(scipy.sparse.csr.csr_matrix, (500000, 15))

In [72]:
df_f = hstack([df_csr,nom_sparse])

In [137]:
test_x = df_f.tocsr()[300000:,:]

In [138]:
test_x.shape

(200000, 16291)

In [99]:
train_x = df_f.tocsr()[0:300000,:]

In [101]:
train_x.shape

(300000, 16291)

In [102]:
# logistic rigression

In [127]:
%%time
logit = LogisticRegression(solver='lbfgs', max_iter=5000,C=0.12)
logit.fit(train_x,target)

CPU times: user 32.5 s, sys: 7.11 s, total: 39.6 s
Wall time: 19.9 s


In [128]:
pred = logit.predict_proba(train_x)[:,1]

In [129]:
fpr, tpr, _ = roc_curve(target, pred)
roc_auc = auc(fpr, tpr)

In [130]:
roc_auc

0.8252877437551849

In [123]:
pred_prob = logit.predict_proba(train_x)[:,1]

roc_auc_score(target,pred_prob)

0.8259927218982717

In [131]:
# cross validation
cv = KFold(n_splits=10, random_state=42)

In [132]:
%%time
scores = cross_val_score(logit,train_x,target,cv=cv,scoring='roc_auc')

CPU times: user 4min 53s, sys: 1min 5s, total: 5min 59s
Wall time: 3min 1s


In [134]:
print(scores.mean())

0.8023996165134285


In [109]:
## Grid Search

In [124]:
%%time
param_grid = { 'C': [0.100, 0.150, 0.120, 0.125, 0.130, 0.135, 0.140, 0.145, 0.150] }

logit_grid = GridSearchCV(logit, param_grid,scoring='roc_auc', cv=cv)

logit_grid.fit(train_x, target)

best_C = logit_grid.best_params_['C']
# best_C = C = 0.12345

print('Best C:', best_C)

Best C: 0.12


In [126]:
pred_prob = logit_grid.predict_proba(train_x)[:,1]

roc_auc_score(target,pred_prob)

0.8252877437551849

In [125]:
# Results

In [139]:
pred_prob = logit_grid.predict_proba(test_x)[:,1]

#roc_auc_score(target,pred_prob)

pred=logit_grid.predict_proba(test_x)[:,1]

pd.DataFrame({"id": test["id"], "target": pred_prob}).to_csv("submission.csv", index=False)