In [0]:
# Imports
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV,KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_auc_score,auc,accuracy_score
from sklearn.pipeline import Pipeline

import numpy as np
import string
import itertools

import numpy as np
from scipy.sparse import csr_matrix,hstack

import boto3
import pandas as pd
#from sagemaker import get_execution_role

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
train = pd.read_csv('/content/drive/My Drive/dataset/cat/train.csv')
test = pd.read_csv('/content/drive/My Drive/dataset/cat/test.csv')

In [0]:
# getting train test data id's
train_id = train['id']
test_id = test['id']
target = train['target']

# merging train test data sets into one data frame for feature engineering 
df = pd.concat([train.drop(labels=['id','target'],axis=1), test.drop(labels='id',axis=1)])

In [0]:
from sklearn.base import BaseEstimator,MetaEstimatorMixin

# feature engineering

In [0]:
bin_attr = ['bin_3','bin_4']

ord_dict = {'Freezing':1,'Cold':2,'Warm':3,'Hot':4,
                  'Boiling Hot':5,'Lava Hot':6,'Novice':1,
                  'Contributor':2,'Expert':3,'Master':4,'Grandmaster':5}

ord_attr = ['ord_0','ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']

nom_attr = ['nom_0','nom_1','nom_2','nom_3','nom_4','nom_5','nom_6','nom_7','nom_8','nom_9','day','month']

## bin attribute transformer

In [0]:

class BinAttrTransform(BaseEstimator,MetaEstimatorMixin):
  def __init__(self,bin_attr):
    self.bin_attr = bin_attr
  def fit(self,X,y=None):
    return self
  def transform(self,X):
    bin_map = {'T':1,'F':0,'Y':1,'N':0}
    for att in bin_attr:
      X[att] = X[att].map(bin_map)
    return X

## ordinal attribute transformer

In [0]:
class OrdAttTransform(BaseEstimator,MetaEstimatorMixin):
  def __init__(self,ord_attr):
    self.ord_attr = ord_attr
    ord_dict = {'Freezing':1,'Cold':2,'Warm':3,'Hot':4,
                  'Boiling Hot':5,'Lava Hot':6,'Novice':1,
                  'Contributor':2,'Expert':3,'Master':4,'Grandmaster':5}
  def fit(self,X):
    return self
  def orde_scal(num_ordnl):
    seq = np.linspace(0,1,num_ordnl)
    return seq
  def transform(self,X):
    
    for attr in ord_attr:

      if attr in ('ord_1','ord_2'):
        X[attr] = X[attr].map(ord_dict)

      atlist = X[attr].unique().tolist()
      atlist.sort()
      arlist_len = len(atlist)
      
      attrmap = dict(zip(atlist, np.linspace(0,1,arlist_len)))
      X[attr] = X[attr].map(attrmap)
    return X

    

## nominal attribute transformer

In [0]:
class nomAttrTransform(BaseEstimator,MetaEstimatorMixin):
  def __init__(self,nom_attr):
    self.nom_attr = nom_attr
  def fit(self,X,y=None):
    return self
  def transform(self,X):
    X_label = X[nom_attr].apply(LabelEncoder().fit_transform)
    nom_sparse = OneHotEncoder().fit_transform(X_label)
    df_sp = X.drop(labels=nom_attr,axis=1)
    df_non_nom_csr = csr_matrix(df_sp)
    return hstack([df_non_nom_csr,nom_sparse])
    
    


## pipeline

In [0]:
pipeline = Pipeline([
                     ('bin_trans',BinAttrTransform(bin_attr)),
                     ('ord_trans',OrdAttTransform(ord_attr)),
                     ('nom_trans',nomAttrTransform(nom_attr))
])

In [264]:
df_sparse = pipeline.transform(df_test)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [265]:
df_sparse

<500000x16306 sparse matrix of type '<class 'numpy.float64'>'
	with 9163718 stored elements in COOrdinate format>

## train test data split

In [0]:
test_x = df_sparse.tocsr()[300000:,:]

In [0]:
train_x = df_sparse.tocsr()[:300000,:]

In [269]:
train_x.shape,test_x.shape

((300000, 16306), (200000, 16306))

## logistic rigression

In [271]:
%%time
logit = LogisticRegression(solver='lbfgs', max_iter=5000,C=0.12)
logit.fit(train_x,target)

CPU times: user 30.8 s, sys: 33.6 s, total: 1min 4s
Wall time: 32.5 s


In [0]:
pred = logit.predict_proba(train_x)[:,1]

In [0]:
fpr, tpr, _ = roc_curve(target, pred)
roc_auc = auc(fpr, tpr)

In [274]:
roc_auc

0.8272910664212785

In [275]:
pred_prob = logit.predict_proba(train_x)[:,1]

roc_auc_score(target,pred_prob)

0.8272910664212785

In [0]:
# cross validation
cv = KFold(n_splits=10, random_state=42)

In [0]:
%%time
scores = cross_val_score(logit,train_x,target,cv=cv,scoring='roc_auc')

CPU times: user 4min 53s, sys: 1min 5s, total: 5min 59s
Wall time: 3min 1s


In [0]:
print(scores.mean())

0.8023996165134285


In [0]:
## Grid Search

In [0]:
%%time
param_grid = { 'C': [0.100, 0.150, 0.120, 0.125, 0.130, 0.135, 0.140, 0.145, 0.150] }

logit_grid = GridSearchCV(logit, param_grid,scoring='roc_auc', cv=cv)

logit_grid.fit(train_x, target)

best_C = logit_grid.best_params_['C']
# best_C = C = 0.12345

print('Best C:', best_C)

Best C: 0.12


In [0]:
pred_prob = logit_grid.predict_proba(train_x)[:,1]

roc_auc_score(target,pred_prob)

0.8252877437551849

In [0]:
# Results

In [0]:
pred_prob = logit_grid.predict_proba(test_x)[:,1]

#roc_auc_score(target,pred_prob)

pred=logit_grid.predict_proba(test_x)[:,1]

pd.DataFrame({"id": test["id"], "target": pred_prob}).to_csv("submission.csv", index=False)