In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time

from numba import jit

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [4]:
start = time.time()
train = pd.read_csv(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Sept 2021\train.csv")
test = pd.read_csv(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Sept 2021\train.csv")
total_time = time.time()-start
print('DATA LOADED! Time Taken:.{:.2f}'.format(total_time))

DATA LOADED! Time Taken:.44.43


Splitting training into Train and CV sets

In [15]:
X_train,X_CV,y_train,y_CV = train_test_split(train.iloc[:,1:-1], train.iloc[:,-1:],test_size=0.2,random_state=2021)
print('shape of training set\nX:{}\ny:{} \n shape of Cross Validation set \nX:{} \ny:{}'.format(X_train.shape,y_train.shape,X_CV.shape, y_CV.shape))

shape of training set
X:(766335, 118)
y:(766335, 1) 
 shape of Cross Validation set 
X:(191584, 118) 
y:(191584, 1)


Replacing Null Values

In [13]:
numerical_features = [col for col in train.columns if col not in ('id','claim')]
for col in numerical_features:
    train[col].replace(np.nan,train[col].mean(skipna=True),inplace=True)

Removing outliers

In [18]:
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)



In [20]:
mask = yhat!=-1
Xtrain, ytrain = X_train.iloc[mask,:], y_train.iloc[mask, :]

Pipeline Object

In [21]:
pipe = Pipeline([
    ('Scaler',StandardScaler()),
    ('Selector',VarianceThreshold()),
    ('Estimator',RandomForestClassifier())
])

In [22]:
grid_params = {
    'Selector':[SelectKBest(score_func=chi2),VarianceThreshold(threshold=0.3)],
    'Estimator':[LogisticRegression(),RandomForestClassifier(),KNeighborsClassifier()]
}

In [None]:
search = GridSearchCV(estimator = pipe,param_grid = grid_params,scoring='roc_auc',verbose=2)
search.fit(X_train,np.ravel(y_train))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END Estimator=LogisticRegression(), Selector=SelectKBest(score_func=<function chi2 at 0x000001856DFD8550>); total time=   9.3s
[CV] END Estimator=LogisticRegression(), Selector=SelectKBest(score_func=<function chi2 at 0x000001856DFD8550>); total time=  10.7s
[CV] END Estimator=LogisticRegression(), Selector=SelectKBest(score_func=<function chi2 at 0x000001856DFD8550>); total time=   2.1s
[CV] END Estimator=LogisticRegression(), Selector=SelectKBest(score_func=<function chi2 at 0x000001856DFD8550>); total time=   1.9s
[CV] END Estimator=LogisticRegression(), Selector=SelectKBest(score_func=<function chi2 at 0x000001856DFD8550>); total time=   1.9s
[CV] END Estimator=LogisticRegression(), Selector=VarianceThreshold(threshold=0.3); total time=   6.4s
[CV] END Estimator=LogisticRegression(), Selector=VarianceThreshold(threshold=0.3); total time=   4.7s
[CV] END Estimator=LogisticRegression(), Selector=VarianceThreshold(thresh

performance on CV

In [None]:
pred = search.predict(X_CV)
acc = accuracy_score(y_CV,pred)
rec = recall_score(y_CV,pred)
prec = precision_score(y_CV,pred)
F1 = f1_score(y_CV,pred)
roc_auc = roc_auc_score(y_CV,pred)

print('accuracy:{} \nrecall:{} \nprecision:{} \nF1:{} \nROC_AUC:{}'.format(acc,rec,prec,F1,roc_auc))