In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [5]:
data = pd.read_csv("data/train.csv")

In [6]:
data.shape

(381109, 12)

## Data Pre-processing

In [7]:
data = data[data['Driving_License'] == 1]
data.shape

(380297, 12)

In [8]:
X = data.drop(columns=['Policy_Sales_Channel', 'Driving_License', 'id', 'Response'])
y = data.Response
X.shape

(380297, 8)

In [9]:
X.dtypes.value_counts()

object     3
int64      3
float64    2
dtype: int64

In [10]:
X.select_dtypes(include='object').nunique()

Gender            2
Vehicle_Age       3
Vehicle_Damage    2
dtype: int64

In [11]:
feat_categorical = X.select_dtypes(include='object').nunique()

In [12]:
feat_numerical = X.select_dtypes(include=["int64", "float64"]).columns

## Baseline Pipe

In [11]:
preproc_numerical_baseline = make_pipeline(
    MinMaxScaler())

preproc_categorical_baseline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"))

preproc_baseline = make_column_transformer(
    (preproc_numerical_baseline, feat_numerical),
    (preproc_categorical_baseline, feat_categorical),
    remainder="drop")

preproc_baseline

In [12]:
pipe_baseline = make_pipeline(preproc_baseline, DummyClassifier(strategy="most_frequent"))
pipe_baseline

In [13]:
score_baseline = cross_val_score(pipe_baseline, X, y, cv=5, scoring='roc_auc').mean()
score_baseline

0.5

In [16]:
X_test = pd.read_csv("data/test.csv")
X_test_ids = X_test['id']
X_test = X_test[X_test['Driving_License'] == 1]
X_test = X_test.drop(columns=['Policy_Sales_Channel', 'Driving_License', 'id'])


pipe_baseline.fit(X,y)
y_pred_baseline = pipe_baseline.predict(X_test)
y_pred_baseline

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
results = pd.concat([X_test_ids, pd.Series(y_pred_baseline, name="Response")], axis=1)

In [18]:
results.to_csv("data/submission_baseline.csv", header=True, index=False)

## Model Iteration

In [14]:
preproc_numerical = make_pipeline(
    MinMaxScaler())

preproc_categorical = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"))

preproc = make_column_transformer(
    (preproc_numerical, feat_numerical),
    (preproc_categorical, feat_categorical),
    remainder="drop")

preproc

### Decison Tree

In [24]:
model = DecisionTreeClassifier()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y, cv=5, scoring='roc_auc').mean()

0.5846567742225391

### SVC

In [25]:
model = SVC()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y, cv=5, scoring='roc_auc').mean()

### Random Forest Classifier

In [None]:
model = RandomForestClassifier()
pipe_RandomForest = make_pipeline(preproc, model) 
cross_val_score(pipe_RandomForest, X, y, cv=5, scoring='roc_auc').mean()

### AdaBoost

In [None]:
model = AdaBoostClassifier()
pipe_AdaBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_AdaBoost, X, y, cv=5, scoring='roc_auc').mean()

### KNN

In [None]:
model = KNeighborsClassifier(n_neighbors=3)
pipe_KNN = make_pipeline(preproc, model) 
cross_val_score(pipe_KNN, X, y, cv=5, scoring='roc_auc').mean()

### Stacking

In [None]:
XGBoost = XGBClassifier()
adaboost = AdaBoostClassifier()


model = StackingClassifier(
    estimators = [("XGBoost", XGBoost),("adaboost", adaboost)],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

pipe_ensemble = make_pipeline(preproc, model, memory=cachedir)

score = cross_val_score(pipe_ensemble, X, y, cv=5, scoring='roc_auc', n_jobs=-1)
print(score.std())
score.mean()

In [None]:
XGBoost = XGBClassifier()
adaboost = AdaBoostClassifier()


model = VotingClassifier(
    estimators = [("XGBoost", XGBoost),("adaboost", adaboost)], voting='soft',
    weights = [1,1], # to equally weight the two models
    n_jobs=-1
)

pipe_ensemble = make_pipeline(preproc, model)

score = cross_val_score(pipe_ensemble, X, y, cv=5, scoring='roc_auc', n_jobs=-1)
print(score.std())
score.mean()

### XGBoost

In [None]:
from tempfile import mkdtemp
from shutil import rmtree
cachedir = mkdtemp()

In [24]:
model = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=70, alpha=1)
pipe_XGBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_XGBoost, X, y, cv=5, scoring='roc_auc', n_jobs=-1).mean()









0.8324167655519314

In [27]:
pipe_XGBoost.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('minmaxscaler',
                                                     MinMaxScaler())]),
                                    Index(['Age', 'Region_Code', 'Previously_Insured', 'Annual_Premium',
          'Vintage'],
         dtype='object')),
                                   ('pipeline-2',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    Gender            2
   Vehicle_Age       3
   Vehicle_Damage    2
   dtype: int64)])),
  ('xgbclassifier',
   XGBClassifier(alpha=1, base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None,
                 enable_categorical=False, gamma=None, gpu_id=None,
                 importance_type=N

In [None]:
allow_grid_searching = True

if allow_grid_searching:
    param_grid =  {'xgbclassifier__max_depth': [3, 4, 5],
                  'xgbclassifier__n_estimators': [10, 50, 70],
                   'xgbclassifier__learning_rate': [0.01, 0.1, 1],
                  }
    search_XGBoost = GridSearchCV(pipe_XGBoost, param_grid=param_grid, 
                              cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')

    search_XGBoost.fit(X, y);
    svm_XGBoost_best = search_XGBoost.best_estimator_
    print(search_XGBoost.best_params_)
    search_XGBoost.best_score_

## Final Result

In [20]:
X_test = pd.read_csv("data/test.csv")
X_test_ids = X_test['id']
X_test = X_test[X_test['Driving_License'] == 1]
X_test = X_test.drop(columns=['Policy_Sales_Channel', 'Driving_License', 'id'])

model = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=70, alpha=1)
pipe_XGBoost = make_pipeline(preproc, model)
pipe_XGBoost.fit(X,y)
y_pred = pipe_XGBoost.predict(X_test)
y_pred





array([0, 0, 0, ..., 0, 0, 0])

In [22]:
results = pd.concat([X_test_ids, pd.Series(y_pred, name="Response")], axis=1)

In [23]:
results.to_csv("data/submission_final.csv", header=True, index=False)