## Load Data

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

In [15]:
df = pd.read_csv('../dataset/after_selection/train.csv')
df.head()

Unnamed: 0,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Policy_Sales_Channel,Age_Bin,Vehicle_Damage_Age_Interaction,Response
0,1,46.0,1,0,0,152.0,0,0,0
1,1,46.0,1,0,0,152.0,0,0,0
2,1,46.0,1,1,1,26.0,2,2,0
3,1,18.0,1,0,0,152.0,1,0,0
4,1,11.0,0,1,1,122.0,2,2,0


## Modeling

### Split Data Train & Test

In [16]:
X = df.drop(columns=['Response'])
y = df[['Response']]

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

### Evaluation Method

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold


def eval_classification(model, X_train, X_test, y_train, y_test, n_splits=5):
    # Evaluate on the test set
    y_pred_test = model.predict(X_test)

    # StratifiedKFold for cross-validation with stratified sampling
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform cross-validation
    cv_test_results = cross_validate(model, X_test, y_test, scoring=['roc_auc'],
                                     cv=cv, return_train_score=False)
    cv_train_results = cross_validate(model, X_train, y_train, scoring=['roc_auc'],
                                      cv=cv, return_train_score=False)

    # Display metrics for the training set
    print("Metrics for the Test Set:")
    print("Accuracy: %.2f" % accuracy_score(y_test, y_pred_test))
    print("Precision: %.2f" % precision_score(y_test, y_pred_test))
    print("Recall: %.2f" % recall_score(y_test, y_pred_test))
    print("F1-Score: %.2f" % f1_score(y_test, y_pred_test))
    print()

    # Display cross-validation results
    print("Metrics Using Cross Validation:")
    print(f"Mean ROC-AUC (Test): {cv_test_results['test_roc_auc'].mean():.2f}")
    print(f"Std ROC-AUC (Test): {cv_test_results['test_roc_auc'].std():.2f}")
    print()
    print(f"Mean ROC-AUC (Train): {cv_train_results['test_roc_auc'].mean():.2f}")
    print(f"Std ROC-AUC (Train): {cv_train_results['test_roc_auc'].std():.2f}")

### AdaBoost

#### Base Model

In [19]:
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
eval_classification(clf, X_train, X_test, y_train, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Metrics for the Test Set:
Accuracy: 0.80
Precision: 0.72
Recall: 0.95
F1-Score: 0.82

Metrics Using Cross Validation:
Mean ROC-AUC (Test): 0.84
Std ROC-AUC (Test): 0.01

Mean ROC-AUC (Train): 0.85
Std ROC-AUC (Train): 0.00


#### Hyperparameter Tunning

In [20]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# List of hyperparameter
hyperparameters = dict(n_estimators=[int(x) for x in np.linspace(start=50, stop=200, num=200)],
                       learning_rate=[float(x) for x in np.linspace(
                           start=0.001, stop=0.1, num=200)],
                       algorithm=['SAMME', 'SAMME.R']
                       )

In [21]:
# Init model
ab = AdaBoostClassifier(random_state=42)
ab_tuned = RandomizedSearchCV(ab, hyperparameters, random_state=42, cv=5, scoring='recall')
ab_tuned.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [22]:
print(f'Best Hyperparameters: {ab_tuned.best_params_}')

Best Hyperparameters: {'n_estimators': 98, 'learning_rate': 0.016422110552763818, 'algorithm': 'SAMME'}


In [23]:
# Predict & Evaluation
eval_classification(ab_tuned, X_train, X_test, y_train, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

## Feature Importance

### Visualization

In [None]:
def show_feature_importance(model):
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    ax = feat_importances.nlargest(25).plot(kind='barh', figsize=(10, 8))
    ax.invert_yaxis()

    plt.xlabel('score')
    plt.ylabel('feature')
    plt.title('feature importance score')

In [None]:
show_feature_importance(clf)

### Business Insight

Pemilihan model yang telah di buat berfokus terhadap metric evaluasi Recall / mengungari False Negative (pelanggan yang mempunyai Response 0 " tidak tertarik" tapi hasil prediksi dianggap 1 "tertarik"). Maka dari itu pemilihan model berdasarkan parameter tersebut bisa memberikan rekomendasi bisnis kepada Divisi Marketing agar bisa lebih tepat sasaran dalam melakukan penawaran Asuransi Kendaraan.

## Model Comparison

Masing-masing code model tersedia pada folder code_model. Berikut adalah hasil akhir masing-masing model yang terbaik

In [None]:
# Data for the models
data = {
    'Model': ['LightGBM', 'CatBoost', 'K nearest neighbors', 'Random Forest', 'AdaBoost', 'XGBoost'],
    'Accuracy': [0.80, 0.80, 0.80, 0.80, 0.80, 0.80],
    'Precision': [0.73, 0.73, 0.73, 0.73, 0.73, 0.73],
    'Recall': [0.94, 0.93, 0.94, 0.93, 0.95, 0.94],
    'F1-Score': [0.82, 0.82, 0.82, 0.82, 0.82, 0.82],
    'Mean ROC-AUC (Test)': [0.84, 0.84, 0.64, 0.84, 0.84, 0.84],
    'Std ROC-AUC (Test)': [0.01, 0.01, 0.00, 0.01, 0.00, 0.01],
    'Mean ROC-AUC (Train)': [0.85, 0.85, 0.65, 0.85, 0.85, 0.85],
    'Std ROC-AUC (Train)': [0.00, 0.00, 0.00, 0.00, 0.00, 0.00]
}

# Create DataFrame
df = pd.DataFrame(data)
df = df.sort_values(by=['Recall', 'Mean ROC-AUC (Test)', 'Std ROC-AUC (Test)'], ascending=False).reset_index(drop=True)

# Display DataFrame
df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Mean ROC-AUC (Test),Std ROC-AUC (Test),Mean ROC-AUC (Train),Std ROC-AUC (Train)
0,AdaBoost,0.8,0.73,0.95,0.82,0.84,0.0,0.85,0.0
1,LightGBM,0.8,0.73,0.94,0.82,0.84,0.01,0.85,0.0
2,XGBoost,0.8,0.73,0.94,0.82,0.84,0.01,0.85,0.0
3,K nearest neighbors,0.8,0.73,0.94,0.82,0.64,0.0,0.65,0.0
4,CatBoost,0.8,0.73,0.93,0.82,0.84,0.01,0.85,0.0
5,Random Forest,0.8,0.73,0.93,0.82,0.84,0.01,0.85,0.0


Reasoning memilih adaboost disini