In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#%pip install ydata-profiling

In [None]:
#from ydata_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import entropy
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Understanding the Problem and Objective:
Before diving into the data, I needed to understand the problem I was trying to solve and the goals of this analysis.

## About Data
This dataset comes from an insurance company that has provided health insurance to its customers.

## Goal or Objective at Hand
I need to build a model to predict whether last year's policyholders (customers) will also be interested in the vehicle insurance offered by the company.

In [None]:
# Importing libraries and Loading the Dataset:

import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# Read the competition's train and test CSV files into two DataFrames.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e7/train.csv",
        "/kaggle/input/playground-series-s4e7/test.csv",
    )
)

In [None]:
# Work on a 40% random subsample of the training data; the fixed random_state
# makes the same rows come out on every run. The full frame stays available
# as `train_dataset`.
train_df = train_dataset.sample(frac=0.4, random_state=42)

In [None]:
train_df.info()

In [None]:
train_df.shape

In [None]:
# Checking the Duplicate and Missing Values in the Dataset
# Report the duplicate count explicitly: a bare expression is only displayed
# when it is the last statement of a cell, so the count would otherwise be lost.
n_duplicate_rows = int(train_df.duplicated().sum())
print(f"Duplicate rows in the sample: {n_duplicate_rows}")

# Reassign instead of mutating in place so the cell stays safe to re-run.
train_df = train_df.drop_duplicates()
train_df.shape

In [None]:
missing_values = train_df.isnull().sum()
missing_values

# 2. Exploratory Data Analysis (EDA)

In [None]:
#profile = ProfileReport(train_df, title="Profiling Report")

In [None]:
#profile.to_notebook_iframe()

# 3. Data Preprocessing

In [None]:
train_df.head(10)

In [None]:
# Columns handled in this cell
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
numerical_features = ['Annual_Premium', 'Vintage']

# One-hot encode the categorical columns, dropping the first level of each
train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True)

# Standardize the numerical columns; `scaler` keeps the statistics it learned
# from the training sample
scaler = StandardScaler().fit(train_df[numerical_features])
train_df[numerical_features] = scaler.transform(train_df[numerical_features])

train_df.head(10)

In [None]:
train_df.info()

# 4. Feature Engineering

In [None]:
# Binning Age
# The last bin is open-ended: with a hard upper edge of 80, pd.cut returns NaN
# for any older customer, and the mode imputation performed later in the
# notebook would silently relabel those rows as the most common age group.
age_bin_edges = [0, 30, 40, 50, 60, 70, np.inf]
train_df['Age_Bin'] = pd.cut(train_df['Age'], bins=age_bin_edges, labels=[1, 2, 3, 4, 5, 6])

# Create Insurance History feature
# (Vintage is already standardized at this point, so this is the scaled
# Vintage for previously insured customers and 0 otherwise.)
train_df['Insurance_History'] = train_df['Previously_Insured'] * train_df['Vintage']

# Use quantile-based binning for normalized Vintage values
quantile_transformer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile', subsample=None)
# fit_transform returns an (n, 1) array; flatten it before storing it as a column.
train_df['Vintage_Bin'] = quantile_transformer.fit_transform(train_df[['Vintage']]).ravel()

# Create a new feature combining vehicle damage status and vehicle age
# (0 = no damage or 1-2 year old vehicle, 1 = damage & < 1 year, 2 = damage & > 2 years)
train_df['Vehicle_Damage_Age'] = train_df['Vehicle_Damage_Yes'] * (
    train_df['Vehicle_Age_< 1 Year'] * 1 +
    train_df['Vehicle_Age_> 2 Years'] * 2
)

# Calculate entropy for Policy_Sales_Channel
# NOTE(review): entropy() of the channel distribution is one scalar, so this
# column holds the same value on every row and cannot help a model separate
# customers - confirm whether a per-row feature was intended.
sales_channel_entropy = train_df['Policy_Sales_Channel'].value_counts(normalize=True)
train_df['Policy_Sales_Channel_Entropy'] = entropy(sales_channel_entropy)

In [None]:
# Check for missing values
train_df.isnull().sum()

# 5. Prepare data for Modeling

In [None]:
# Convert bool variables
categorical_bool_features = ['Gender_Male', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years', 'Vehicle_Damage_Yes']
train_df[categorical_bool_features] = train_df[categorical_bool_features].astype(int)


# Fill missing values with the column mode.
# Only columns that actually contain NaN are visited: filling a NaN-free column
# changes nothing, and computing its mode is wasted work (for a column of
# unique values every row is a mode).
columns_with_missing = train_df.columns[train_df.isna().any()]
for column in columns_with_missing:
    mode_value = train_df[column].mode()[0]
    train_df.loc[:, column] = train_df[column].fillna(mode_value)

# Define the features and target variable
# NOTE(review): only 'Age' and 'Response' are dropped, so any identifier column
# still present in train_df is passed to the models as a feature - confirm
# this is intended.
X = train_df.drop(['Age', 'Response'], axis=1)
y = train_df['Response']

# Split the data (80% train / 20% hold-out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
train_df.isnull().sum()

In [None]:
train_df.head(15)

In [None]:
'''
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold

# Initialize the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Define the K-fold cross-validation object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Train and evaluate the models
model_scores = {}
for model_name, model in models.items():
    # Initialize the AUC-ROC scores
    auc_scores = []

    # Perform K-fold cross-validation
    for train_index, val_index in kfold.split(X_train):
        X_train_fold = X_train.iloc[train_index]
        y_train_fold = y_train.iloc[train_index]
        X_val_fold = X_train.iloc[val_index]
        y_val_fold = y_train.iloc[val_index]

        # Train the model on the current fold
        model.fit(X_train_fold, y_train_fold)

        # Make predictions on the validation set
        y_pred_proba_fold = model.predict_proba(X_val_fold)[:, 1]

        # Evaluate the model on the current fold
        auc = roc_auc_score(y_val_fold, y_pred_proba_fold)
        auc_scores.append(auc)

        # Print the AUC-ROC score for the current fold
        print(f'{model_name} Fold {train_index} AUC-ROC: {auc:.4f}')
        print('\n' + '='*60 + '\n')

    # Calculate the average AUC-ROC across all folds
    avg_auc = sum(auc_scores) / len(auc_scores)
    model_scores[model_name] = avg_auc
    print(f'{model_name} Average AUC-ROC: {avg_auc:.4f}')
    print('\n' + '='*60 + '\n')

# Compare the models and choose the best one
best_model_name = max(model_scores, key=model_scores.get)
best_model_avg_auc = model_scores[best_model_name]
print(f'Best Model: {best_model_name} with Average AUC-ROC: {best_model_avg_auc:.4f}')
'''

Code Cell Output:

Random Forest Fold [      1       4       5 ... 9203835 9203836 9203837] AUC-ROC: 0.8319

============================================================

Random Forest Fold [      0       1       2 ... 9203834 9203836 9203837] AUC-ROC: 0.8320

============================================================

Random Forest Fold [      0       1       2 ... 9203834 9203835 9203837] AUC-ROC: 0.8318

============================================================

Random Forest Fold [      0       2       3 ... 9203832 9203835 9203836] AUC-ROC: 0.8326

============================================================

Random Forest Fold [      0       1       2 ... 9203835 9203836 9203837] AUC-ROC: 0.8321

============================================================

Random Forest Average AUC-ROC: 0.8321

============================================================

Gradient Boosting Fold [      1       4       5 ... 9203835 9203836 9203837] AUC-ROC: 0.8625

============================================================

Gradient Boosting Fold [      0       1       2 ... 9203834 9203836 9203837] AUC-ROC: 0.8622

============================================================

Gradient Boosting Fold [      0       1       2 ... 9203834 9203835 9203837] AUC-ROC: 0.8621

============================================================

Gradient Boosting Fold [      0       2       3 ... 9203832 9203835 9203836] AUC-ROC: 0.8629

============================================================

Gradient Boosting Fold [      0       1       2 ... 9203835 9203836 9203837] AUC-ROC: 0.8616

============================================================

Gradient Boosting Average AUC-ROC: 0.8623

============================================================

In [None]:
xgb_data = train_df.copy()

In [None]:
# Remove '[', ']', '<' and '>' from the column names before they are used as
# XGBoost feature names. A single explicit regex replaces the four chained
# calls, so the result no longer depends on the pandas-version-specific default
# of `regex` ('[' on its own is not a valid regular expression).
xgb_data.columns = xgb_data.columns.str.replace(r'[\[\]<>]', '', regex=True)

In [None]:
feature_names = list(xgb_data.columns)
feature_names

In [None]:
# Target and feature matrix for the XGBoost experiments
y1 = xgb_data['Response']
X1 = xgb_data.drop(columns=['Age', 'Response'])

# 80/20 hold-out split with a fixed seed
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [None]:
'''
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold

# Initialize the models
models = {
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, device='cuda', enable_categorical=True)
}

# Define the K-fold cross-validation object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Train and evaluate the models
model_scores = {}
for model_name, model in models.items():
    # Initialize the AUC-ROC scores
    auc_scores = []

    # Perform K-fold cross-validation
    for train_index, val_index in kfold.split(X_train1):
        X_train_fold = X_train1.iloc[train_index]
        y_train_fold = y_train1.iloc[train_index]
        X_val_fold = X_train1.iloc[val_index]
        y_val_fold = y_train1.iloc[val_index]

        # Train the model on the current fold
        model.fit(X_train_fold, y_train_fold)

        # Make predictions on the validation set
        y_pred_proba_fold = model.predict_proba(X_val_fold)[:, 1]

        # Evaluate the model on the current fold
        auc = roc_auc_score(y_val_fold, y_pred_proba_fold)
        auc_scores.append(auc)

        # Print the AUC-ROC score for the current fold
        print(f'{model_name} Fold {train_index} AUC-ROC: {auc:.4f}')
        print('\n' + '='*60 + '\n')

    # Calculate the average AUC-ROC across all folds
    avg_auc = sum(auc_scores) / len(auc_scores)
    model_scores[model_name] = avg_auc
    print(f'{model_name} Average AUC-ROC: {avg_auc:.4f}')
    print('\n' + '='*60 + '\n')

# Compare the models and choose the best one
best_model_name = max(model_scores, key=model_scores.get)
best_model_avg_auc = model_scores[best_model_name]
print(f'Best Model: {best_model_name} with Average AUC-ROC: {best_model_avg_auc:.4f}')
'''

XGBoost Fold [      1       4       5 ... 9203835 9203836 9203837] AUC-ROC: 0.8764

============================================================

XGBoost Fold [      0       1       2 ... 9203834 9203836 9203837] AUC-ROC: 0.8764

============================================================

XGBoost Fold [      0       1       2 ... 9203834 9203835 9203837] AUC-ROC: 0.8762

============================================================

XGBoost Fold [      0       2       3 ... 9203832 9203835 9203836] AUC-ROC: 0.8768

============================================================

XGBoost Fold [      0       1       2 ... 9203835 9203836 9203837] AUC-ROC: 0.8762

============================================================

XGBoost Average AUC-ROC: 0.8764

============================================================

Best Model: XGBoost with Average AUC-ROC: 0.8764

In [None]:
%pip install cupy

In [None]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.model_selection import cross_val_score
import cupy

# Build the NumPy training matrices directly on the host.
# The previous version copied the data to the GPU with CuPy and immediately
# copied it back, which yields the same host arrays at the cost of GPU memory
# and cannot carry a frame with categorical/mixed dtypes. XGBoost moves the
# data to the device itself when `device='cuda'` is set.
# Casting to float first turns the categorical Age_Bin labels and the integer
# indicator columns into one homogeneous numeric matrix.
X_train1_np = X_train1.astype(float).to_numpy()
y_train1_np = y_train1.to_numpy()

'''
# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.5, 1],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 0.1, 0.5],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}
'''

'''
# Perform grid search
grid_search = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, random_state=42, device='cuda', enable_categorical=True), param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train1_np, y_train1_np)

# Print best hyperparameters and score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

'''

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample learning_rate, n_estimators, gamma, subsample
        and max_depth.

    Returns
    -------
    float
        Mean of the five `roc_auc` scores computed by `cross_val_score` on
        `X_train1_np` / `y_train1_np` (higher is better).
    """
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 7)
    }

    # `use_label_encoder` is intentionally not passed: XGBoost releases that
    # accept `device='cuda'` no longer use it and only warn about an unused
    # parameter on every fit.
    xgb_model = xgb.XGBClassifier(**param, random_state=42, device='cuda', enable_categorical=True)
    scores = cross_val_score(xgb_model, X_train1_np, y_train1_np, cv=5, scoring='roc_auc')
    return np.mean(scores)

# Seed the sampler so the hyperparameter search proposes the same trials on
# every run, matching the fixed random_state=42 used elsewhere in the notebook.
# TPESampler is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Summarize the best trial found by the study
best_trial = study.best_trial
print("Best hyperparameters:", best_trial.params)
print("Best score:", best_trial.value)

In [None]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.model_selection import cross_val_score
import cupy

In [None]:
# Hyperparameters used for the final model.
# NOTE(review): this dict contains parameters (min_child_weight,
# colsample_bytree, reg_alpha, reg_lambda) that the `objective` search space in
# this notebook does not tune, so the values presumably come from an earlier
# study - confirm their provenance.
best_param = {'learning_rate': 0.3165452668977133, 'n_estimators': 188, 'gamma': 0.2670568191008615, 
              'max_depth': 6, 'min_child_weight': 8, 'subsample': 0.9707704922792166, 
              'colsample_bytree': 0.822608299221203, 'reg_alpha': 0.4054578655101799, 
              'reg_lambda': 0.988778827463794}

# `use_label_encoder` is intentionally not passed: XGBoost releases that accept
# `device='cuda'` no longer use it and only warn about an unused parameter.
xgb_model = xgb.XGBClassifier(**best_param, random_state=42, device='cuda', enable_categorical=True)
xgb_model.fit(X_train1_np, y_train1_np)

In [None]:
test_dataset.shape

In [None]:
test = test_dataset.copy()

In [None]:
# Convert categorical variables
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
test = pd.get_dummies(test, columns=categorical_features, drop_first=True)

# Normalize/Standardize numerical features
# Reuse the `scaler` that was fitted on the training data instead of fitting a
# new one here: the model was trained on features scaled with the training
# mean/std, so the test set must go through that same transformation.
numerical_features = ['Annual_Premium', 'Vintage']
test[numerical_features] = scaler.transform(test[numerical_features])

test.head(10)

In [None]:
# Binning Age
# The last bin is open-ended: with a hard upper edge of 80, pd.cut returns NaN
# for any older customer.
age_bin_edges = [0, 30, 40, 50, 60, 70, np.inf]
test['Age_Bin'] = pd.cut(test['Age'], bins=age_bin_edges, labels=[1, 2, 3, 4, 5, 6])

# Create Insurance History feature
test['Insurance_History'] = test['Previously_Insured'] * test['Vintage']

# Use quantile-based binning for normalized Vintage values
# Reuse the `quantile_transformer` fitted on the training data so the test rows
# are assigned with the same bin edges the model saw during training;
# re-fitting on the test set would derive different edges.
# transform returns an (n, 1) array; flatten it before storing it as a column.
test['Vintage_Bin'] = quantile_transformer.transform(test[['Vintage']]).ravel()

# Create a new feature combining vehicle damage status and vehicle age
test['Vehicle_Damage_Age'] = test['Vehicle_Damage_Yes'] * (
    test['Vehicle_Age_< 1 Year'] * 1 +
    test['Vehicle_Age_> 2 Years'] * 2
)

# Calculate entropy for Policy_Sales_Channel
# NOTE(review): entropy() of the channel distribution is one scalar, so this
# column holds the same value on every row - confirm whether a per-row feature
# was intended.
sales_channel_entropy = test['Policy_Sales_Channel'].value_counts(normalize=True)
test['Policy_Sales_Channel_Entropy'] = entropy(sales_channel_entropy)

In [None]:
# Remove '[', ']', '<' and '>' from the column names, mirroring the renaming
# applied to the training frame. A single explicit regex replaces the four
# chained calls, so the result no longer depends on the
# pandas-version-specific default of `regex` ('[' on its own is not a valid
# regular expression).
test.columns = test.columns.str.replace(r'[\[\]<>]', '', regex=True)

In [None]:
test.columns

In [None]:
test1 = test.copy()

In [None]:
print(test1.dtypes)

In [None]:
# Build the prediction matrix with exactly the columns, in the same order, that
# the model was trained on (the columns of X1). The previous version kept 'Age'
# and one-hot encoded 'Age_Bin', producing a matrix whose columns did not match
# the training matrix, and it referenced an undefined name `cp` (CuPy is
# imported as `cupy`), which raised a NameError.
test_features = test1[X1.columns].astype(float)

# Plain host array; no GPU round trip is needed because XGBoost transfers the
# data itself when `device='cuda'` is set.
test_np = test_features.to_numpy()

In [None]:
# Make predictions
# Use the predicted probability of the positive class rather than hard 0/1
# labels: every model in this notebook is evaluated with ROC AUC, which scores
# the ranking of the predictions, and thresholded labels throw that ranking
# away.
y_pred = xgb_model.predict_proba(test_np)[:, 1]

In [None]:
# Pair every test id with its prediction and write the submission file
submission = test[['id']].assign(Response=y_pred)
submission.to_csv('submission1.csv', index=False)