In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Enumerate every file below the Kaggle input mount and print its full path,
# one per line, in the order os.walk yields directories and file names.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


In [2]:
# Load the competition's training CSV (it carries the 'Exited' target column
# that the next cell separates from the features).
train_csv_path = "/kaggle/input/playground-series-s4e1/train.csv"
train_data = pd.read_csv(train_csv_path)

In [3]:
# Build the feature matrix and target, then carve out a hold-out validation set.
# `train_test_split` is already imported in the notebook's first cell, so it is
# not re-imported here.

# Columns treated as irrelevant for training, plus the target itself
dropped_columns = ['id', 'CustomerId', 'Surname']
target_column = 'Exited'

X = train_data.drop(columns=dropped_columns + [target_column])
y = train_data[target_column]

# Identifying categorical features (one-hot encoded by the preprocessor cell)
categorical_features = ['Geography', 'Gender']

# 80/20 train/validation split; the fixed random_state keeps the split
# identical across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Preprocessing + calibrated logistic-regression baseline.
# StandardScaler / OneHotEncoder / ColumnTransformer / Pipeline are imported in
# the notebook's first cell, so nothing is re-imported here.

# Column groups: every int64/float64 column of X is standardised, and the
# columns listed in `categorical_features` are one-hot encoded.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combining transformers into a preprocessor using ColumnTransformer.
# Columns in neither group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Logistic regression wrapped in sigmoid calibration
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CalibratedClassifierCV(LogisticRegression(random_state=42), method='sigmoid')),
])

# Training the model. The pipeline stores `preprocessor` itself (not a copy),
# so this call also fits the module-level `preprocessor` object in place.
model.fit(X_train, y_train)

In [5]:
# LightGBM pipeline.
# `clone(preprocessor)` gives this pipeline its own unfitted copy of the
# transformer (same parameters). Passing `preprocessor` directly would make
# `model1.fit` re-fit, in place, the very object that other pipelines in this
# notebook also hold, so re-fitting any one of them on different data would
# silently change the transforms used by the rest.
model1 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LGBMClassifier(random_state=42)),
])

# Training the LightGBM model
model1.fit(X_train, y_train)

In [6]:
# Decision Tree pipeline (DecisionTreeClassifier is imported in the first cell).
# `clone(preprocessor)` gives this pipeline its own unfitted copy of the
# transformer, so fitting `model2` does not re-fit the module-level
# `preprocessor` object that other pipelines reference.
model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Training the Decision Tree model
model2.fit(X_train, y_train)

In [7]:
# XGBoost pipeline (XGBClassifier is imported in the first cell).
# `clone(preprocessor)` gives this pipeline its own unfitted copy of the
# transformer, so fitting `model3` does not re-fit the module-level
# `preprocessor` object that other pipelines reference.
model3 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', XGBClassifier(random_state=42)),
])

# Training the XGBoost model
model3.fit(X_train, y_train)

In [8]:
# CatBoost pipeline (CatBoostClassifier is imported in the first cell).
# `clone(preprocessor)` gives this pipeline its own unfitted copy of the
# transformer, so fitting `model4` does not re-fit the module-level
# `preprocessor` object that other pipelines reference.
# verbose=0 silences CatBoost's per-iteration training log.
model4 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', CatBoostClassifier(random_state=42, verbose=0)),
])

# Training the CatBoost model
model4.fit(X_train, y_train)

In [9]:
# Step 1: score the validation set with both fitted pipelines.
# Each pipeline yields hard labels plus column index 1 of predict_proba
# (the second class in `classes_`; presumably Exited == 1 - confirm).
val_predictions, val_probabilities = (
    model.predict(X_val),
    model.predict_proba(X_val)[:, 1],
)
val_predictions1, val_probabilities1 = (
    model1.predict(X_val),
    model1.predict_proba(X_val)[:, 1],
)

# Step 2: accuracy is computed from the labels, ROC-AUC from the probabilities.
# Logistic regression
accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)
print(f'Logistic Regression - Accuracy on validation set: {accuracy:.2f}')
print(f'Logistic Regression - ROC-AUC on validation set: {roc_auc:.2f}')

# LightGBM
accuracy1 = accuracy_score(y_true=y_val, y_pred=val_predictions1)
roc_auc1 = roc_auc_score(y_true=y_val, y_score=val_probabilities1)
print(f'LightGBM - Accuracy on validation set: {accuracy1:.2f}')
print(f'LightGBM - ROC-AUC on validation set: {roc_auc1:.2f}')

Logistic Regression - Accuracy on validation set: 0.84
Logistic Regression - ROC-AUC on validation set: 0.82
LightGBM - Accuracy on validation set: 0.87
LightGBM - ROC-AUC on validation set: 0.89


In [10]:
# Step 1: score the validation set with the three remaining pipelines.
# Each pipeline yields hard labels plus column index 1 of predict_proba
# (the second class in `classes_`; presumably Exited == 1 - confirm).
val_predictions2, val_probabilities2 = (
    model2.predict(X_val),
    model2.predict_proba(X_val)[:, 1],
)
val_predictions3, val_probabilities3 = (
    model3.predict(X_val),
    model3.predict_proba(X_val)[:, 1],
)
val_predictions4, val_probabilities4 = (
    model4.predict(X_val),
    model4.predict_proba(X_val)[:, 1],
)

# Step 2: accuracy is computed from the labels, ROC-AUC from the probabilities.
# Decision Tree
accuracy2 = accuracy_score(y_true=y_val, y_pred=val_predictions2)
roc_auc2 = roc_auc_score(y_true=y_val, y_score=val_probabilities2)
print(f'Decision Tree - Accuracy on validation set: {accuracy2:.2f}')
print(f'Decision Tree - ROC-AUC on validation set: {roc_auc2:.2f}')

# XGBoost
accuracy3 = accuracy_score(y_true=y_val, y_pred=val_predictions3)
roc_auc3 = roc_auc_score(y_true=y_val, y_score=val_probabilities3)
print(f'XGBoost - Accuracy on validation set: {accuracy3:.2f}')
print(f'XGBoost - ROC-AUC on validation set: {roc_auc3:.2f}')

# CatBoost
accuracy4 = accuracy_score(y_true=y_val, y_pred=val_predictions4)
roc_auc4 = roc_auc_score(y_true=y_val, y_score=val_probabilities4)
print(f'CatBoost - Accuracy on validation set: {accuracy4:.2f}')
print(f'CatBoost - ROC-AUC on validation set: {roc_auc4:.2f}')


Decision Tree - Accuracy on validation set: 0.80
Decision Tree - ROC-AUC on validation set: 0.70
XGBoost - Accuracy on validation set: 0.87
XGBoost - ROC-AUC on validation set: 0.89
CatBoost - Accuracy on validation set: 0.87
CatBoost - ROC-AUC on validation set: 0.89


In [11]:
# Score the competition test set with the three boosted pipelines and write
# the averaged probabilities to submission.csv.
test_csv_path = "/kaggle/input/playground-series-s4e1/test.csv"
test_data = pd.read_csv(test_csv_path)

# The full test frame is passed to each pipeline as-is; column index 1 of
# predict_proba is kept from LightGBM, XGBoost and CatBoost, in that order.
test_probabilities_lgbm, test_probabilities_xgb, test_probabilities_catboost = (
    fitted_pipeline.predict_proba(test_data)[:, 1]
    for fitted_pipeline in (model1, model3, model4)
)

# Combine predictions (unweighted mean of the three probability vectors)
final_probabilities = (
    test_probabilities_lgbm + test_probabilities_xgb + test_probabilities_catboost
) / 3

# Assemble the two-column submission frame and persist it without the index
submission_columns = {'id': test_data['id'], 'Exited': final_probabilities}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

# Last expression: display the frame in the notebook
submission

Unnamed: 0,id,Exited
0,165034,0.028563
1,165035,0.840081
2,165036,0.026570
3,165037,0.214574
4,165038,0.384275
...,...,...
110018,275052,0.035717
110019,275053,0.090987
110020,275054,0.018883
110021,275055,0.147955
