<h1 style="font-size: 48px; color: red; text-align: center">Bank Scheme: Subscribe or Not?</h1>

<center>
    <img src="https://media4.giphy.com/media/v1.Y2lkPTc5MGI3NjExbWJ3dWh6ejh3aHpqazBxZW52eXJsa3BjcGVwd3U2dGF4MTZvNDFnOSZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/8dJz7MwFWJlS3QiRmD/giphy.gif">
</center>

<div style="border-style: solid; border-color: blue; border-width: 4px; border-radius: 4px; padding: 0.5em; font-size: 24px;">
    If you find this notebook useful, please remember to upvote it. Readers' appreciation of the effort involved is the main motivation for creating notebooks like this one.
</div>

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter cell outputs.
warnings.filterwarnings(action='ignore')

# Apply the seaborn theme first, then layer the Matplotlib overrides on top.
sns.set_theme(palette="husl", style="whitegrid")
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.grid': True,
    'grid.alpha': 0.3,
})

# Importing Data

In [None]:
# Competition train/test CSVs; the 'id' column is used as the DataFrame index.
train = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv", index_col='id')
test = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv", index_col='id')

In [None]:
# Semicolon-delimited bank-marketing file used as extra training rows.
orig = pd.read_csv("/kaggle/input/bank-marketing-dataset-full/bank-full.csv", sep=';')
# Encode the text label as 0/1; a value other than 'no'/'yes' would map to NaN.
orig['y'] = orig['y'].map({'no': 0, 'yes': 1})

In [None]:
# Append the original dataset to the competition rows and drop exact duplicates.
# `ignore_index=True` on drop_duplicates renumbers the rows 0..n-1 afterwards:
# without it, any dropped row leaves a gap in the index, and later positional
# fold indices would no longer match the index labels.
train = (
    pd.concat([train, orig], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

In [None]:
# Peek at the first rows of `train`.
train.head()

In [None]:
# Column dtypes and non-null counts for `train`.
train.info()

# Exploratory Data Analysis

In [None]:
def custom_describe(df):
    """Summarise the numeric columns of ``df``, one row per column.

    The result is ``DataFrame.describe()`` transposed, with two extra columns
    (``skewness`` and ``kurtosis``) and with ``count`` cast to integer.
    Non-numeric columns are ignored.
    """
    numeric = df.select_dtypes(include=np.number)
    return numeric.describe().T.assign(
        skewness=numeric.skew(),
        kurtosis=numeric.kurtosis(),
        # Re-assigning an existing column keeps it in its original position.
        count=lambda stats: stats['count'].astype('int'),
    )

In [None]:
# Feature names are taken from the test frame's columns.
features = test.columns.tolist()
print(features)

In [None]:
# Column groups used by the plotting cells below; note that 'day' is listed
# with the categorical features.
numerical_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'poutcome']
# Name of the label column.
target = 'y'

## Target Analysis

In [None]:
# Two side-by-side views of the class balance: a count plot and a donut chart.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(x=target, data=train, ax=axes[0])
axes[0].set_title('Distribution of Target Variable (Subscribed)', fontweight='bold', size=20)
axes[0].set_xticks(ticks=[0, 1], labels=['No', 'Yes'])
axes[0].set_xlabel("subscribed")

# The 'No'/'Yes' labels are positional. value_counts() orders by frequency, so
# sort by class label to guarantee that wedge 0 is class 0 and wedge 1 is class 1.
train[target].value_counts().sort_index().plot(
    kind='pie',
    ax=axes[1],
    explode=(0.0, 0.1),
    autopct="%.2f%%",
    labels=['No', 'Yes'],
    pctdistance=0.75,
)
axes[1].add_artist(plt.Circle((0, 0), 0.5, fc='w'))  # white disc turns the pie into a donut
axes[1].set_title('Pie Chart of Target Variable', fontweight='bold', size=20)
axes[1].set_ylabel("")

plt.tight_layout()
plt.show()



## Numerical Feature Analysis

In [None]:
# Summary statistics plus skewness and kurtosis for the numerical columns.
custom_describe(train[numerical_features])

In [None]:
def numerical_features_plot(df, feature, target):
    """Show a boxplot, violin plot and histogram of one feature, split by target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the ``feature`` and ``target`` columns.
    feature : str
        Column drawn along the x-axis of every panel.
    target : str
        Column used as the grouping axis of the first two panels and as hue
        in all three.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    fig.suptitle(f'Analysis of {feature}', fontsize=16, fontweight='bold')

    # Boxplot
    sns.boxplot(data=df, x=feature, y=target, hue=target, orient='h', ax=axes[0])
    axes[0].set_title(f'Boxplot of {feature}')

    # Violinplot
    sns.violinplot(data=df, x=feature, y=target, hue=target, orient='h', ax=axes[1])
    axes[1].set_title(f'Violinplot of {feature}')

    # hue repeats the y grouping, so a legend adds nothing on these two panels.
    # Remove it only when one was actually drawn: `ax.legend_` is None otherwise
    # and calling `.remove()` on it would raise AttributeError.
    # NOTE(review): whether seaborn draws a legend for a redundant hue appears
    # to depend on the seaborn version - confirm against the installed release.
    for ax in axes[:2]:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    # Histogram with KDE
    sns.histplot(data=df, x=feature, hue=target, kde=True, ax=axes[2], alpha=0.6)
    axes[2].set_title(f'Distribution of {feature}')

    # Leave a little headroom at the top for the figure-level title.
    plt.tight_layout(rect=[0, 0, 1, 0.98])
    plt.show()

In [None]:
# One three-panel figure (boxplot, violin plot, histogram) per numerical feature.
for feature in numerical_features:
    numerical_features_plot(train, feature, target)

## Categorical Feature Analysis

In [None]:
def categorical_features_plot(df, feature):
    """Show a count plot and a donut chart for one categorical feature.

    Only the (at most) ten most frequent categories are drawn. The donut
    labels give each category's share of *all* rows in ``df``, including rows
    whose category was not among the ones drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``feature`` column.
    feature : str
        Name of the categorical column to plot.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    value_counts = df[feature].value_counts()

    top_n = min(10, len(value_counts))
    top_categories = value_counts.nlargest(top_n)
    # Prefix used in both titles only when some categories were left out.
    top_label = f"Top {top_n} " if len(value_counts) > 10 else ""

    df_plot = df[df[feature].isin(top_categories.index)]

    top_percentages = (top_categories / len(df)) * 100
    # plt.pie rescales the wedges so they sum to 100%, and autopct receives
    # that rescaled value. Keep the true total so the labels can be converted
    # back to a share of the full dataset.
    shown_total = top_percentages.sum()

    plt.figure(figsize=(25, 6))

    plt.subplot(1, 2, 1)
    sns.countplot(df_plot, x=feature, palette=sns.color_palette('viridis'))
    plt.title(f"Count Plot of {top_label}Categories of {feature}", size=16, fontweight='bold')

    plt.subplot(1, 2, 2)
    plt.pie(
        top_percentages,
        labels=top_percentages.index,
        autopct=lambda pct: f"{pct * shown_total / 100:.2f}%",
        pctdistance=0.75
    )
    plt.gca().add_artist(plt.Circle((0, 0), 0.5, fc='w'))  # Donut hole
    plt.title(
        f"{top_label}{feature} Categories as % of Full Dataset",
        size=16,
        fontweight='bold'
    )
    plt.ylabel("")

    plt.tight_layout()
    plt.show()

In [None]:
# Plot every categorical feature except 'day'
# (presumably skipped because it has too many levels to chart - confirm).
for feature in categorical_features:
    if feature != 'day':
        categorical_features_plot(train, feature)

In [None]:
# Work on a copy of the training frame for the plots in this section.
df = train.copy()

# One panel per job, limited to the six most frequent jobs; each panel holds
# boxplots of age for the two target classes.
g = sns.FacetGrid(
    data=df.loc[df['job'].isin(df['job'].value_counts().head(6).index)],
    col='job',
    col_wrap=3,
    height=4,
    aspect=1.2,
)
g.map_dataframe(sns.boxplot, x='y', y='age', palette='viridis')
g.set_titles("{col_name}")
g.fig.suptitle('Age Distribution by Job and Subscription Status', y=1.05)
plt.show()

In [None]:
def plot_categorical_heatmap(feature1, feature2, data=None):
    """Heatmap of how ``feature2`` is distributed within each ``feature1`` level.

    Each row of the heatmap is one category of ``feature1`` and sums to 100%;
    the cells are the percentage of that row's records falling into each
    category of ``feature2``. The target column is not involved.

    Parameters
    ----------
    feature1 : str
        Column whose categories form the rows.
    feature2 : str
        Column whose categories form the columns.
    data : pandas.DataFrame, optional
        Frame to tabulate. Defaults to the notebook-level ``df``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame, as before

    # normalize='index' divides every row by its own total.
    cross_tab = pd.crosstab(data[feature1], data[feature2], normalize='index') * 100

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(cross_tab, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=.5, ax=ax)
    # The previous title mentioned subscription, which this table does not show.
    ax.set_title(f'Row-wise % distribution of {feature2} within each {feature1}')
    ax.set_ylabel(feature1)
    ax.set_xlabel(feature2)
    plt.show()

In [None]:
# Education mix within each job category.
plot_categorical_heatmap('job', 'education')

In [None]:
# Education mix within each marital status.
plot_categorical_heatmap('marital', 'education')

In [None]:
# Contact-type mix within each previous-campaign outcome.
plot_categorical_heatmap('poutcome', 'contact')

# Model Training

In [None]:
def data_process(df): # Source: https://www.kaggle.com/code/haohuanchen/ps-s5e8-lightgb-model-a-simple-starter
    """Return a copy of ``df`` with an added integer ``many_no`` column.

    ``many_no`` encodes how many of the ``default``, ``housing`` and ``loan``
    columns equal the string ``'no'``:

    * all three -> 21
    * exactly two -> 7
    * exactly one -> 3
    * none -> 0

    The input frame is not modified.
    """
    df = df.copy()

    # Count the 'no' flags per row in one vectorised pass instead of a Python
    # call per row, then translate the count into the score.
    no_count = df[['default', 'housing', 'loan']].eq('no').sum(axis=1)
    df['many_no'] = no_count.map({0: 0, 1: 3, 2: 7, 3: 21}).astype('int64')

    return df

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold

# Separate the label column from the predictors.
y = train['y']
X = train.drop(columns='y')

# Add the engineered `many_no` column, then cast every column to string
# for both the training and the test frame.
X_str = data_process(X).astype(str)
test_str = data_process(test).astype(str)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

# Unfitted template; each CV fold trains its own clone of it.
cat_clf = CatBoostClassifier(
    # Objective and data handling: every column is passed as categorical.
    loss_function='CrossEntropy',
    cat_features=X_str.columns.to_list(),
    # Boosting budget; with use_best_model the best iteration on the eval set is kept.
    n_estimators=10000,
    learning_rate=0.1,
    use_best_model=True,
    # Runtime and logging.
    task_type='GPU',
    verbose=False,
    allow_writing_files=False,
)

# Stratified K-fold training: every fold fits a fresh clone of `cat_clf`,
# scores it on the held-out part, and adds its test-set probabilities to
# `test_pred`, which is averaged over the folds at the end.
N_SPLITS = 5
skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
test_pred = np.zeros(len(test_str))
roc_scores = []

for fold, (train_idx, test_idx) in enumerate(skfold.split(X_str, y), 1):
    # `split` yields positional indices, so both X and y must be sliced with
    # .iloc. Plain `y[idx]` is label-based and picks the wrong rows (or raises
    # KeyError) whenever the index is not exactly 0..n-1.
    # Here X_test / y_test are the fold's validation part, not the competition test set.
    X_train, X_test = X_str.iloc[train_idx], X_str.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = clone(cat_clf)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=200, verbose=500)

    y_pred = model.predict_proba(X_test)[:, 1]
    roc_score = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc_score)

    test_pred += model.predict_proba(test_str)[:, 1]
    print(f"Fold {fold} -> ROC-AUC: {roc_score:.5f}")

print(f"Average Fold ROC-AUC: {np.mean(roc_scores):.5f} \xb1 {np.std(roc_scores):.5f}")

# Turn the accumulated sum into the mean prediction across folds.
test_pred = test_pred / N_SPLITS

# Submission

In [None]:
# Put the fold-averaged probabilities into the sample submission's 'y' column,
# write the file, and show the first rows as a sanity check.
sub = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv").assign(y=test_pred)
sub.to_csv("submission.csv", index=False)
sub.head()