In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from skopt import BayesSearchCV

# Load dataset
raw_df = pd.read_csv("train.csv").drop(columns=["loan_amount", "application_date"])

# Train-test split
train_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
input_cols = list(raw_df.columns)[1:-1]
target_col = 'target'

# Feature separation
train_inputs, train_targets = train_df[input_cols], train_df[target_col]
test_inputs, test_targets = test_df[input_cols], test_df[target_col]

numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

# Define preprocessing pipeline
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Define model pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_jobs=-1, random_state=42))
])

# Train model
model_pipeline.fit(train_inputs, train_targets)

# Model Evaluation
train_score = model_pipeline.score(train_inputs, train_targets)
test_score = model_pipeline.score(test_inputs, test_targets)
print("Training Accuracy:", train_score)
print("Test Accuracy:", test_score)

# Hyperparameter tuning with Bayesian Search
param_grid = {
    'classifier__n_estimators': (50, 300),
    'classifier__max_depth': (10, 50),
    'classifier__min_child_weight': (1, 10),
    'classifier__learning_rate': (0.01, 0.3, 'log-uniform')
}

bayes_search = BayesSearchCV(
    model_pipeline, param_grid, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42
)

bayes_search.fit(train_inputs, train_targets)
print("Best Parameters:", bayes_search.best_params_)

# Kaggle Test Dataset Output
kaggle_test_df = pd.read_csv("test.csv")
kaggle_test_df = preprocessor.transform(kaggle_test_df)
predictions = bayes_search.best_estimator_.predict(kaggle_test_df)

submission = pd.DataFrame({
    "Id": kaggle_test_df[:, 0],  # Ensure correct ID mapping
    "target": predictions
})
submission.to_csv("submission.csv", index=False)


Training Accuracy: 0.936925
Test Accuracy: 0.8861
Best Parameters: OrderedDict({'classifier__learning_rate': 0.01, 'classifier__max_depth': 12, 'classifier__min_child_weight': 10, 'classifier__n_estimators': 300})


ValueError: X has 75 features, but ColumnTransformer is expecting 19 features as input.