In [155]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv
/kaggle/input/loan-approval-processed-datasets/loan_approval_train_full_processed.csv
/kaggle/input/loan-approval-processed-datasets/loan_approval_test_full_processed.csv


In [None]:
import pandas as pd
original_test_df = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
df_train = pd.read_csv('/kaggle/input/loan-approval-processed-datasets/loan_approval_train_full_processed.csv')
df_final_test = pd.read_csv('/kaggle/input/loan-approval-processed-datasets/loan_approval_test_full_processed.csv')
df_train = df_train.drop(columns='Unnamed: 0')
df_final_test  = df_final_test.drop(columns='Unnamed: 0')

In [None]:
tr_cols = list(df_train.columns)
test_cols = list(df_final_test.columns)

df_train.shape, df_final_test.shape

In [None]:
df_train.head()

In [None]:
target = 'loan_status'
feature_cols = [col for col in df_train.columns if col != target ]
corr_target = df_train[feature_cols].corrwith(df_train[target],method='pearson')


In [None]:
# corr_matrix = corr_target.sort_values(ascending=False, key=lambda x : abs(x))
# corr_matrix = corr_matrix.dropna()
# corr_matrix.describe()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

X = df_train.drop(columns=target)
y = df_train.loc[:, target]
sss = StratifiedShuffleSplit(n_splits=1, random_state=0, test_size=0.3)
for tr_idx, test_idx in sss.split(X, y):
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler())
])

X_train_scaled = pipe.fit_transform(X_train)
X_test_scaled = pipe.transform(X_test)

X_train.shape, X_train_scaled.shape, X_test.shape, X_test_scaled.shape

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, make_scorer
import xgboost as xgb
from xgboost import XGBClassifier
import os

os.chdir('/kaggle/working/')  # replace with your desired path

# Define StratifiedShuffleSplit
sss2 = StratifiedShuffleSplit(n_splits=5, random_state=32, test_size=0.3)

# Define scoring metrics
scoring = {
    'roc_auc': make_scorer(roc_auc_score, needs_proba=True, average='macro'),
    'f1': make_scorer(f1_score, average='macro'),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

xgbclassifier_params = {
    'n_estimators':250,
    'learning_rate':0.1,
    'max_depth':5,
}

# Define classification models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(max_depth=5),
    "GradientBoostingClassifier": GradientBoostingClassifier(max_depth=5),
    'XGBClassifier':XGBClassifier(**xgbclassifier_params,objective='binary:logistic')
}

# Initialize a list to store results
results_list = []

# Perform cross-validation for each model and store results
for model_name, model in models.items():
    print(f"Fitting the model: {model_name}")
    
    # Cross-validate and get training and validation scores
    scores = cross_validate(
        estimator=model,
        X=X_train_scaled,
        y=y_train,
        cv=sss2,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1
    )
    
    # Extract results and store them in a structured list
    for metric in scoring.keys():
        train_scores = scores[f'train_{metric}']
        test_scores = scores[f'test_{metric}']
        
        # Append the results to the list
        results_list.append({
            "Model": model_name,
            "Metric": metric,
            "Train Score": train_scores.mean(),
            "Validation Score": test_scores.mean()
        })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results_list)

# Pivot the results DataFrame for better readability
results_df = results_df.pivot_table(index=["Metric", "Model"], values=["Train Score", "Validation Score"])
print(results_df)


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

# Define scoring metrics for consistency with cross-validation
def evaluate_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # use for roc_auc

    return {
        'roc_auc': roc_auc_score(y_test, y_pred_proba, average='macro'),
        'f1': f1_score(y_test, y_pred, average='macro'),
        'precision': precision_score(y_test, y_pred, average='macro'),
        'recall': recall_score(y_test, y_pred, average='macro')
    }

# Define classification models with consistent hyperparameters
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(max_depth=5),
    "GradientBoostingClassifier": GradientBoostingClassifier(max_depth=5),
    "XGBClassifier": XGBClassifier(n_estimators=250, learning_rate=0.1, max_depth=5, objective='binary:logistic')
}

# Evaluate each model on the test set and collect results
test_results = []
for model_name, model in models.items():
    print(f"Training and evaluating model: {model_name}")
    model.fit(X_train_scaled, y_train)
    metrics = evaluate_metrics(model, X_test_scaled, y_test)
    
    # Store results for each metric
    for metric_name, metric_value in metrics.items():
        test_results.append({
            "Model": model_name,
            "Metric": metric_name,
            "Test Score": metric_value
        })

# Convert results to DataFrame and pivot
test_results_df = pd.DataFrame(test_results).pivot_table(index=["Metric", "Model"], values="Test Score")
print(test_results_df)


In [144]:
test_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test Score
Metric,Model,Unnamed: 2_level_1
f1,GradientBoostingClassifier,0.919389
f1,LogisticRegression,0.877792
f1,RandomForestClassifier,0.838101
f1,XGBClassifier,0.927448
precision,GradientBoostingClassifier,0.95245
precision,LogisticRegression,0.912332
precision,RandomForestClassifier,0.88851
precision,XGBClassifier,0.956293
recall,GradientBoostingClassifier,0.894885
recall,LogisticRegression,0.853353


In [152]:
xgb_model = XGBClassifier(**xgbclassifier_params)
xgb_model.fit(X_train_scaled, y_train)
final_test_scaled = pipe.fit_transform(df_final_test)
y_pred = xgb_model.predict(final_test_scaled)
y_pred_proba = xgb_model.predict_proba(final_test_scaled)[:, 1]

In [153]:
y_pred_proba

array([0.99384946, 0.99185854, 0.99479383, ..., 0.95641893, 0.9975528 ,
       0.9056701 ], dtype=float32)

In [157]:
original_test_df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [160]:
df_final_sub = pd.DataFrame({
    'id':original_test_df['id'],
    'y_pred':y_pred_proba
})

df_final_sub.shape

(39098, 2)

In [159]:
df_final_sub.to_csv('submission_xgbc.csv', index=False)
# I have got 0.60653 for this

In [162]:
df_final_sub_2 = pd.DataFrame({
    'id':original_test_df['id'],
    'y_pred':y_pred
})
df_final_sub_2.to_csv('submission_xgbc_y_pred.csv', index=False)
# I have got 0.50271