# Performing eda

## import required libraries 

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from typing import Tuple
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, log_loss, mean_squared_error
from sklearn.preprocessing import StandardScaler, minmax_scale, RobustScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%time
# Configure notebook display settings to only use 2 decimal places, tables look nicer.
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_columns', 15) 
pd.set_option('display.max_rows', 50)

SEED   = 578

CPU times: user 58 µs, sys: 21 µs, total: 79 µs
Wall time: 85.1 µs


## Read Data

In [3]:
%%time
# Create a function to read the Datasets...

def read_csv_to_dataframe(file_path):
    """
    Reads a CSV file from the given file path and returns a pandas DataFrame.

    :param file_path: str, the path to the CSV file.
    :return: pandas.DataFrame, the data from the CSV file.
    """
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Usage:
train_path = 'train.csv'
test_path = 'test.csv'

train_df = read_csv_to_dataframe(train_path)
test_df = read_csv_to_dataframe(test_path)

# Creating new columns to check if data exist
train_df['is_original'] = 0
test_df['is_original'] = 0



CPU times: user 86.6 ms, sys: 52.8 ms, total: 139 ms
Wall time: 157 ms


## Read original dataset

In [4]:
original_df = pd.read_csv('credit_risk_dataset.csv')
original_df['is_original'] = 1
original_df = original_df.dropna()
train_df = pd.concat([train_df, original_df])

## Performing compelete Analysis of the data

In [5]:
# Creating a function for analysis of data
%%time
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("----------------------")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Values:")
    print("----------------------")
    display(df.head(10).T)
    print("\n")

    print("DataFrame Description:")
    print("----------------------")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("----------------------")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("--------------------------")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("------------------------")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("----------------")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Usage
# Analysis of the training data
analyze_dataframe(train_df)

DataFrame Information:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 87283 entries, 0 to 32580
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  float64
 1   person_age                  87283 non-null  int64  
 2   person_income               87283 non-null  int64  
 3   person_home_ownership       87283 non-null  object 
 4   person_emp_length           87283 non-null  float64
 5   loan_intent                 87283 non-null  object 
 6   loan_grade                  87283 non-null  object 
 7   loan_amnt                   87283 non-null  int64  
 8   loan_int_rate               87283 non-null  float64
 9   loan_percent_income         87283 non-null  float64
 10  cb_person_default_on_file   87283 non-null  object 
 11  cb_person_cred_hist_length  87283 non-null  int64  
 12  loan_status                 87283 non-null  int

None



DataFrame Values:
----------------------


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
id,0.000,1.000,2.000,3.000,4.000,5.000,6.000,7.000,8.000,9.000
person_age,37,22,29,30,22,27,25,21,37,35
person_income,35000,56000,28800,70000,60000,45000,45000,20000,69600,110000
person_home_ownership,RENT,OWN,OWN,RENT,RENT,RENT,MORTGAGE,RENT,RENT,MORTGAGE
person_emp_length,0.000,6.000,8.000,14.000,2.000,2.000,9.000,0.000,11.000,0.000
loan_intent,EDUCATION,MEDICAL,PERSONAL,VENTURE,MEDICAL,VENTURE,EDUCATION,PERSONAL,EDUCATION,DEBTCONSOLIDATION
loan_grade,B,C,A,B,A,A,A,C,D,C
loan_amnt,6000,4000,6000,12000,6000,9000,12000,2500,5000,15000
loan_int_rate,11.490,13.350,8.900,11.110,6.920,8.940,6.540,13.490,14.840,12.980
loan_percent_income,0.170,0.070,0.210,0.170,0.100,0.200,0.270,0.130,0.070,0.140




DataFrame Description:
----------------------


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,58645.0,29322.0,16929.498,0.0,14661.0,29322.0,43983.0,58644.0
person_age,87283.0,27.609,6.126,20.0,23.0,26.0,30.0,144.0
person_income,87283.0,64900.296,47370.243,4000.0,40000.0,57000.0,78000.0,6000000.0
person_emp_length,87283.0,4.73,4.025,0.0,2.0,4.0,7.0,123.0
loan_amnt,87283.0,9361.574,5829.811,500.0,5000.0,8000.0,12000.0,35000.0
loan_int_rate,87283.0,10.797,3.105,5.42,7.88,10.99,13.11,23.22
loan_percent_income,87283.0,0.163,0.097,0.0,0.09,0.14,0.22,0.83
cb_person_cred_hist_length,87283.0,5.807,4.032,2.0,3.0,4.0,8.0,30.0
loan_status,87283.0,0.167,0.373,0.0,0.0,0.0,0.0,1.0
is_original,87283.0,0.328,0.47,0.0,0.0,0.0,1.0,1.0




Number of Null Values:
----------------------


id                            28638
person_age                        0
person_income                     0
person_home_ownership             0
person_emp_length                 0
loan_intent                       0
loan_grade                        0
loan_amnt                         0
loan_int_rate                     0
loan_percent_income               0
cb_person_default_on_file         0
cb_person_cred_hist_length        0
loan_status                       0
is_original                       0
dtype: int64



Number of Duplicated Rows:
--------------------------


137



Number of Unique Values:
------------------------


id                            58645
person_age                       57
person_income                  4088
person_home_ownership             4
person_emp_length                38
loan_intent                       6
loan_grade                        7
loan_amnt                       746
loan_int_rate                   388
loan_percent_income              78
cb_person_default_on_file         2
cb_person_cred_hist_length       29
loan_status                       2
is_original                       2
dtype: int64



DataFrame Shape:
----------------
Rows: 87283, Columns: 14
CPU times: user 215 ms, sys: 22.4 ms, total: 237 ms
Wall time: 336 ms


In [6]:
# Analysis of the original data
analyze_dataframe(original_df)

DataFrame Information:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 28638 entries, 0 to 32580
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28638 non-null  int64  
 1   person_income               28638 non-null  int64  
 2   person_home_ownership       28638 non-null  object 
 3   person_emp_length           28638 non-null  float64
 4   loan_intent                 28638 non-null  object 
 5   loan_grade                  28638 non-null  object 
 6   loan_amnt                   28638 non-null  int64  
 7   loan_int_rate               28638 non-null  float64
 8   loan_status                 28638 non-null  int64  
 9   loan_percent_income         28638 non-null  float64
 10  cb_person_default_on_file   28638 non-null  object 
 11  cb_person_cred_hist_length  28638 non-null  int64  
 12  is_original                 28638 non-null  int

None



DataFrame Values:
----------------------


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
person_age,22,21,25,23,24,21,26,24,24,21
person_income,59000,9600,9600,65500,54400,9900,77100,78956,83000,10000
person_home_ownership,RENT,OWN,MORTGAGE,RENT,RENT,OWN,RENT,RENT,RENT,OWN
person_emp_length,123.000,5.000,1.000,4.000,8.000,2.000,8.000,5.000,8.000,6.000
loan_intent,PERSONAL,EDUCATION,MEDICAL,MEDICAL,MEDICAL,VENTURE,EDUCATION,MEDICAL,PERSONAL,VENTURE
loan_grade,D,B,C,C,C,A,B,B,A,D
loan_amnt,35000,1000,5500,35000,35000,2500,35000,35000,35000,1600
loan_int_rate,16.020,11.140,12.870,15.230,14.270,7.140,12.420,11.110,8.900,14.740
loan_status,1,0,1,1,1,1,1,1,1,1
loan_percent_income,0.590,0.100,0.570,0.530,0.550,0.250,0.450,0.440,0.420,0.160




DataFrame Description:
----------------------


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,28638.0,27.727,6.31,20.0,23.0,26.0,30.0,144.0
person_income,28638.0,66649.372,62356.447,4000.0,39480.0,55956.0,80000.0,6000000.0
person_emp_length,28638.0,4.789,4.155,0.0,2.0,4.0,7.0,123.0
loan_amnt,28638.0,9656.493,6329.683,500.0,5000.0,8000.0,12500.0,35000.0
loan_int_rate,28638.0,11.04,3.229,5.42,7.9,10.99,13.48,23.22
loan_status,28638.0,0.217,0.412,0.0,0.0,0.0,0.0,1.0
loan_percent_income,28638.0,0.169,0.106,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,28638.0,5.794,4.038,2.0,3.0,4.0,8.0,30.0
is_original,28638.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0




Number of Null Values:
----------------------


person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
is_original                   0
dtype: int64



Number of Duplicated Rows:
--------------------------


137



Number of Unique Values:
------------------------


person_age                      57
person_income                 3835
person_home_ownership            4
person_emp_length               36
loan_intent                      6
loan_grade                       7
loan_amnt                      728
loan_int_rate                  348
loan_status                      2
loan_percent_income             77
cb_person_default_on_file        2
cb_person_cred_hist_length      29
is_original                      1
dtype: int64



DataFrame Shape:
----------------
Rows: 28638, Columns: 13


## Creating a Function for Counting outliers

In [7]:
def count_outliers(dataframe, features):
    """
    Identifies outliers in the specified features of a dataframe using both the IQR (Interquartile Range) method
    and the Z-score method. Adds two new columns to the dataframe to indicate the total number of outliers for each row.

    Parameters:
    dataframe (pd.DataFrame): The input dataframe containing the data.
    features (list of str): The list of feature names for which to identify outliers.

    Returns:
    pd.DataFrame: The dataframe with two new columns: 'total_outliers_IQR' and 'total_outliers_Z', indicating the number of outliers
    
    for each row as identified by the IQR and Z-score methods, respectively.
    """
    # Create new columns to store the total number of outliers for each row
    dataframe['total_outliers_IQR'] = 0
    dataframe['total_outliers_Z'] = 0
    
    for feature in features:
        # Calculate Q1 (25th percentile) and Q3 (75th percentile) for the feature
        Q1 = dataframe[feature].quantile(0.25)
        Q3 = dataframe[feature].quantile(0.75)
        
        # Calculate the Interquartile Range (IQR)
        IQR = Q3 - Q1
        
        # Define the lower and upper bounds for outliers (IQR method)
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Identify outliers for the feature using IQR method
        outliers_IQR = (dataframe[feature] < lower_bound) | (dataframe[feature] > upper_bound)
        
        # Add the outliers to the total count for each row (IQR method)
        dataframe['total_outliers_IQR'] += outliers_IQR.astype(int)
        
        # Calculate the Z-score for the feature
        z_scores = stats.zscore(dataframe[feature])
        
        # Identify outliers for the feature using Z-score method (threshold of 3)
        outliers_Z = np.abs(z_scores) > 3
        
        # Add the outliers to the total count for each row (Z-score method)
        dataframe['total_outliers_Z'] += outliers_Z.astype(int)
        
    return dataframe


# Count outliers for specified features
features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
train_df = count_outliers(train_df, features)
test_df = count_outliers(test_df, features)
train_df.sample(20)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,...,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,is_original,total_outliers_IQR,total_outliers_Z
45226,45226.0,25,75000,MORTGAGE,9.0,PERSONAL,C,...,0.13,Y,4,0,0,0,0
53156,53156.0,25,60000,RENT,0.0,EDUCATION,A,...,0.22,N,3,0,0,0,0
47838,47838.0,26,30000,RENT,0.0,PERSONAL,A,...,0.27,N,3,0,0,0,0
14254,14254.0,23,65000,MORTGAGE,5.0,VENTURE,B,...,0.11,N,4,0,0,0,0
558,,23,80000,RENT,3.0,MEDICAL,B,...,0.26,N,3,0,1,0,0
15103,,25,121680,MORTGAGE,3.0,HOMEIMPROVEMENT,B,...,0.12,N,3,0,1,0,0
53165,53165.0,25,36000,OWN,0.0,VENTURE,C,...,0.14,Y,3,0,0,0,0
31198,31198.0,26,45600,RENT,5.0,MEDICAL,A,...,0.15,N,3,0,0,0,0
9354,,26,61200,OWN,4.0,HOMEIMPROVEMENT,C,...,0.02,Y,2,0,1,0,0
32089,,42,48000,OWN,0.0,EDUCATION,C,...,0.07,Y,17,0,1,2,0


# Creating a Function for Encoding Categorial data

In [8]:
def encode_categorical_variables(train, test):
    """
    This function performs label encoding on categorical variables in the train and test datasets.
    It identifies categorical variables, encodes them in the train dataset, and applies the same
    encoding to the test dataset.

    param train: Pandas DataFrame for training data
    param test: Pandas DataFrame for testing data
    return: Tuple of DataFrames with encoded variables (encoded_train, encoded_test)
    
    """
    
    # Identifying categorical variables in the train dataset
    categorical_columns = train.select_dtypes(include = ['object', 'category']).columns
    print(categorical_columns)

    # Creating a copy of the datasets to avoid modifying the original ones
    encoded_train = train.copy(deep = True)
    encoded_train['is_train'] = 1
    encoded_test = test.copy(deep = True)
    encoded_test['is_train'] = 0
    encoded_tmp = pd.concat([encoded_train, encoded_test])

    # Initializing and applying LabelEncoder to each categorical column
    for column in categorical_columns:
        le = LabelEncoder()
        # Fit on the train data and transform both train and test data
        le.fit(encoded_tmp[column])
        # Transform test data using the same encoder
        encoded_tmp[column] = le.transform(encoded_tmp[column])
        
        encoded_train_df = encoded_tmp[encoded_tmp['is_train'] == 1]
        encoded_test_df = encoded_tmp[encoded_tmp['is_train'] == 0]
        
        encoded_train_df.drop(columns = ['is_train'], inplace = True)
        encoded_test_df.drop(columns = ['is_train'], inplace = True)

    return encoded_train_df, encoded_test_df

# Example usage
# Assuming you have train_df and test_df as your train and test DataFrames
train_df, test_df = encode_categorical_variables(train_df, test_df)

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')


In [9]:
%%time
avoid_this_features = ['id', 'loan_status']
features = [feat for feat in train_df.columns if feat not in avoid_this_features]

CPU times: user 27 µs, sys: 8 µs, total: 35 µs
Wall time: 40.8 µs


## Creating a function for Fitting the data on XGboost Classifier

In [10]:
%%time
def fit_xgboost_with_kfold(df, features, target_variable, parameters, n_splits=10,  random_state=SEED):
    """
    Fit an XGBoost Classifier to a pandas DataFrame with k-fold cross-validation.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    target_variable (str): The name of the target variable column in the DataFrame.
    n_splits (int): The number of folds in the cross-validation (default: 5).
    random_state (int): A random seed for reproducible results (default: 42).

    Returns:
    xgboost.XGBClassifier: A trained XGBoost Classifier model.
    """
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
 
    model = xgb.XGBClassifier(**parameters)

    fold_rocs = []
    fold_loglosses = []
    fold_predictions = []
    fold = 1

    for train_index, test_index in kfold.split(X[features], y):
        print(f'Training Fold: {fold} ...')
        X_train, X_test = X[features].iloc[train_index], X[features].iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train,
                  y_train,
                  eval_set = [(X_test, y_test)], 
                  verbose = 512,)
        
        #best_iteration = model.get_booster().best_ntree_limit
        #y_pred = model.predict(X_test, ntree_limit=best_iteration)
        #y_pred_proba = model.predict_proba(X_test, ntree_limit=best_iteration)[:,1]
        
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:,1]
        
        fold_logloss = log_loss(y_test, y_pred_proba)
        fold_roc = roc_auc_score(y_test, y_pred_proba)
        fold_rocs.append(fold_roc)
        fold_loglosses.append(fold_logloss)
        fold += 1
        
        test_pred = model.predict_proba(test_df[features])[:,1]
        fold_predictions.append(test_pred)
        
        print('....', '\n')

    predictions = np.mean(fold_predictions, axis=0)

    print("Fold Accuracies:", fold_rocs)
    print("Fold Log Losses:", fold_loglosses)
    print("Mean AUC:", sum(fold_rocs) / len(fold_rocs))
    print("Mean Log Loss:", sum(fold_loglosses) / len(fold_loglosses))

    return model, predictions 

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 11.4 µs


## Creating a dictionary for all the parameters for Training

In [11]:
params = {'n_estimators'          : 4096,
          'max_depth'             : 10,
          'learning_rate'         : 0.025,
          'booster'               : 'gbtree',
          'subsample'             : 0.95,
          'colsample_bytree'      : 0.45,
          'reg_lambda'            : 1.50,
          'reg_alpha'             : 1.0,
          'gamma'                 : 1.00,
          'random_state'          : SEED,
          'objective'             : 'binary:logistic',
          #'tree_method'           : 'gpu_hist',
          'eval_metric'           : 'auc',
          'early_stopping_rounds' : 256,
          'n_jobs'                : -1,
         }

# Training XGBoost model on the data

In [12]:

xgboost_model, xgboost_predictions = fit_xgboost_with_kfold(train_df, 
                                                            features, 
                                                            target_variable='loan_status',
                                                            parameters = params, 
                                                            random_state=SEED, 
                                                            n_splits = 10)

Training Fold: 1 ...
[0]	validation_0-auc:0.82264
[512]	validation_0-auc:0.95616
[1024]	validation_0-auc:0.95711
[1254]	validation_0-auc:0.95712
.... 

Training Fold: 2 ...
[0]	validation_0-auc:0.81625
[512]	validation_0-auc:0.95238
[1024]	validation_0-auc:0.95333
[1536]	validation_0-auc:0.95353
[2048]	validation_0-auc:0.95366
[2560]	validation_0-auc:0.95369
[2666]	validation_0-auc:0.95368
.... 

Training Fold: 3 ...
[0]	validation_0-auc:0.81278
[512]	validation_0-auc:0.95626
[1024]	validation_0-auc:0.95705
[1536]	validation_0-auc:0.95715
[1975]	validation_0-auc:0.95717
.... 

Training Fold: 4 ...
[0]	validation_0-auc:0.81803
[512]	validation_0-auc:0.95267
[1024]	validation_0-auc:0.95351
[1536]	validation_0-auc:0.95362
[1833]	validation_0-auc:0.95367
.... 

Training Fold: 5 ...
[0]	validation_0-auc:0.81635
[512]	validation_0-auc:0.95650
[1024]	validation_0-auc:0.95710
[1536]	validation_0-auc:0.95728
[1987]	validation_0-auc:0.95728
.... 

Training Fold: 6 ...
[0]	validation_0-auc:0.8283

# Creating a Function of All the models

In [14]:
def blended_predictions(train, test, features):
    # Initialize the classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "hist_gbm" : HistGradientBoostingClassifier (max_iter=300, learning_rate=0.001,  max_leaf_nodes=80),
        "CatBoost": CatBoostClassifier(silent=True),
        "LGBM": LGBMClassifier(),
        "XGBoost": XGBClassifier()
    }
    
    test_preds = []
    
    for name, clf in classifiers.items():
        # Cross-validation predictions on training set
        cross_val_pred = cross_val_predict(clf, train[features], train['loan_status'], cv=10, method='predict_proba')[:, 1]
        
        # Fit the classifier to the entire training set
        clf.fit(train[features], train['loan_status'])
        
        # Predict on the test set
        test_pred = clf.predict_proba(test[features])[:, 1]
        test_preds.append(test_pred)
        
        print(f"{name} done!")
    
    # Average the predictions from all classifiers
    blended_pred = np.mean(test_preds, axis=0)
    
    return blended_pred

# Training data with all the models

In [15]:
blend_predictions = blended_predictions(train_df, test_df, features)

Logistic Regression done!
hist_gbm done!
CatBoost done!
[LightGBM] [Info] Number of positive: 13097, number of negative: 65457
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 904
[LightGBM] [Info] Number of data points in the train set: 78554, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166726 -> initscore=-1.609010
[LightGBM] [Info] Start training from score -1.609010
[LightGBM] [Info] Number of positive: 13097, number of negative: 65457
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 902
[LightGBM] [Info] Number of data points in the train set: