In [None]:
#Random Forest Classifier for Loan Status Prediction

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Custom Label Encoder Class Definition
class SafeLabelEncoder:
    def __init__(self):
        self.le = LabelEncoder()
        self.classes_ = None

    def fit_transform(self, X):
        self.le.fit(X)
        self.classes_ = set(self.le.classes_)
        return self.le.transform(X)

    def transform(self, X):
        unseen = [x for x in X if x not in self.classes_]
        return np.array([self.le.transform([x])[0] if x not in unseen else -1 for x in X])

# Load data
data = pd.read_csv('loan_data_set_Train.csv')  # Replace with the correct path to your data

# Preprocessing
# Handling missing data
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encoding categorical variables using SafeLabelEncoder
label_encoders = {}
for column in data_filled.columns:
    if data_filled[column].dtype == 'object' and column not in ['Loan_ID', 'Loan_Status']:
        sle = SafeLabelEncoder()
        data_filled[column] = sle.fit_transform(data_filled[column])
        label_encoders[column] = sle

# Splitting data into features and target
X = data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = data_filled['Loan_Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting the Loan Status for a new candidate
# Assuming you have new candidate data in 'new_data.csv'
new_data = pd.read_csv('testdata.csv')
new_data_filled = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)
for column in new_data_filled.columns:
    if column in label_encoders:
        new_data_filled[column] = label_encoders[column].transform(new_data_filled[column])

# Ensure that new_data_filled does not include 'Loan_ID' or 'Loan_Status' in the feature set
new_X = new_data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1, errors='ignore')

# Prediction
predicted_statuses = rf_classifier.predict(new_X)

# Combine Loan_IDs with their predicted Loan_Status
predictions = pd.DataFrame({
    'Loan_ID': new_data_filled['Loan_ID'],
    'Predicted_Loan_Status': predicted_statuses
})

print(predictions)


     Loan_ID Predicted_Loan_Status
0   LP001002                     N
1   LP001003                     Y
2   LP001005                     Y
3   LP001006                     Y
4   LP001008                     Y
5   LP001011                     Y
6   LP001013                     Y
7   LP001014                     N
8   LP001018                     N
9   LP001020                     Y
10  LP001024                     Y
11  LP001027                     Y
12  LP001028                     N
13  LP001029                     N
14  LP001030                     Y
15  LP001032                     N
16  LP001034                     Y
17  LP001036                     N
18  LP001038                     N


In [3]:
#Catboost Classifier for Loan Status Prediction

# ===================== 📦 IMPORT REQUIRED LIBRARIES ===================== #
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# ===================== 🛠️ SAFE LABEL ENCODER ===================== #
class SafeLabelEncoder:
    def __init__(self):
        self.le = LabelEncoder()
        self.classes_ = None

    def fit_transform(self, X):
        self.le.fit(X)
        self.classes_ = set(self.le.classes_)
        return self.le.transform(X)

    def transform(self, X):
        unseen = [x for x in X if x not in self.classes_]
        return np.array([self.le.transform([x])[0] if x not in unseen else -1 for x in X])

# ===================== 📥 LOAD TRAINING DATA ===================== #
data = pd.read_csv('loan_data_set_Train.csv')  # Update path if needed

# ===================== 🔧 IMPUTE MISSING VALUES ===================== #
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# ===================== 🔁 ENCODE CATEGORICAL FEATURES ===================== #
label_encoders = {}
for column in data_filled.columns:
    if data_filled[column].dtype == 'object' and column not in ['Loan_ID', 'Loan_Status']:
        sle = SafeLabelEncoder()
        data_filled[column] = sle.fit_transform(data_filled[column])
        label_encoders[column] = sle

# ===================== 🧾 SPLIT FEATURES & TARGET ===================== #
X = data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = data_filled['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===================== 🧠 TRAIN CATBOOST CLASSIFIER ===================== #
cb_classifier = CatBoostClassifier(verbose=0, random_state=42)
cb_classifier.fit(X_train, y_train)

# ===================== 🧪 PREDICT NEW DATA ===================== #
new_data = pd.read_csv('testdata.csv')
new_data_filled = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)

# Apply label encoders
for column in new_data_filled.columns:
    if column in label_encoders:
        new_data_filled[column] = label_encoders[column].transform(new_data_filled[column])

# Remove unused columns
new_X = new_data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1, errors='ignore')

# Prediction
predicted_statuses = cb_classifier.predict(new_X)

# Output Results
predictions = pd.DataFrame({
    'Loan_ID': new_data_filled['Loan_ID'],
    'Predicted_Loan_Status': predicted_statuses
})

print(predictions)

     Loan_ID Predicted_Loan_Status
0   LP001002                     Y
1   LP001003                     Y
2   LP001005                     Y
3   LP001006                     Y
4   LP001008                     Y
5   LP001011                     Y
6   LP001013                     Y
7   LP001014                     N
8   LP001018                     N
9   LP001020                     Y
10  LP001024                     Y
11  LP001027                     Y
12  LP001028                     N
13  LP001029                     N
14  LP001030                     Y
15  LP001032                     Y
16  LP001034                     Y
17  LP001036                     N
18  LP001038                     N


In [4]:
#LightGBM Classifier for Loan Status Prediction (Testing)
# ===================== 📦 IMPORT REQUIRED LIBRARIES ===================== #
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# ===================== 🛠️ SAFE LABEL ENCODER ===================== #
class SafeLabelEncoder:
    def __init__(self):
        self.le = LabelEncoder()
        self.classes_ = None

    def fit_transform(self, X):
        self.le.fit(X)
        self.classes_ = set(self.le.classes_)
        return self.le.transform(X)

    def transform(self, X):
        unseen = [x for x in X if x not in self.classes_]
        return np.array([self.le.transform([x])[0] if x not in unseen else -1 for x in X])

# ===================== 📥 LOAD TRAINING DATA ===================== #
data = pd.read_csv('loan_data_set_Train.csv')

# ===================== 🔧 IMPUTE MISSING VALUES ===================== #
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# ===================== 🔁 ENCODE CATEGORICAL FEATURES ===================== #
label_encoders = {}
for column in data_filled.columns:
    if data_filled[column].dtype == 'object' and column not in ['Loan_ID', 'Loan_Status']:
        sle = SafeLabelEncoder()
        data_filled[column] = sle.fit_transform(data_filled[column])
        label_encoders[column] = sle

# ===================== 🧾 SPLIT FEATURES & TARGET ===================== #
X = data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = data_filled['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===================== 🧠 TRAIN LIGHTGBM CLASSIFIER ===================== #
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

# ===================== 🧪 PREDICT NEW DATA ===================== #
new_data = pd.read_csv('testdata.csv')
new_data_filled = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)

for column in new_data_filled.columns:
    if column in label_encoders:
        new_data_filled[column] = label_encoders[column].transform(new_data_filled[column])

new_X = new_data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1, errors='ignore')

predicted_statuses = lgb_model.predict(new_X)

predictions = pd.DataFrame({
    'Loan_ID': new_data_filled['Loan_ID'],
    'Predicted_Loan_Status': predicted_statuses
})

print(predictions)


[LightGBM] [Info] Number of positive: 325, number of negative: 151
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 476, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.682773 -> initscore=0.766545
[LightGBM] [Info] Start training from score 0.766545
     Loan_ID Predicted_Loan_Status
0   LP001002                     N
1   LP001003                     N
2   LP001005                     Y
3   LP001006                     N
4   LP001008                     Y
5   LP001011                     Y
6   LP001013                     Y
7   LP001014                     N
8   LP001018                     N
9   LP001020                     Y
10  LP001024                     Y
11  LP001027                   

In [8]:
# XGBoost Classifier for Loan Status Prediction (Testing)
# ===================== 📦 IMPORT REQUIRED LIBRARIES ===================== #
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# ===================== 🛡️ CUSTOM SAFE LABEL ENCODER ===================== #
class SafeLabelEncoder:
    def __init__(self):
        self.le = LabelEncoder()
        self.classes_ = None

    def fit_transform(self, X):
        self.le.fit(X)
        self.classes_ = set(self.le.classes_)
        return self.le.transform(X)

    def transform(self, X):
        return np.array([self.le.transform([x])[0] if x in self.classes_ else -1 for x in X])

# ===================== 📥 LOAD & PREPROCESS DATA ===================== #
data = pd.read_csv('loan_data_set_Train.csv')  # Replace with actual path

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encode categorical features (excluding Loan_ID and Loan_Status)
label_encoders = {}
for column in data_filled.columns:
    if data_filled[column].dtype == 'object' and column not in ['Loan_ID', 'Loan_Status']:
        sle = SafeLabelEncoder()
        data_filled[column] = sle.fit_transform(data_filled[column])
        label_encoders[column] = sle

# Encode target variable (Loan_Status: Y/N → 1/0)
target_encoder = LabelEncoder()
data_filled['Loan_Status'] = target_encoder.fit_transform(data_filled['Loan_Status'])

# ===================== 🧾 SPLIT DATA ===================== #
X = data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = data_filled['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===================== 🧠 TRAIN XGBOOST CLASSIFIER ===================== #
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# ===================== 🧪 PREDICT ON NEW TEST DATA ===================== #
new_data = pd.read_csv('testdata.csv')  # Replace with actual path

# Impute missing values using same imputer
new_data_filled = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)

# Encode categorical features in new data
for column in new_data_filled.columns:
    if column in label_encoders:
        new_data_filled[column] = label_encoders[column].transform(new_data_filled[column])

# Prepare features for prediction
new_X = new_data_filled.drop(['Loan_ID', 'Loan_Status'], axis=1, errors='ignore')

# Predict and decode labels back to Y/N
predicted_numeric = xgb_model.predict(new_X)
predicted_statuses = target_encoder.inverse_transform(predicted_numeric)

# Combine Loan_IDs with their predicted Loan_Status
predictions = pd.DataFrame({
    'Loan_ID': new_data_filled['Loan_ID'],
    'Predicted_Loan_Status': predicted_statuses
})

# ===================== ✅ DISPLAY RESULTS ===================== #
print(predictions)


     Loan_ID Predicted_Loan_Status
0   LP001002                     N
1   LP001003                     N
2   LP001005                     Y
3   LP001006                     Y
4   LP001008                     Y
5   LP001011                     Y
6   LP001013                     Y
7   LP001014                     N
8   LP001018                     N
9   LP001020                     Y
10  LP001024                     Y
11  LP001027                     Y
12  LP001028                     N
13  LP001029                     N
14  LP001030                     Y
15  LP001032                     N
16  LP001034                     N
17  LP001036                     N
18  LP001038                     N
