In [1]:
import pandas as pd

dataset = pd.read_csv("Loan_default.csv")
print(dataset.columns)

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')


In [2]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# --------------------------------------------------
# 1. Load dataset
# --------------------------------------------------
file_name = 'Loan_default.csv'
if not os.path.exists(file_name):
    raise FileNotFoundError(f"{file_name} not found. Please place the file in the current directory.")

dataset = pd.read_csv(file_name)

print("Dataset loaded successfully!")
print("\nFirst 5 rows:")
print(dataset.head())
print("\nColumn names in the dataset:")
print(dataset.columns.tolist())

# --------------------------------------------------
# 2. Define features and target
# --------------------------------------------------
# Drop identifier column
if 'LoanID' in dataset.columns:
    dataset = dataset.drop(columns=['LoanID'])

# Target column
target_col = 'Default'
if target_col not in dataset.columns:
    raise KeyError(f"Target column '{target_col}' not found.")

# Separate features and target
X = dataset.drop(columns=[target_col])
y = dataset[target_col]

print(f"\nFeatures shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# --------------------------------------------------
# 3. Train/test split (stratified)
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------------------------------
# 4. Preprocessing pipeline
# --------------------------------------------------
# Numeric transformer: scaling
numeric_transformer = StandardScaler()

# Categorical transformer: one-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit preprocessor on training data and transform both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Save the preprocessor (includes both scaler and encoder)
with open("preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)
print("\nPreprocessor (scaler + encoder) saved as preprocessor.pkl")

# --------------------------------------------------
# 5. Build ANN model
# --------------------------------------------------
input_dim = X_train_processed.shape[1]
model = Sequential()
model.add(Dense(units=8, activation='relu', input_dim=input_dim))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# --------------------------------------------------
# 6. Handle class imbalance
# --------------------------------------------------
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, weights))
print("\nClass weights:", class_weight_dict)

# --------------------------------------------------
# 7. Train with early stopping
# --------------------------------------------------
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train_processed, y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
    verbose=1
)

# --------------------------------------------------
# 8. Evaluate on test set
# --------------------------------------------------
y_pred_prob = model.predict(X_test_processed)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Test Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --------------------------------------------------
# 9. Save the trained model
# --------------------------------------------------
# model.save("ann_model.h5")
# print("Model saved as ann_model.h5")

Dataset loaded successfully!

First 5 rows:
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes      

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.6715 - loss: 0.6085 - val_accuracy: 0.7049 - val_loss: 0.5675
Epoch 2/50
[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.6845 - loss: 0.5906 - val_accuracy: 0.7058 - val_loss: 0.5629
Epoch 3/50
[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.6859 - loss: 0.5892 - val_accuracy: 0.6887 - val_loss: 0.5891
Epoch 4/50
[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.6857 - loss: 0.5887 - val_accuracy: 0.6896 - val_loss: 0.5871
Epoch 5/50
[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.6881 - loss: 0.5886 - val_accuracy: 0.6627 - val_loss: 0.6210
Epoch 6/50
[1m5107/5107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.6889 - loss: 0.5883 - val_accuracy: 0.6889 - val_loss: 0.5848
Epoch 7/50
[1m5107/5107[0

In [5]:
model.save("ann_model.h5")
print("Model saved as ann_model.h5")



Model saved as ann_model.h5


In [4]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model

# --------------------------------------------------
# 1. Load the saved preprocessor and model
# --------------------------------------------------
def load_artifacts(preprocessor_path='preprocessor.pkl', model_path='ann_model.h5'):
    """Load the preprocessor (scaler + encoder) and the trained Keras model."""
    with open(preprocessor_path, 'rb') as f:
        preprocessor = pickle.load(f)
    model = load_model(model_path)
    return preprocessor, model

# --------------------------------------------------
# 2. Predict function for a single sample or DataFrame
# --------------------------------------------------
def predict_loan_default(input_data, preprocessor, model, return_prob=False):
    """
    Predict loan default for new input data.
    
    Parameters:
    - input_data: pandas DataFrame or dict (will be converted to DataFrame)
    - preprocessor: fitted ColumnTransformer
    - model: trained Keras model
    - return_prob: if True, return probability scores; else return binary class (0/1)
    
    Returns:
    - predictions: numpy array of predictions
    """
    # Ensure input is a DataFrame
    if isinstance(input_data, dict):
        input_data = pd.DataFrame([input_data])
    elif not isinstance(input_data, pd.DataFrame):
        raise ValueError("input_data must be a pandas DataFrame or a dict")
    
    # Apply the same preprocessing (scaling + one-hot encoding)
    X_processed = preprocessor.transform(input_data)
    
    # Predict
    pred_prob = model.predict(X_processed).flatten()
    
    if return_prob:
        return pred_prob
    else:
        return (pred_prob > 0.5).astype(int)

# --------------------------------------------------
# 3. Example usage
# --------------------------------------------------
if __name__ == "__main__":
    # Load artifacts
    preprocessor, model = load_artifacts()
    
    # Example 1: Predict on a single new applicant (as a dict)
    new_applicant = {
        'Age': 50,
        'Income': 34641,
        'LoanAmount': 108855,
        'CreditScore': 347,
        'MonthsEmployed': 17,
        'NumCreditLines': 4,
        'InterestRate': 11.77,
        'LoanTerm': 24,
        'DTIRatio': 0.47,
        'Education': "PhD",
        'EmploymentType': 'Unemployed',
        'MaritalStatus': 'Divorced',
        'HasMortgage': 'Yes',
        'HasDependents': 'No',
        'LoanPurpose': 'Business',
        'HasCoSigner': 'No'
    }
    # Q8OXUYF0TI,50,34641,108855,347,17,4,11.77,24,0.47,PhD,Unemployed,Divorced,Yes,No,Business,No,1

    pred_class = predict_loan_default(new_applicant, preprocessor, model, return_prob=False)
    pred_prob = predict_loan_default(new_applicant, preprocessor, model, return_prob=True)
    
    print(f"Predicted class: {pred_class[0]}")
    print(f"Probability of default: {pred_prob[0]:.4f}")
    
    # Example 2: Predict on a batch from a CSV file (e.g., new_applicants.csv)
    # new_data = pd.read_csv('new_applicants.csv')
    # predictions = predict_loan_default(new_data, preprocessor, model)
    # print(predictions)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted class: 1
Probability of default: 0.5755
