In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Downloads/loan_default_prediction.csv')

# Display the first few rows of the dataset
df.head()


Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Employment Duration,...,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,56492997,17120,10365,16025.08269,59,BAT2575549,12.163926,A,D1,RENT,...,8.425776,0.731797,0,INDIVIDUAL,135,0,24,475442,4364,
1,22540813,7133,11650,12615.7956,59,BAT2833642,6.564296,B,E3,MORTGAGE,...,6.157008,0.992918,0,INDIVIDUAL,56,0,1,72412,2573,
2,9862181,25291,25825,11621.28083,59,BAT1761981,14.7299,A,C3,MORTGAGE,...,5.705077,0.28158,0,INDIVIDUAL,3,0,26,284825,19676,
3,10097822,30781,9664,15375.82351,59,BAT5341619,10.523767,A,A2,RENT,...,2.469688,0.959162,0,INDIVIDUAL,21,0,32,40842,7226,
4,47771809,8878,9419,7176.647582,58,BAT4694572,9.997013,C,B3,OWN,...,2.127835,0.402315,0,INDIVIDUAL,104,0,33,90825,26145,


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Check the shape of the data
print(df.shape)

# Check for missing values
print(df.isnull().sum())

# Summary statistics
print(df.describe())

# # Check the distribution of the target variable
# sns.countplot(x='default', data=df)
# plt.show()

# Correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()


(28913, 35)
ID                                  0
Loan Amount                         0
Funded Amount                       0
Funded Amount Investor              0
Term                                0
Batch Enrolled                      0
Interest Rate                       0
Grade                               0
Sub Grade                           0
Employment Duration                 0
Home Ownership                      0
Verification Status                 0
Payment Plan                        0
Loan Title                          0
Debit to Income                     0
Delinquency - two years             0
Inquires - six months               0
Open Account                        0
Public Record                       0
Revolving Balance                   0
Revolving Utilities                 0
Total Accounts                      0
Initial List Status                 0
Total Received Interest             0
Total Received Late Fee             0
Recoveries                          0


ValueError: could not convert string to float: 'BAT2575549'

<Figure size 1200x800 with 0 Axes>

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Handle missing values (example: fill with mean for numeric columns)
df.fillna(df.mean(), inplace=True)

# Encode categorical variables (example: using LabelEncoder)
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

# Separate features and target variable
X = df.drop('default', axis=1)
y = df['default']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Select the top 10 features
selector = SelectKBest(chi2, k=10)
X_selected = selector.fit_transform(X_scaled, y)

# Get the selected feature names
selected_features = X.columns[selector.get_support()]
print(selected_features)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Initialize the models
models = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVM': SVC()
}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train and evaluate the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'--- {name} ---')
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification Report:\n', classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Evaluate models using ROC-AUC
for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f'{name} ROC-AUC: {auc}')
    
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
# Example using Flask
from flask import Flask, request, jsonify
import joblib

app = Flask(__name__)

# Load the trained model
model = joblib.load('best_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    prediction = model.predict([data['features']])
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    app.run(debug=True)
