### Business Understanding
Project Idea: Develop a model to predict whether a loan will be paid back or defaulted using the LendingClub dataset.
Design a system to predict whether borrowers will repay their loans or not, based on their information and the loan details.
Construct a classification system to estimate the probability of loan default based on borrower and loan details.
#### objectives
1. Data Collection
2. Data Preprocessing
3. Exploratory Data Analysis (EDA)
4. Feature Engineering
5. Modeling
6. Evaluation
7. Conclusion

### Data Understanding
LendingClub dataset: Available on Kaggle.

### Load data

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score


# Load the dataset
data = pd.read_csv('data/loans_full_schema.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,emp_title,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,...,sub_grade,issue_month,loan_status,initial_listing_status,disbursement_method,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,1,global config engineer,3.0,NJ,MORTGAGE,90000.0,Verified,18.01,,,...,C3,Mar-2018,Current,whole,Cash,27015.86,1999.33,984.14,1015.19,0.0
1,2,warehouse office clerk,10.0,HI,RENT,40000.0,Not Verified,5.04,,,...,C1,Feb-2018,Current,whole,Cash,4651.37,499.12,348.63,150.49,0.0
2,3,assembly,3.0,WI,RENT,40000.0,Source Verified,21.15,,,...,D1,Feb-2018,Current,fractional,Cash,1824.63,281.8,175.37,106.43,0.0
3,4,customer service,1.0,PA,RENT,30000.0,Not Verified,10.16,,,...,A3,Jan-2018,Current,whole,Cash,18853.26,3312.89,2746.74,566.15,0.0
4,5,security supervisor,10.0,CA,RENT,35000.0,Verified,57.96,57000.0,Verified,...,C3,Mar-2018,Current,whole,Cash,21430.15,2324.65,1569.85,754.8,0.0


### Preprocess the data

In [9]:
# Handling missing values
data = data.dropna()

# Encoding categorical variables
data = pd.get_dummies(data, drop_first=True)

# Check column names again to see the changes
print(data.columns)

# Normalizing numerical features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Convert scaled data back to DataFrame
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)

# Display the first few rows after scaling
print(data_scaled.head())


Index(['Unnamed: 0', 'emp_length', 'annual_income', 'debt_to_income',
       'annual_income_joint', 'debt_to_income_joint', 'delinq_2y',
       'months_since_last_delinq', 'earliest_credit_line',
       'inquiries_last_12m',
       ...
       'sub_grade_F1', 'sub_grade_F2', 'sub_grade_F5', 'issue_month_Jan-2018',
       'issue_month_Mar-2018', 'loan_status_Fully Paid',
       'loan_status_In Grace Period', 'loan_status_Late (16-30 days)',
       'loan_status_Late (31-120 days)', 'initial_listing_status_whole'],
      dtype='object', length=304)
   Unnamed: 0  emp_length  annual_income  debt_to_income  annual_income_joint  \
0   -1.918572    0.966158       0.044447       -0.357741            -0.489263   
1   -1.896312    0.966158      -1.018184        0.484122            -0.600975   
2   -1.885183    0.966158       1.740136       -0.419872             0.308681   
3   -1.855025   -0.397590       0.270539       -0.323311            -0.170085   
4   -1.805481    0.966158       1.174907    

In [10]:
# Display the column names
print(data.columns)


Index(['Unnamed: 0', 'emp_length', 'annual_income', 'debt_to_income',
       'annual_income_joint', 'debt_to_income_joint', 'delinq_2y',
       'months_since_last_delinq', 'earliest_credit_line',
       'inquiries_last_12m',
       ...
       'sub_grade_F1', 'sub_grade_F2', 'sub_grade_F5', 'issue_month_Jan-2018',
       'issue_month_Mar-2018', 'loan_status_Fully Paid',
       'loan_status_In Grace Period', 'loan_status_Late (16-30 days)',
       'loan_status_Late (31-120 days)', 'initial_listing_status_whole'],
      dtype='object', length=304)


### EDA

In [11]:
# Exploratory Data Analysis (EDA)
# Use one of the new dummy columns created by pd.get_dummies as the target
target = 'loan_status_Fully Paid'  # or any other relevant status

# Distribution of target variable
sns.countplot(data[target])
plt.show()

# Correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()


KeyError: 0

### Feature Engineering

In [None]:
# Feature selection based on correlation
correlation = data.corr()
target_corr = correlation['loan_status'].abs().sort_values(ascending=False)
important_features = target_corr[target_corr > 0.1].index

data = data[important_features]


### Modeling

In [None]:
# Split data
X = data.drop('loan_status', axis=1)
y = data['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

# XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)


### Evaluate model performance

In [None]:
# Logistic Regression
y_pred_log_reg = log_reg.predict(X_test)
print('Logistic Regression')
print(classification_report(y_test, y_pred_log_reg))
print('AUC-ROC:', roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1]))

# Random Forest
y_pred_rf = rf.predict(X_test)
print('Random Forest')
print(classification_report(y_test, y_pred_rf))
print('AUC-ROC:', roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# Gradient Boosting
y_pred_gb = gb.predict(X_test)
print('Gradient Boosting')
print(classification_report(y_test, y_pred_gb))
print('AUC-ROC:', roc_auc_score(y_test, gb.predict_proba(X_test)[:, 1]))

# XGBoost
y_pred_xgb = xgb_model.predict(X_test)
print('XGBoost')
print(classification_report(y_test, y_pred_xgb))
print('AUC-ROC:', roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1]))


### Report findings