In [None]:
# LoanTap Credit Underwriting Analysis

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data (path is resolved against the current working directory)
df = pd.read_csv('../data/LoanTapData.csv')

# Exploratory Data Analysis
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare: wrapping it in print() would emit a stray "None".
df.info()
print(df.describe())

# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Univariate Analysis: histogram of the loan amount column
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['loan_amnt'], ax=ax)
ax.set_title('Distribution of Loan Amount')
plt.show()

# Bivariate Analysis: loan amount spread within each grade
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='grade', y='loan_amnt', ax=ax)
ax.set_title('Loan Amount by Grade')
plt.show()

# Correlation Heatmap
# Correlation is only defined for numeric data. The frame also holds text
# columns (e.g. 'grade', 'loan_status'), which make DataFrame.corr() raise on
# pandas >= 2.0, so the matrix is computed from the numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Data Preprocessing
# Cast the label-like columns to pandas' categorical dtype in a single astype call.
categorical_cols = ['term', 'grade', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'purpose', 'title', 'initial_list_status', 'application_type']
df = df.astype({name: 'category' for name in categorical_cols})

# Feature Engineering
# One binary indicator per count column: 1 when the count is strictly above 1, else 0.
for count_col in ('pub_rec', 'mort_acc', 'pub_rec_bankruptcies'):
    df[f'{count_col}_flag'] = df[count_col].gt(1).astype(int)

# Handle missing values
# Median imputation is only defined for numeric data, and the frame also holds
# text/categorical columns (e.g. 'grade', 'loan_status'), so fitting the imputer
# on the whole frame raises ValueError. Only the numeric columns are imputed;
# every other column is carried over unchanged so df_imputed keeps df's columns.
# NOTE(review): the medians are learned from the full dataset, i.e. before the
# train/test split; fit the imputer on the training rows only if a strictly
# leak-free evaluation is required.
numeric_cols = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
df_imputed = df.copy()
df_imputed[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Prepare features and target
# The target is the 'loan_status' column; the predictors are all remaining
# columns except 'Address'.
y = df_imputed['loan_status']
feature_frame = df_imputed.drop(columns=['loan_status', 'Address'])

# Encode categorical variables as one-hot columns, dropping the first level of each
X = pd.get_dummies(feature_frame, drop_first=True)

# Split the data: hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features: the scaler is fitted on the training rows only and then
# applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Evaluate the model: per-class report on the held-out rows
y_pred = model.predict(X_test_scaled)
report = classification_report(y_test, y_pred)
print(report)

# ROC AUC Curve: scored from the probability of the second class in model.classes_
class_probabilities = model.predict_proba(X_test_scaled)
y_pred_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

# Precision-Recall Curve
# The target labels are strings (e.g. 'Fully Paid'), and precision_recall_curve
# raises ValueError for labels outside {0, 1} / {-1, 1} unless the positive
# class is named. y_pred_proba is the probability column for model.classes_[1],
# so that class is passed explicitly as the positive label.
precision, recall, _ = precision_recall_curve(
    y_test, y_pred_proba, pos_label=model.classes_[1]
)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Feature Importance
# Rank the encoded features by the absolute value of their logistic-regression
# coefficient and show the ten largest.
coefficient_magnitudes = np.abs(model.coef_[0])
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': coefficient_magnitudes}
).sort_values('importance', ascending=False)
print(feature_importance.head(10))

# Save the model
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
# NOTE(review): only the classifier is persisted; it was fitted on
# StandardScaler output, so anything loading this file also needs the fitted
# `scaler` to prepare its inputs.
MODEL_PATH = '../models/loantap_model.pkl'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)

# Questionnaire Answers
# Each entry is the argument tuple for one print() call: a label, optionally
# followed by the value computed for it. Entries 6, 7 and 9 are fixed statements.
fully_paid_share_grade_a = df.loc[df['grade'] == 'A', 'loan_status'].value_counts(normalize=True)['Fully Paid']
fully_paid_share_overall = df['loan_status'].value_counts(normalize=True)['Fully Paid']

answers = [
    ("1. Percentage of customers who fully paid their Loan Amount:", (df['loan_status'] == 'Fully Paid').mean() * 100),
    ("2. Correlation between Loan Amount and Installment:", df['loan_amnt'].corr(df['installment'])),
    ("3. Majority home ownership:", df['home_ownership'].mode()[0]),
    ("4. People with grade 'A' are more likely to fully pay their loan:", fully_paid_share_grade_a > fully_paid_share_overall),
    ("5. Top 2 afforded job titles:", df['emp_title'].value_counts().nlargest(2)),
    ("6. Primary focus metric: Precision (to minimize false positives)",),
    ("7. Gap in precision and recall: A high precision with low recall means the model is conservative in predicting loan approvals, potentially missing out on good customers.",),
    ("8. Features heavily affecting the outcome:", feature_importance['feature'].head(5).tolist()),
    ("9. Results affected by geographical location: Yes (based on the 'Address' feature in the dataset)",),
]

print("Questionnaire Answers:")
for answer in answers:
    print(*answer)