# Loan Approval Classification Project (Starter Notebook)
**Dataset:** `train.csv`

Run each section, then use the printed results + plots to write your report in your own words.


In [None]:
import pandas as pd
import numpy as np

# Read the training CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Dataset summary: dimensions, per-column missing counts, and class balance.
print(f'Shape: {df.shape}')

# Columns with the most missing entries are listed first.
missing_counts = df.isna().sum().sort_values(ascending=False)
print('\nMissing values per column:', missing_counts, sep='\n')

# Number of rows for each value of the target column.
target_counts = df['Loan_Status'].value_counts()
print('\nTarget distribution:', target_counts, sep='\n')

## Preprocessing + Train/Test Split
- Drop `Loan_ID` (identifier)
- Impute missing values (median for numeric columns, most frequent value for categorical columns)
- One-hot encode categorical features
- Scale numeric features (important for KNN/SVM/LogReg)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate the features from the target. `Loan_ID` is dropped together with
# the target column, and the target is encoded as 1 for 'Y' and 0 for 'N'.
label_to_int = {'Y': 1, 'N': 0}
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)
y = df['Loan_Status'].map(label_to_int)

# Split the feature columns by dtype so each group gets its own preprocessing.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# Numeric columns: fill gaps with the median, then standardize.
numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its pipeline.
preprocess = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train.shape, X_test.shape

## Train 4 Classifiers (LogReg, KNN, Decision Tree, SVM)
Metrics: Accuracy, F1, Jaccard, and log loss (computed only for models that expose `predict_proba`).

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, log_loss

from sklearn.base import clone

# Candidate classifiers, keyed by the name shown in the results table.
# SVC is built with probability=True so that it exposes predict_proba.
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'DecisionTree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'SVM_RBF': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
}

rows = []     # one [name, accuracy, f1, jaccard, logloss] record per model
fitted = {}   # model name -> fitted Pipeline, reused by later cells
for name, model in models.items():
    # Give every pipeline its own unfitted copy of the preprocessor.
    # Pipeline.fit fits its steps in place, so passing the same `preprocess`
    # object to each pipeline would make all entries in `fitted` share (and
    # overwrite) a single fitted transformer.
    pipe = Pipeline([('prep', clone(preprocess)), ('model', model)])
    pipe.fit(X_train, y_train)
    fitted[name] = pipe

    # Hard-label metrics on the held-out split.
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    jac = jaccard_score(y_test, y_pred)

    # Log loss needs predicted probabilities; it stays None for a model
    # that does not expose predict_proba.
    ll = None
    if hasattr(pipe.named_steps['model'], 'predict_proba'):
        ll = log_loss(y_test, pipe.predict_proba(X_test))
    rows.append([name, acc, f1, jac, ll])

# One row per model, best F1 first.
results = pd.DataFrame(rows, columns=['Model','Accuracy','F1','Jaccard','LogLoss']).sort_values('F1', ascending=False)
results

## Pick a final model
Pick based on your goal:
- **Interpretation:** Logistic Regression (coefficients)
- **Pure prediction:** try tuning ensembles (Random Forest / Gradient Boosting)


In [None]:
# Example: interpret Logistic Regression coefficients
import numpy as np

# Pull the fitted preprocessing step and classifier out of the
# logistic-regression pipeline.
log_pipe = fitted['LogisticRegression']
fitted_prep = log_pipe.named_steps['prep']
ohe = fitted_prep.named_transformers_['cat'].named_steps['onehot']

# Feature names in the order used here: numeric columns first, then the
# one-hot encoded categorical columns.
onehot_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.concatenate([np.asarray(num_cols), onehot_names])

# Flatten the coefficient array to one value per feature.
coefs = log_pipe.named_steps['model'].coef_.ravel()

coef_df = pd.DataFrame({'feature': feature_names, 'coef': coefs})
coef_df['abs'] = np.abs(coef_df['coef'])

# Show the 15 coefficients with the largest magnitude.
ranked = coef_df.sort_values('abs', ascending=False)
ranked.head(15)

## Plots for the report (confusion matrix, ROC, PR curve)
Use these plots in your PDF report.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

# Evaluate the fitted logistic-regression pipeline on the held-out split.
best = fitted['LogisticRegression']
y_pred = best.predict(X_test)

# Confusion matrix with integer cell counts; axis ticks are labelled N and Y.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=['N', 'Y'],
    values_format='d',
)
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

# The ROC and precision-recall curves are drawn from the second column of
# predict_proba, so they are only produced when the model exposes it.
if hasattr(best.named_steps['model'], 'predict_proba'):
    y_proba = best.predict_proba(X_test)[:, 1]
    curve_plots = (
        (RocCurveDisplay, 'ROC Curve - Logistic Regression'),
        (PrecisionRecallDisplay, 'Precision-Recall Curve - Logistic Regression'),
    )
    for display_cls, curve_title in curve_plots:
        display_cls.from_predictions(y_test, y_proba)
        plt.title(curve_title)
        plt.show()