<a href="https://colab.research.google.com/github/Nirmal456/Machine-Learning-labs/blob/main/LAB_4_AP23110010200.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression: Loan Approval — Step‑by‑Step (Colab Tutorial)

**Goal:** Learn *how weights update* and *how the decision boundary appears* using a tiny 2‑feature dataset: **salary** and **loan amount** → output: **approve (1) or reject (0)**.

**How to use:** Run each cell from top to bottom. Each step is short and self‑contained.


## Step 1 — Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Keep printed numbers short: 4 decimals everywhere, and no scientific
# notation for small NumPy values.
np.set_printoptions(suppress=True, precision=4)
pd.options.display.precision = 4


## Step 2 — Make a tiny dataset

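We load two raw features and a label from the CSV:
- `salary_lakhs` — salary (raw values; scaled to 0–1 in Step 3)
- `loan_lakhs` — loan amount (raw values; scaled to 0–1 in Step 3)
- `approve` — the label: 1 = approve, 0 = reject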

You can open the CSV later if you want to see raw values.


In [None]:
# Load the prepared CSV when it is present; otherwise build a small,
# reproducible stand-in so the notebook still runs top to bottom on a fresh
# runtime where the file has not been uploaded.
from pathlib import Path

CSV_PATH = Path('/content/loan_toy_simple.csv')
rng = np.random.default_rng(7)

if CSV_PATH.exists():
    df = pd.read_csv(CSV_PATH)
else:
    # NOTE(review): synthetic fallback, NOT the original course data. Only the
    # column names are taken from the notebook (Step 3 reads them); the value
    # ranges and the labelling rule below are placeholders — adjust as needed.
    n_rows = 60
    salary_lakhs = rng.uniform(3.0, 25.0, size=n_rows).round(2)
    loan_lakhs = rng.uniform(2.0, 40.0, size=n_rows).round(2)
    noise = rng.normal(0.0, 3.0, size=n_rows)
    # Approve when salary is high relative to the requested loan (plus noise).
    approve = (1.5 * salary_lakhs - loan_lakhs + noise > 0).astype(int)
    df = pd.DataFrame({
        'salary_lakhs': salary_lakhs,
        'loan_lakhs': loan_lakhs,
        'approve': approve,
    })
    # Best effort: save the stand-in where later cells (and students) look for
    # the CSV. Outside Colab the folder may not exist; the in-memory `df` is
    # enough to continue, so a failed write is deliberately ignored.
    try:
        df.to_csv(CSV_PATH, index=False)
    except OSError:
        pass

df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/loan_toy_simple.csv'

## Step 3 — Scale features and visualize

We scale to [0,1] for stable/fast gradient descent, then scatter‑plot the two classes.


In [None]:
from pathlib import Path

# Step 2 already loads `df`; read the CSV again only when this cell is run
# without it (for example straight after a kernel restart). The unused `rng`
# and the mid-cell `df.head()` (which displays nothing) were dropped.
if 'df' not in globals():
    df = pd.read_csv('/content/loan_toy_simple.csv')

def minmax(a: np.ndarray):
    """Rescale `a` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    a : array-like of numbers

    Returns
    -------
    np.ndarray of floats with the same shape as `a`.

    A constant input (max == min) has no spread to rescale, so it maps to all
    zeros instead of dividing by zero and filling the result with NaN.
    """
    a = np.asarray(a, dtype=float)
    lo = a.min()
    span = a.max() - lo
    if span == 0:
        return np.zeros_like(a)
    return (a - lo) / span

# Scale both raw features to [0, 1]; they become the two columns of X.
salary, loan = (minmax(df[col].values) for col in ('salary_lakhs', 'loan_lakhs'))
y = df['approve'].values.astype(float)
X = np.column_stack([salary, loan])  # shape (n, 2)

# Quick look at how the two classes sit in feature space
fig, ax = plt.subplots()
for cls, text in ((1, 'Approved (y=1)'), (0, 'Rejected (y=0)')):
    mask = y == cls
    ax.scatter(X[mask, 0], X[mask, 1], label=text)
ax.set_xlabel('Salary (scaled 0-1)')
ax.set_ylabel('Loan amount (scaled 0-1)')
ax.set_title('Toy Loan Data (scaled)')
ax.legend()
fig.tight_layout()
plt.show()

## Step 4 — Define the model and loss

- Sigmoid: $\sigma(z)=1/(1+e^{-z})$
- Logit: $z=w^\top x + b$
- Probability: $p=\sigma(z)$
- Log loss (averaged over the $n$ samples): $L=-\frac{1}{n}\sum_{i=1}^{n}\big[y_i\log p_i + (1-y_i)\log(1-p_i)\big]$
- Gradients: $\nabla_w L=\frac{1}{n}X^\top(p-y),\ \partial L/\partial b=\text{mean}(p-y)$


In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity sigmoid(z) = exp(-log(1 + exp(-z))). np.logaddexp(0, -z)
    computes log(exp(0) + exp(-z)) in a numerically stable way, so a very
    negative z quietly underflows to 0.0 instead of overflowing inside
    np.exp(-z) and emitting a RuntimeWarning.

    Accepts a scalar or a NumPy array and returns a value of the same shape.
    """
    return np.exp(-np.logaddexp(0.0, -z))

def predict_proba(W, b, X):
    """Return p = sigmoid(X @ W + b), the predicted probability for each row of X."""
    logits = X @ W + b
    return sigmoid(logits)

def log_loss(p, y):
    """Mean binary cross-entropy between probabilities `p` and labels `y`.

    Probabilities are first clipped to [1e-12, 1 - 1e-12] so that log() is
    never evaluated at exactly 0.
    """
    tiny = 1e-12
    p_safe = np.clip(p, tiny, 1 - tiny)
    per_sample = y * np.log(p_safe) + (1 - y) * np.log(1 - p_safe)
    return -np.mean(per_sample)

def gradients(W, b, X, y):
    """Gradients of the mean log loss with respect to W and b.

    Returns (dW, db, p). The predicted probabilities `p` at the current
    parameters are handed back as well, so callers can compute the loss
    without a second forward pass.
    """
    p = predict_proba(W, b, X)
    residual = p - y                      # shape (n,)
    n_samples = X.shape[0]
    dW = (X.T @ residual) / n_samples
    db = np.mean(residual)
    return dW, db, p


## Step 5 — Initialize weights

We start small and random. Change the learning rate (`lr`) or epochs to see different behaviors.


In [None]:
# Hyperparameters — tweak these and re-run from this cell
lr, epochs = 0.5, 80

# Tiny random weights (std 0.01, fixed seed) and a zero bias
rng = np.random.default_rng(seed=0)
W = rng.normal(loc=0, scale=0.01, size=2)
b = 0.0

for caption, value in (('Initial W:', W), ('Initial b:', b)):
    print(caption, value)


Initial W: [ 0.0013 -0.0013]
Initial b: 0.0


## Step 6 — **One** manual gradient‑descent step

We compute gradients and update once: $W \leftarrow W - \eta\,dW$, $b \leftarrow b - \eta\,db$, where $\eta$ is the learning rate `lr`.

In [None]:
# Compute current loss and one GD step
# Gradients and predictions at the current (W, b); `before` is the loss there.
dW, db, p = gradients(W, b, X, y)
before = log_loss(p, y)

# One gradient-descent update. This rebinds the global W and b, so
# NOTE(review): re-running this cell takes a further step each time, and
# Step 7 continues from the updated values — re-run Step 5 first to reset.
W = W - lr * dW
b = b - lr * db

# Re-evaluate at the updated parameters to see the effect of the step.
dW2, db2, p2 = gradients(W, b, X, y)
after = log_loss(p2, y)

print('dW (first step):', dW)
print('db (first step):', db)
print('loss before:', round(before, 4), ' -> loss after:', round(after, 4))
print('W after one step:', W, '  b:', round(b, 4))


NameError: name 'gradients' is not defined

## Step 7 — Train for a few epochs

We log weights and loss every 10 epochs to keep output readable.


In [None]:
# Each history entry is (epoch, W, b, loss), all taken at the START of the
# epoch, i.e. the loss is the one produced by the stored W and b.
hist = []
for ep in range(epochs):
    dW, db, p = gradients(W, b, X, y)
    loss = log_loss(p, y)
    # Copy W: it is updated in place below, and the snapshot must not change.
    hist.append((ep, W.copy(), b, loss))
    # Report BEFORE updating, so the printed W, b and loss belong together
    # and agree with what is stored in `hist` / `hist_df`.
    if ep % 10 == 0 or ep == epochs-1:
        print(f'epoch {ep:3d} | loss={loss:.4f} | W={W} | b={b:.4f}')
    W -= lr * dW
    b -= lr * db

# Save history as a small DataFrame (one row per epoch)
hist_df = pd.DataFrame(
    [(ep_, w_[0], w_[1], b_, loss_) for ep_, w_, b_, loss_ in hist],
    columns=["epoch", "w1", "w2", "b", "loss"],
)
hist_df.tail()


## Step 8 — Plot the decision boundary

Boundary line (in scaled coordinates): the set of points where $w_1 x_1 + w_2 x_2 + b = 0$, i.e. where the predicted probability is exactly 0.5.


In [None]:
def boundary_points(w, b, x1_min=0.0, x1_max=1.0, num=200):
    """Return (x1, x2) arrays tracing the line w[0]*x1 + w[1]*x2 + b = 0.

    Parameters
    ----------
    w : sequence of two floats — the weights (w1, w2).
    b : float — the bias.
    x1_min, x1_max : float — horizontal range sampled for a non-vertical line.
    num : int — number of points returned.

    Returns
    -------
    (x1, x2) : two arrays of length `num`.
      * General case: x1 spans [x1_min, x1_max] and x2 is solved from the line.
      * w[1] ~ 0 (vertical line): x1 is constant at -b / w[0], x2 spans [0, 1].
      * Both weights ~ 0: no line exists, so both arrays are filled with NaN
        (matplotlib then draws nothing) instead of a bogus line at -b / 1e-12.
    """
    tol = 1e-12
    x1 = np.linspace(x1_min, x1_max, num=num)
    if abs(w[1]) < tol:
        if abs(w[0]) <= tol:
            # Degenerate model: the equation reduces to b = 0, which is not a line.
            empty = np.full(num, np.nan)
            return empty, empty.copy()
        x1_line = np.full_like(x1, -b / w[0])
        x2_line = np.linspace(0.0, 1.0, num=num)
        return x1_line, x2_line
    x2 = -(w[0]/w[1])*x1 - b/w[1]
    return x1, x2

# Show boundary at a few epochs (0, 10, 40, last). Checkpoints beyond the last
# trained epoch are dropped and duplicates merged, so this cell keeps working
# when `epochs` is set below 41 in Step 5.
last_epoch = int(hist_df['epoch'].iloc[-1])
snapshots = sorted({e for e in (0, 10, 40, last_epoch) if e <= last_epoch})
for ep in snapshots:
    # Parameters recorded at the start of epoch `ep`
    row = hist_df.loc[hist_df['epoch']==ep].iloc[0]
    w = np.array([row['w1'], row['w2']])
    b_ep = row['b']
    x1_line, x2_line = boundary_points(w, b_ep)
    plt.figure()
    plt.scatter(X[y==1,0], X[y==1,1], label='Approved (y=1)')
    plt.scatter(X[y==0,0], X[y==0,1], label='Rejected (y=0)')
    plt.plot(x1_line, x2_line, label=f'Boundary @ epoch {ep}')
    plt.xlabel('Salary (scaled 0-1)')
    plt.ylabel('Loan amount (scaled 0-1)')
    plt.title(f'Decision Boundary @ epoch {ep}')
    plt.legend()
    plt.tight_layout()
    plt.show()


NameError: name 'hist_df' is not defined

## Step 9 — Loss curve

In [None]:
# Training loss recorded at the start of every epoch
fig, ax = plt.subplots()
ax.plot(hist_df['epoch'].to_numpy(), hist_df['loss'].to_numpy())
ax.set_xlabel('Epoch')
ax.set_ylabel('Log Loss')
ax.set_title('Training Loss vs Epoch')
fig.tight_layout()
plt.show()


## Step 10 — Final accuracy

In [None]:
# Classify with a 0.5 cut-off and score against the training labels
p_final = predict_proba(W, b, X)
y_pred = np.asarray(p_final >= 0.5, dtype=int)
acc = np.mean(y_pred == y)
print('Final W:', W, '  b:', round(b,4))
print(f'Training accuracy: {acc*100:.2f}%')

NameError: name 'predict_proba' is not defined

## Step 11 — Try it yourself

- Change `lr` (learning rate) and `epochs` in **Step 5** and re-run from there.

- Add L2: update rule becomes `W -= lr*(dW + lambda_*W)`.

- Change the **threshold**: use `y_pred = (p_final >= 0.4).astype(int)` and observe the boundary.

- Re-generate data: edit the CSV or replace it with your class data (columns `salary_lakhs`, `loan_lakhs`, `approve`).

In [None]:
import ipywidgets as widgets
from ipywidgets import interact

def train_and_plot(lr=0.5, epochs=50, threshold=0.5, lambda_=0.0):
    """Train from scratch with the given settings, then plot and report the result.

    Parameters
    ----------
    lr : float — gradient-descent learning rate.
    epochs : int — number of full-batch update steps.
    threshold : float — probability cut-off used to turn p into a 0/1 prediction.
    lambda_ : float — L2 penalty strength applied to W (not to b).

    Reads the global `X` and `y`. Shows a scatter plot with the decision
    boundary and prints the final parameters and training accuracy; returns None.

    The drawn boundary follows `threshold`: p >= threshold is equivalent to
    w.x + b >= log(threshold / (1 - threshold)), so the bias is shifted by that
    logit before the line is computed. At threshold = 0.5 the shift is 0.
    """
    # Reinitialize weights (fixed seed, so every call starts from the same point)
    rng = np.random.default_rng(0)
    W = rng.normal(0, 0.01, size=2)
    b = 0.0

    # Train
    for _ in range(epochs):
        dW, db, _ = gradients(W, b, X, y)
        # L2 regularization on W
        W -= lr * (dW + lambda_ * W)
        b -= lr * db

    # Final predictions
    p_final = predict_proba(W, b, X)
    y_pred  = (p_final >= threshold).astype(int)
    acc = (y_pred == y).mean()

    # Plot the data and the decision boundary for the chosen threshold
    plt.figure(figsize=(5,4))
    plt.scatter(X[y==1,0], X[y==1,1], label='Approved (y=1)')
    plt.scatter(X[y==0,0], X[y==0,1], label='Rejected (y=0)')
    if 0.0 < threshold < 1.0:
        logit_thr = np.log(threshold / (1.0 - threshold))
        x1_line, x2_line = boundary_points(W, b - logit_thr)
        plt.plot(x1_line, x2_line, label='Decision boundary')
    # Outside (0, 1) every point gets the same prediction, so there is no line to draw.
    plt.xlabel('Salary (scaled)')
    plt.ylabel('Loan amount (scaled)')
    plt.title(f'lr={lr}, epochs={epochs}, thr={threshold}, acc={acc*100:.2f}%')
    plt.legend()
    plt.show()

    print("Final Weights:", W, "Bias:", b)
    print(f"Training Accuracy = {acc*100:.2f}% (threshold={threshold})")

# Interactive sliders: one widget per keyword argument of train_and_plot
sliders = dict(
    lr=widgets.FloatSlider(value=0.5, min=0.01, max=2.0, step=0.05),
    epochs=widgets.IntSlider(value=50, min=10, max=300, step=10),
    threshold=widgets.FloatSlider(value=0.5, min=0.1, max=0.9, step=0.05),
    lambda_=widgets.FloatSlider(value=0.0, min=0.0, max=0.5, step=0.05),
)
interact(train_and_plot, **sliders);

interactive(children=(FloatSlider(value=0.5, description='lr', max=2.0, min=0.01, step=0.05), IntSlider(value=…

In [None]:
# NOTE(review): this cell repeats the Step 8 cell; this definition replaces
# the earlier `boundary_points` once the cell runs. Consider deleting the cell.
def boundary_points(w, b, x1_min=0.0, x1_max=1.0, num=200):
    """Return (x1, x2) arrays tracing the line w[0]*x1 + w[1]*x2 + b = 0.

    Parameters
    ----------
    w : sequence of two floats — the weights (w1, w2).
    b : float — the bias.
    x1_min, x1_max : float — horizontal range sampled for a non-vertical line.
    num : int — number of points returned.

    Returns
    -------
    (x1, x2) : two arrays of length `num`.
      * General case: x1 spans [x1_min, x1_max] and x2 is solved from the line.
      * w[1] ~ 0 (vertical line): x1 is constant at -b / w[0], x2 spans [0, 1].
      * Both weights ~ 0: no line exists, so both arrays are filled with NaN
        (matplotlib then draws nothing) instead of a bogus line at -b / 1e-12.
    """
    tol = 1e-12
    x1 = np.linspace(x1_min, x1_max, num=num)
    if abs(w[1]) < tol:
        if abs(w[0]) <= tol:
            # Degenerate model: the equation reduces to b = 0, which is not a line.
            empty = np.full(num, np.nan)
            return empty, empty.copy()
        x1_line = np.full_like(x1, -b / w[0])
        x2_line = np.linspace(0.0, 1.0, num=num)
        return x1_line, x2_line
    x2 = -(w[0]/w[1])*x1 - b/w[1]
    return x1, x2

# Show boundary at a few epochs (0, 10, 40, last). Checkpoints beyond the last
# trained epoch are dropped and duplicates merged, so this cell keeps working
# when `epochs` is set below 41 in Step 5.
# NOTE(review): these plots repeat the ones produced in Step 8.
last_epoch = int(hist_df['epoch'].iloc[-1])
snapshots = sorted({e for e in (0, 10, 40, last_epoch) if e <= last_epoch})
for ep in snapshots:
    # Parameters recorded at the start of epoch `ep`
    row = hist_df.loc[hist_df['epoch']==ep].iloc[0]
    w = np.array([row['w1'], row['w2']])
    b_ep = row['b']
    x1_line, x2_line = boundary_points(w, b_ep)
    plt.figure()
    plt.scatter(X[y==1,0], X[y==1,1], label='Approved (y=1)')
    plt.scatter(X[y==0,0], X[y==0,1], label='Rejected (y=0)')
    plt.plot(x1_line, x2_line, label=f'Boundary @ epoch {ep}')
    plt.xlabel('Salary (scaled 0-1)')
    plt.ylabel('Loan amount (scaled 0-1)')
    plt.title(f'Decision Boundary @ epoch {ep}')
    plt.legend()
    plt.tight_layout()
    plt.show()

NameError: name 'hist_df' is not defined