<a href="https://colab.research.google.com/github/Shackeem54/bbit-learning-labs/blob/main/Bank_%2CCustomers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -------------------------------------------------------------
# QUESTION 1: Decision Tree & Random Forest Classification
# -------------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------------------------------------
# (a) LOAD DATA
# -------------------------------------------------------------
# Predictor columns shared by both classifiers; "Loan_Status" is the target.
feature_cols = [
    "Gender",
    "Married",
    "Education",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term",
    "Credit_History",
    "Property_Area",
]

df = pd.read_csv("Customer.csv")

# Independent copies, so later edits to X / y never write back into df.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, "Loan_Status"].copy()

# -------------------------------------------------------------
# (b) LABEL ENCODING & FEATURE SCALING
# -------------------------------------------------------------
label_cols = ["Gender", "Married", "Education", "Property_Area", "Loan_Status"]

# Fit a separate encoder for every categorical column and keep each one, so
# any integer code can later be mapped back to its original label with
# encoders[col].inverse_transform(...). Re-fitting a single shared encoder
# would leave only the mapping of the last column available.
encoders = {}
for col in label_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `encoder` stays bound to the encoder fitted on the last column
# ("Loan_Status"), for any later code that refers to it by that name.
encoder = encoders[label_cols[-1]]

# Rebuild X / y from df now that the categorical columns hold integer codes.
X = df[feature_cols].copy()
y = df["Loan_Status"].copy()

# Standardise only the continuous columns; the encoded categoricals and
# "Credit_History" are left as they are.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split below. Tree-based models do not depend on feature scale, but move this
# after the split if a scale-sensitive model is ever added.
scaler = StandardScaler()
num_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]

X[num_cols] = scaler.fit_transform(X[num_cols])

# -------------------------------------------------------------
# (c) TRAIN-TEST SPLIT & CLASSIFICATION
# -------------------------------------------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# fit() returns the estimator itself, so each model is built and trained in one step.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# -------------------------------------------------------------
# (d) GENERATE & SAVE TREE IMAGES
# -------------------------------------------------------------
# Draw the fitted decision tree on its own axes and save it before showing it.
fig_dt, ax_dt = plt.subplots(figsize=(20, 10))
plot_tree(dt, feature_names=feature_cols, filled=True, fontsize=8, ax=ax_dt)
ax_dt.set_title("Decision Tree Classifier")
fig_dt.savefig("DecisionTree.png", dpi=300)
plt.show()

# FIRST TWO AND LAST TWO TREES FROM RANDOM FOREST
# Negative positions count back from the end of rf.estimators_.
trees_to_show = [0, 1, -2, -1]
n_trees = len(rf.estimators_)

for t in trees_to_show:
    # Turn a negative position into the tree's real 0-based index, so the
    # title and file name carry its true 1-based number (the last tree of a
    # 100-tree forest is "#100", not "#0").
    tree_idx = t % n_trees
    tree_number = tree_idx + 1

    plt.figure(figsize=(20,10))
    plot_tree(rf.estimators_[tree_idx], feature_names=feature_cols, filled=True, fontsize=8)
    plt.title(f"Random Forest Tree #{tree_number}")
    plt.savefig(f"RF_Tree_{tree_number}.png", dpi=300)
    plt.show()

# -------------------------------------------------------------
# (e) CROSS-VALIDATION ACCURACIES & CONFUSION MATRICES
# -------------------------------------------------------------
# 5-fold cross-validation over the full feature matrix, one score per fold.
cv_folds = 5
dt_cv = cross_val_score(dt, X, y, cv=cv_folds)
rf_cv = cross_val_score(rf, X, y, cv=cv_folds)

print("Decision Tree 5-Fold Accuracy:", dt_cv.mean())
print("Random Forest 5-Fold Accuracy:", rf_cv.mean())

# Predictions of the models fitted on the training split, for the held-out rows.
y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)

cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print("\nDecision Tree Confusion Matrix:\n", cm_dt)
print("\nRandom Forest Confusion Matrix:\n", cm_rf)

# -------------------------------------------------------------
# (f) FEATURE IMPORTANCE (RANDOM FOREST)
# -------------------------------------------------------------
# One importance score per entry of feature_cols, in the same order.
importances = rf.feature_importances_

# Horizontal bar chart: score on the x-axis, feature name on the y-axis.
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_cols, palette="viridis", ax=ax_imp)
ax_imp.set_title("Feature Importance - Random Forest")
ax_imp.set_xlabel("Importance Score")
ax_imp.set_ylabel("Feature")
plt.show()


In [None]:
# -------------------------------------------------------------
# QUESTION 2: Clustering
# -------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# -------------------------------------------------------------
# LOAD DATA
# -------------------------------------------------------------
housing = pd.read_csv("Housing.csv")

features = ["price", "area", "stories", "basement", "parking"]
X = housing[features].copy()

# StandardScaler only accepts numeric input. Any selected column that is not
# numeric is treated as a yes/no flag and encoded as 1/0 before scaling.
# NOTE(review): "basement" is presumably the column this applies to - confirm
# against the CSV. Columns that are already numeric are left untouched.
yes_no = {"yes": 1, "no": 0}
for col in features:
    if pd.api.types.is_numeric_dtype(X[col]):
        continue
    encoded = X[col].astype(str).str.strip().str.lower().map(yes_no)
    if encoded.isna().any():
        # Fail with a clear message rather than a conversion error inside the scaler.
        raise ValueError(
            f"Column {col!r} is not numeric and contains values other than yes/no"
        )
    X[col] = encoded

# SCALE FEATURES
# Zero mean / unit variance per column, so no single feature dominates the
# distance computations used for clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -------------------------------------------------------------
# (a) DEFAULT K-MEANS
# -------------------------------------------------------------
# Five clusters with a fixed seed; every other KMeans setting is left at its default.
kmeans = KMeans(n_clusters=5, random_state=42).fit(X_scaled)

# Cluster index assigned to each row of X_scaled.
labels = kmeans.labels_

# -------------------------------------------------------------
# (b) FIRST 100 CLUSTER MEMBERSHIPS & CENTROIDS
# -------------------------------------------------------------
print("Cluster Membership of First 100 samples:")
print(labels[:100])

# Undo the standardisation so the centres are shown in the original feature units.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centroid_table = pd.DataFrame(centroids, columns=features)

print("\nCluster Centers (denormalized):")
print(centroid_table)

# -------------------------------------------------------------
# (c) ELBOW METHOD
# -------------------------------------------------------------
# Candidate cluster counts: 5, 10, ..., 30.
cluster_sizes = list(range(5, 31, 5))
inertia_vals = []

# Fit one model per candidate k and record its inertia.
for k in cluster_sizes:
    km = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia_vals.append(km.inertia_)

# Inertia against k; the bend in the curve suggests a reasonable cluster count.
fig_elbow, ax_elbow = plt.subplots(figsize=(8, 5))
ax_elbow.plot(cluster_sizes, inertia_vals, marker='o')
ax_elbow.set_title("Elbow Method for Optimal k")
ax_elbow.set_xlabel("Number of Clusters")
ax_elbow.set_ylabel("Inertia")
ax_elbow.grid(True)
plt.show()

# -------------------------------------------------------------
# (d) SCATTER PLOT (PRICE vs AREA)
# -------------------------------------------------------------
# k picked by eye from the elbow plot (example: 10); adjust after inspecting the curve.
k_opt = 10
kmeans_final = KMeans(n_clusters=k_opt, random_state=42).fit(X_scaled)
labels_final = kmeans_final.labels_

# Plot the unscaled price and area, coloured by the cluster found on the scaled features.
fig_sc, ax_sc = plt.subplots(figsize=(10, 6))
ax_sc.scatter(housing["price"], housing["area"], c=labels_final, cmap="tab20")
ax_sc.set_xlabel("Price")
ax_sc.set_ylabel("Area")
ax_sc.set_title(f"K-Means Clusters (k={k_opt})")
plt.show()

# -------------------------------------------------------------
# (e) AGGLOMERATIVE CLUSTERING + DENDROGRAM
# -------------------------------------------------------------
# Hierarchical clustering uses only price and area, standardised with their own scaler.
X2 = housing.loc[:, ["price", "area"]].copy()
X2_scaled = StandardScaler().fit_transform(X2)

# Ward linkage matrix for the dendrogram.
linked = linkage(X2_scaled, method="ward")

# Show only the top 6 levels of the hierarchy to keep the figure readable.
fig_dg, ax_dg = plt.subplots(figsize=(12, 6))
dendrogram(linked, truncate_mode="level", p=6, ax=ax_dg)
ax_dg.set_title("Agglomerative Clustering Dendrogram")
ax_dg.set_xlabel("Sample Index")
ax_dg.set_ylabel("Distance")
plt.show()

# Flat assignment into three clusters using the same Ward criterion.
agg = AgglomerativeClustering(n_clusters=3, linkage="ward")
labels_agg = agg.fit_predict(X2_scaled)
