In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the student dataset from the working directory into a DataFrame,
# then show the first rows as a quick sanity check of the parse.
df = pd.read_csv("data_student.csv")
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

In [None]:
# Strip leading/trailing whitespace from every column name, in case the CSV
# header carries stray spaces; later cells look up the 'UNS' column by its exact name.
df.columns = df.columns.str.strip()

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
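# Disabled (kept for reference): label-encoding of the UNS column.
# UNS is dropped from the feature matrix before clustering, so it is left as-is.
#le=LabelEncoder()
#df['UNS']=le.fit_transform(df['UNS'])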

In [None]:
# Preview the frame again now that the column names have been stripped.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Grid of pairwise relationships between the columns of the frame.
sns.pairplot(df)

In [None]:
# Draw one small KDE (smoothed density) plot per numeric feature.
# The UNS column is skipped, and so is any other non-numeric column: a density
# estimate cannot be computed over string values, and the preprocessing
# pipeline later in the notebook explicitly allows for categorical columns.
for col in df.columns:
    if col == 'UNS' or not pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Explicit Axes object so the title is attached to this figure only.
    fig, ax = plt.subplots(figsize=(4, 2))
    sns.kdeplot(df[col], fill=True, ax=ax)
    ax.set_title(f'KDE Plot of {col}')

In [None]:
# Box plots of the frame's columns on one shared axis (wide-form input),
# to compare value ranges and spot outliers at a glance.
sns.boxplot(data=df)

In [None]:
# Feature matrix for clustering: every column of the frame except UNS.
X = df.drop(columns="UNS")

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a standalone StandardScaler on the feature matrix and keep the scaled copy.
# `scaler` is the object pickled by the export cell at the end of the notebook;
# `df_scaled` is not read by any other cell visible in this notebook.
scaler = StandardScaler()
scaler.fit(X)
df_scaled = scaler.transform(X)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# --- Preprocessing -----------------------------------------------------------
# Partition the feature columns by dtype: numeric columns are median-imputed and
# standardised, all remaining columns are mode-imputed and one-hot encoded.
# (StandardScaler and OneHotEncoder are imported by earlier cells.)
num_cols, cat_cols = [], []
for name in X.columns:
    is_numeric = np.issubdtype(X[name].dtype, np.number)
    (num_cols if is_numeric else cat_cols).append(name)

numeric_steps = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])
categorical_steps = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])
preprocess = ColumnTransformer([
    ("num", numeric_steps, num_cols),
    ("cat", categorical_steps, cat_cols),
])

X_proc = preprocess.fit_transform(X)

# --- 2-D projection used only for plotting -----------------------------------
# PCA is given a dense array, so a sparse transformer output is densified first.
pca_input = X_proc.toarray() if hasattr(X_proc, "toarray") else X_proc
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(pca_input)

# --- Model selection ---------------------------------------------------------
# Keep the candidate with the highest silhouette score.
# Note: range(4, 5) contains the single value 4, so exactly one k is evaluated.
best_k, best_model, best_score = None, None, -1
for n_clusters in range(4, 5):
    candidate = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_proc)
    score = silhouette_score(X_proc, candidate.labels_)
    if score > best_score:
        best_k, best_model, best_score = n_clusters, candidate, score

# --- Report internal validity metrics for the selected model -----------------
labels = best_model.labels_

silhouette = silhouette_score(X_proc, labels)
calinski = calinski_harabasz_score(X_proc, labels)
davies = davies_bouldin_score(X_proc, labels)

print(f"Best K: {best_k}")
print(f"Silhouette Score: {silhouette:.3f}")
print(f"Calinski-Harabasz Score: {calinski:.3f}")
print(f"Davies-Bouldin Score: {davies:.3f}")



In [None]:
# Scatter the KMeans assignments in the 2-D PCA projection, one series per cluster.
fig, ax = plt.subplots(figsize=(7, 5))
for cluster_id in np.unique(labels):
    members = X2[labels == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], s=30, label=f"Cluster {cluster_id}")
ax.set_title(f"KMeans Clusters (k={best_k})")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend()
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering


# Hierarchical (Ward-linkage) clustering over the preprocessed features.
#
# The search below keeps its own hc_best_* names on purpose. Reusing
# `best_model` here would replace the estimator selected by the KMeans cell,
# and the export cell at the end of the notebook pickles whatever `best_model`
# refers to; an AgglomerativeClustering object has no predict() and so cannot
# label new samples once reloaded.

# Dense copy of the preprocessed matrix, used for fitting, scoring and PCA.
X_proc_dense = X_proc.toarray() if hasattr(X_proc, "toarray") else X_proc

# 2-D projection used only by the plotting cell that follows.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(X_proc_dense)

# Try k = 2, 3, 4 and keep the fit with the highest silhouette score.
hc_best_k, hc_best_model, hc_best_score = None, None, -1
for k in range(2,5):
    hc = AgglomerativeClustering(n_clusters=k, linkage="ward")
    hc_labels = hc.fit_predict(X_proc_dense)
    sil = silhouette_score(X_proc_dense, hc_labels)
    if sil > hc_best_score:
        hc_best_k, hc_best_model, hc_best_score = k, hc, sil

# The winning estimator is already fitted; read its labels instead of refitting.
labels = hc_best_model.labels_
# The plotting cell that follows puts `best_k` in its title.
best_k = hc_best_k

print(f"Best k: {best_k}")
print("Silhouette Score:", silhouette_score(X_proc_dense, labels))
print("Calinski-Harabasz Score:", calinski_harabasz_score(X_proc_dense, labels))
print("Davies-Bouldin Score:", davies_bouldin_score(X_proc_dense, labels))

In [None]:
# Scatter the hierarchical-clustering assignments in the 2-D PCA projection.
fig, ax = plt.subplots(figsize=(7, 5))
for cluster_id in np.unique(labels):
    in_cluster = labels == cluster_id
    xs, ys = X2[in_cluster, 0], X2[in_cluster, 1]
    ax.scatter(xs, ys, s=30, label=f"Cluster {cluster_id}")
ax.set(
    title=f"Hierarchical Clustering (k={best_k})",
    xlabel="PC1",
    ylabel="PC2",
)
ax.legend()
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
# Density-based clustering (DBSCAN) over the preprocessed features.

# Dense copy of the preprocessed matrix, used for fitting, scoring and PCA.
X_proc_dense = X_proc.toarray() if hasattr(X_proc, "toarray") else X_proc

# 2-D projection used only by the plotting cell that follows.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(X_proc_dense)

db = DBSCAN(eps=1.2, min_samples=3)
labels = db.fit_predict(X_proc_dense)

# Label -1 marks noise points; it is not counted as a cluster.
noise_mask = labels == -1
n_noise = int(noise_mask.sum())
n_clusters = int(np.unique(labels[~noise_mask]).size)

print(f"Estimated clusters: {n_clusters}")
print(f"Noise points: {n_noise}")

# The metrics need at least two groups. They are computed over all points,
# so any noise points (-1) are scored as if they formed one more group.
if n_clusters <= 1:
    print("Metrics not available (only 1 cluster found).")
else:
    silhouette = silhouette_score(X_proc_dense, labels)
    calinski = calinski_harabasz_score(X_proc_dense, labels)
    davies = davies_bouldin_score(X_proc_dense, labels)

    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Calinski-Harabasz Score: {calinski:.3f}")
    print(f"Davies-Bouldin Score: {davies:.3f}")



In [None]:
# Scatter the DBSCAN assignments in the 2-D PCA projection.
# Noise points (label -1) are drawn in black; each cluster gets its own series.
fig, ax = plt.subplots(figsize=(7, 5))
# np.unique returns the labels sorted, which keeps the legend order stable
# and matches how the other clustering plots iterate.
unique_labels = np.unique(labels)
for lab in unique_labels:
    mask = labels == lab
    if lab == -1:
        ax.scatter(X2[mask,0], X2[mask,1], s=30, c="k", label="Noise")
    else:
        ax.scatter(X2[mask,0], X2[mask,1], s=30, label=f"Cluster {lab}")
# Read the hyper-parameters from the fitted estimator so the title always
# reports the values that were actually used to produce `labels`.
ax.set_title(
    f"DBSCAN Clustering (eps={db.eps}, min_samples={db.min_samples}) → {n_clusters} clusters"
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend()
plt.show()


In [None]:
import joblib

# Persist the selected estimator and the scaler to the working directory.
# NOTE(review): `best_model` is whatever the most recently executed
# model-selection cell assigned to that name — confirm it is the estimator
# meant for export before relying on model.pkl.
# NOTE(review): `scaler` is the StandardScaler fitted directly on X, not the
# `preprocess` ColumnTransformer that produced X_proc — confirm the two agree
# for this dataset.
artifacts = {
    "model.pkl": best_model,
    "scaler.pkl": scaler,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)