# 05 — Autoencoder Feature Learning

In [None]:

# Folder holding the input CSVs; file names are joined onto it with os.path.join.
# Edit this to point at the data folder on your machine.
base_path = r"D:\IITB\STData\1"
# NOTE(review): an earlier note here suggested r"D:\IITB\STData" as an alternative
# root — confirm which folder actually contains the CSVs.
# Output folders for model weights and figures (created right after the imports).
save_models_to = r"./models"
save_fig_to = r"./notebooks/figures"

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Create both output folders up front; exist_ok makes re-running the cell harmless.
for _out_dir in (save_models_to, save_fig_to):
    os.makedirs(_out_dir, exist_ok=True)

def read_csv(name):
    """Read the CSV file called *name* from the ``base_path`` folder into a DataFrame."""
    return pd.read_csv(os.path.join(base_path, name))


# Echo the data folder so a wrong path is obvious before any file is read.
print("Using base_path:", base_path)


In [None]:

import os, torch, torch.nn as nn, torch.optim as optim, pandas as pd, numpy as np

# Load the cleaned table from base_path and hand it to PyTorch as a float32 tensor.
_clean_csv = os.path.join(base_path, "processed_clean.csv")
X = torch.tensor(pd.read_csv(_clean_csv).values.astype("float32"))

class AE(nn.Module):
    """Fully connected autoencoder: d_in -> 128 -> 64 -> d_lat -> 64 -> 128 -> d_in.

    Args:
        d_in: Number of input features. When omitted, it is taken from the
            notebook-level tensor ``X`` (``X.shape[1]``) at construction time.
        d_lat: Width of the latent bottleneck.
    """

    def __init__(self, d_in=None, d_lat=8):
        super().__init__()
        if d_in is None:
            # Resolve at call time rather than in the signature: a default of
            # ``X.shape[1]`` is frozen when the class statement runs and goes
            # stale if X is reloaded afterwards.
            d_in = X.shape[1]
        self.enc = nn.Sequential(
            nn.Linear(d_in, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, d_lat),
        )
        self.dec = nn.Sequential(
            nn.Linear(d_lat, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, d_in),
        )

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the batch ``x``."""
        z = self.enc(x)
        xh = self.dec(z)
        return xh, z

# Full-batch training: each epoch is one optimiser step over every row of X.
model = AE()
opt = optim.Adam(model.parameters(), lr=1e-3)
lossf = nn.MSELoss()

for epoch in range(50):
    opt.zero_grad()
    xh, z = model(X)
    loss = lossf(xh, X)
    loss.backward()
    opt.step()
    # Report progress every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print("epoch", epoch + 1, "loss", loss.item())

torch.save(
    model.state_dict(),
    os.path.join(save_models_to, "autoencoder_model.pt"),
)
print("Saved autoencoder weights")

# Export the latent codes next to the input data so a later notebook can plot them.
with torch.no_grad():
    Z = model(X)[1]
np.savetxt(
    os.path.join(base_path, "autoencoder_latent.csv"),
    Z.numpy(),
    delimiter=",",
)


In [None]:
# 05 — Autoencoder Feature Learning
import os, numpy as np, pandas as pd, torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import matplotlib.pyplot as plt

# Merged per-student feature table used as input for the cells below.
FEATURE_CSV = r"D:\IITB\STData\eye_features_all_students.csv"

# Make sure the output folders exist before anything is written to them.
for _out_dir in ("./models", "./notebooks/figures"):
    os.makedirs(_out_dir, exist_ok=True)

print("Using features from:", FEATURE_CSV)


In [None]:
feat = pd.read_csv(FEATURE_CSV)

# Candidate feature columns: everything except the identifier columns.
cols = [c for c in feat.columns if c not in ["student_id","subject","student","id"]]

# Keep only numeric (and boolean) columns. A stray text column would otherwise
# reach StandardScaler and fail with "could not convert string to float".
X = feat[cols].select_dtypes(include=["number", "bool"]).copy()
non_numeric = [c for c in cols if c not in X.columns]
if non_numeric:
    print("Dropping non-numeric columns:", non_numeric)

# Safety: remove all-NaN or constant columns (they carry no signal and a
# constant column has zero variance).
X = X.loc[:, X.notna().any(axis=0)]
X = X.loc[:, X.nunique(dropna=True) > 1]

print("Final feature columns:", list(X.columns))
print("Shape:", X.shape)

# Keep a student id per row (row order matches X) for the exports further down.
if "student_id" in feat.columns:
    stu_id = feat["student_id"].astype(int).values
else:
    # Fallback when the table has no id column: number the rows 1..N.
    stu_id = np.arange(len(feat)) + 1


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train.values)
X_train_s = scaler.transform(X_train.values)
X_val_s = scaler.transform(X_val.values)

# Persist the fitted scaler so the same scaling can be reapplied later.
joblib.dump(scaler, "./models/ae_scaler.pkl")

# Wrap the scaled arrays as float32 tensor datasets.
tr_ds = TensorDataset(torch.tensor(X_train_s, dtype=torch.float32))
va_ds = TensorDataset(torch.tensor(X_val_s, dtype=torch.float32))

# Shuffle only the training batches; validation order does not matter.
tr_dl = DataLoader(tr_ds, shuffle=True, batch_size=32)
va_dl = DataLoader(va_ds, shuffle=False, batch_size=64)

# Number of feature columns = width of the autoencoder input (echoed by the notebook).
input_dim = X.shape[1]
input_dim


In [None]:
class AE(nn.Module):
    """Small symmetric autoencoder: in_dim -> 16 -> 8 -> latent_dim -> 8 -> 16 -> in_dim."""

    def __init__(self, in_dim, latent_dim=2):
        super().__init__()
        widths = [in_dim, 16, 8, latent_dim]
        # The encoder is built before the decoder so parameter creation order is unchanged.
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive widths, with ReLU between them but not after the last."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        # Drop the trailing ReLU so the final layer stays linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Return ``(x_hat, z)``: the reconstruction of ``x`` and its latent code."""
        z = self.encoder(x)
        return self.decoder(z), z

# Instantiate the autoencoder with a 2-D bottleneck, plus its loss and optimiser.
model = AE(input_dim, latent_dim=2)
loss_fn = nn.MSELoss()
opt = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Notebook display: (total parameter count, module summary).
(sum(w.numel() for w in model.parameters()), model)


In [None]:
def _run_epoch(loader, n_rows, train):
    """Run one pass over ``loader`` and return the mean per-row reconstruction loss.

    With ``train=True`` the model is updated after every batch; with
    ``train=False`` it is switched to eval mode and gradients are disabled.
    """
    model.train(train)
    running = 0.0
    with torch.set_grad_enabled(train):
        for (batch,) in loader:
            if train:
                opt.zero_grad()
            recon, _ = model(batch)
            batch_loss = loss_fn(recon, batch)
            if train:
                batch_loss.backward()
                opt.step()
            # Weight by batch size so a short final batch does not skew the mean.
            running += batch_loss.item() * batch.size(0)
    return running / n_rows


best_val = np.inf
patience, wait = 15, 0
EPOCHS = 200

for ep in range(1, EPOCHS + 1):
    tr_loss = _run_epoch(tr_dl, len(tr_ds), train=True)
    va_loss = _run_epoch(va_dl, len(va_ds), train=False)

    # Only an improvement larger than 1e-6 counts; checkpoint it and reset the patience counter.
    improved = va_loss < best_val - 1e-6
    if improved:
        best_val, wait = va_loss, 0
        torch.save(model.state_dict(), "./models/autoencoder_model.pt")
    else:
        wait += 1

    if ep == 1 or ep % 10 == 0:
        print(f"Epoch {ep:03d} | train {tr_loss:.4f} | val {va_loss:.4f} | best {best_val:.4f}")

    # Stop once validation loss has not improved for `patience` consecutive epochs.
    if wait >= patience:
        print("Early stopping.")
        break

print("Best val loss:", best_val)


In [None]:
# Restore the best checkpoint written during training before extracting latents.
best_state = torch.load("./models/autoencoder_model.pt")
model.load_state_dict(best_state)
model.eval()

# Encode every row (train + validation) with the fitted scaler and the model.
X_all_s = scaler.transform(X.values)
X_all_t = torch.tensor(X_all_s, dtype=torch.float32)
with torch.no_grad():
    X_hat, Z = model(X_all_t)
# Per-row reconstruction error: mean squared difference across features.
recon_mse = (X_hat - X_all_t).pow(2).mean(dim=1).cpu().numpy()
Z_np = Z.cpu().numpy()

# One row per student: id, the two latent coordinates, reconstruction error.
ae_df = pd.DataFrame(Z_np[:, :2], columns=["z1", "z2"])
ae_df.insert(0, "student_id", stu_id)
ae_df["recon_mse"] = recon_mse

latent_csv = "./notebooks/figures/05_ae_latent.csv"
ae_df.to_csv(latent_csv, index=False)
print("Saved latent CSV:", latent_csv)

# Figure 1: latent scatter, coloured by reconstruction error.
plt.figure(figsize=(5,4))
sc = plt.scatter(ae_df["z1"], ae_df["z2"], c=ae_df["recon_mse"])
plt.colorbar(sc, label="Reconstruction MSE")
plt.title("Autoencoder latent (2-D)")
plt.xlabel("z1")
plt.ylabel("z2")
plt.tight_layout()
plt.savefig("./notebooks/figures/05_ae_latent.png", dpi=200)
plt.show()

# Figure 2: histogram of the reconstruction error.
plt.figure(figsize=(5,3))
plt.hist(ae_df["recon_mse"], bins=20)
plt.xlabel("Reconstruction MSE")
plt.ylabel("Count")
plt.title("AE reconstruction error")
plt.tight_layout()
plt.savefig("./notebooks/figures/05_ae_recon_error.png", dpi=200)
plt.show()

ae_df.head()


In [None]:
# Optional overlay: colour the latent scatter by the labels in the LDA label CSV.
# The whole cell is best-effort: any missing input is reported and skipped.
labels_path = "./notebooks/figures/04_labels_from_lda.csv"
required_cols = {"student_id", "label"}

if not os.path.exists(labels_path):
    print("No LDA label CSV found — skipping color-by-label plot.")
else:
    lab = pd.read_csv(labels_path)
    missing_cols = sorted(required_cols - set(lab.columns))
    if missing_cols:
        # Same policy as a missing file: say why and move on instead of raising KeyError.
        print(f"LDA label CSV is missing columns {missing_cols} — skipping color-by-label plot.")
    else:
        # Align the key dtype with ae_df so the merge matches on equal ids.
        lab["student_id"] = lab["student_id"].astype(int)
        # Left join keeps every row of ae_df; rows without a label get NaN and are not drawn.
        merged = ae_df.merge(lab, on="student_id", how="left")

        plt.figure(figsize=(5,4))
        for g in sorted(merged["label"].dropna().unique()):
            m = merged["label"] == g
            plt.scatter(merged.loc[m,"z1"], merged.loc[m,"z2"], label=f"label {int(g)}", alpha=0.8)
        plt.legend()
        plt.title("AE latent colored by LDA labels")
        plt.xlabel("z1")
        plt.ylabel("z2")
        plt.tight_layout()
        plt.savefig("./notebooks/figures/05_ae_latent_by_lda.png", dpi=200)
        plt.show()
