In [1]:
# ============================================================
# 🔥 FINAL CNN + EMBEDDING EXTRACTION PIPELINE (FULL CODE)
# ============================================================

import warnings
warnings.filterwarnings('ignore')

# -------------------- 1. Load TRAIN & TEST CSV from path --------------------
import pandas as pd
import numpy as np

# Provide full path to your CSV files.
# Raw strings keep the Windows backslashes literal: in a regular string,
# "\D", "\P", "\d" and "\s" are invalid escape sequences, which Python reports
# with a warning today and plans to reject in a future release.
train_path = r"D:\Desktop\POC\data\synthetic_balanced_data_20000_60_40 (1).csv"   # <-- change this to your TRAIN CSV path
test_path = r"D:\Desktop\POC\data\synthetic_balanced_test_data_7000_50_50 (1).csv"     # <-- change this to your TEST CSV path

# Load CSVs (only used here to report the raw shapes)
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("âœ… TRAIN dataset shape:", train_df.shape)
print("âœ… TEST dataset shape:", test_df.shape)

# -------------------- 2. Column Rename + Structure ------------------
# Source-CSV header -> column name used by the rest of the pipeline.
COLUMN_MAP = {
    **{f'OpSet{i}': f'op_setting_{i}' for i in range(1, 4)},
    **{f'Sensor{i}': f'sensor_measurement_{i}' for i in range(1, 22)},
    'Label_RUL_30': 'RUL_binary',
}


def load_and_structure_data(file_path, fake_units):
    """Read a CSV, rename its columns and attach synthetic unit/cycle indices.

    Rows are assigned, in file order, to consecutive groups of
    ``ceil(n_rows / fake_units)`` rows. ``unit_number`` is the 1-based group
    id and ``time_in_cycles`` restarts at 1 inside every group. Both index
    arrays are truncated to the number of rows, so trailing groups can be
    shorter than the others (or absent).

    Args:
        file_path: Anything ``pandas.read_csv`` accepts.
        fake_units: Number of synthetic units to spread the rows over.

    Returns:
        Tuple ``(df, cycles_per_unit)``.
    """
    frame = pd.read_csv(file_path)
    frame.rename(columns=COLUMN_MAP, inplace=True)

    n_rows = len(frame)
    cycles_per_unit = int(np.ceil(n_rows / fake_units))

    unit_ids = range(1, fake_units + 1)
    cycle_ids = range(1, cycles_per_unit + 1)
    frame['unit_number'] = np.repeat(unit_ids, cycles_per_unit)[:n_rows]
    frame['time_in_cycles'] = np.tile(cycle_ids, fake_units)[:n_rows]

    return frame, cycles_per_unit

# Re-read both CSVs with renamed columns and synthetic unit/cycle indices:
# rows are spread over 1000 pseudo-units for TRAIN and 400 for TEST.
df_train, train_cycles = load_and_structure_data(train_path, fake_units=1000)
df_test, test_cycles   = load_and_structure_data(test_path,  fake_units=400)

# -------------------- 3. Scaling -------------------------------
from sklearn.preprocessing import MinMaxScaler

# Model inputs: the 3 operating settings plus 9 selected sensor channels,
# i.e. 12 feature columns in total.
selected_sensors = [2,3,4,7,11,12,15,20,21]
feature_cols = [f'op_setting_{i}' for i in range(1,4)] + [
    f'sensor_measurement_{i}' for i in selected_sensors
]

# Fit the scaler on TRAIN only, then apply the same transform to TEST.
scaler = MinMaxScaler()
df_train[feature_cols] = scaler.fit_transform(df_train[feature_cols])
df_test[feature_cols] = scaler.transform(df_test[feature_cols])

# -------------------- 4. Class Weights -------------------------
import torch

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", DEVICE)

labels = df_train['RUL_binary']
class_counts = labels.value_counts().sort_index()
# Inverse-frequency weights: n_samples / (2 * class_count), ordered by class id.
balanced = [len(labels) / (2 * count) for count in class_counts]
weights = torch.tensor(balanced).float().to(DEVICE)

print("Class distribution:", class_counts.to_dict())

# -------------------- 5. Dataset with 15-cycle window --------------
from torch.utils.data import Dataset, DataLoader

CONTEXT_LENGTH = 15  # Each prediction is made from a window of 15 consecutive cycles.


class CNNDataset(Dataset):
    """Sliding-window dataset over per-unit time series.

    For every unit (rows sharing ``unit_number``, ordered by
    ``time_in_cycles``) each run of ``context_length`` consecutive rows
    becomes one sample. The sample's label is the ``RUL_binary`` value of the
    window's last row. Units with fewer rows than the window contribute no
    samples.

    Args:
        df: DataFrame holding ``unit_number``, ``time_in_cycles``,
            ``RUL_binary`` and the feature columns.
        context_length: Window length in rows; defaults to the module-level
            ``CONTEXT_LENGTH``.
        feature_columns: Names of the feature columns; defaults to the
            module-level ``feature_cols`` list.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """

    def __init__(self, df, context_length=None, feature_columns=None):
        if context_length is None:
            context_length = CONTEXT_LENGTH
        if context_length < 1:
            raise ValueError("context_length must be at least 1")
        if feature_columns is None:
            feature_columns = feature_cols
        feature_columns = list(feature_columns)

        self.samples = []
        # One groupby pass instead of re-masking the whole frame for every
        # unit; sort=False keeps units in order of first appearance.
        for _, unit_rows in df.groupby('unit_number', sort=False):
            unit_rows = unit_rows.sort_values('time_in_cycles')
            # Convert once per unit so each window is a plain NumPy slice.
            features = unit_rows[feature_columns].to_numpy(dtype=np.float32)
            targets = unit_rows['RUL_binary'].to_numpy()
            for end in range(context_length, len(unit_rows) + 1):
                window = features[end - context_length:end]   # (context_length, n_features)
                self.samples.append((window, int(targets[end - 1])))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x, y = self.samples[idx]
        # Swap axes to (n_features, context_length) before returning.
        x = torch.tensor(x).permute(1, 0)
        return x, torch.tensor(y)

train_dataset = CNNDataset(df_train)
test_dataset  = CNNDataset(df_test)   # same sliding-window construction for both splits

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)    # batches of 32 windows, reshuffled each epoch
test_loader  = DataLoader(test_dataset, batch_size=32)

# -------------------- 6. CNN Model (Embedding Output) -----------------------------
import torch.nn as nn

class CNNBinaryClassifier(nn.Module):
    """Two-layer 1-D CNN returning class logits together with a pooled embedding.

    Takes input with 12 channels and returns ``(logits, embedding)``: the
    64-channel feature map is averaged over its last axis to form the
    embedding, and a linear layer maps that embedding to 2 logits.
    """

    def __init__(self):
        super().__init__()
        # kernel_size=3 with padding=1 leaves the sequence length unchanged.
        self.conv1 = nn.Conv1d(in_channels=12, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool1d(output_size=1)   # average over the whole sequence
        self.fc = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        hidden = self.relu(self.conv1(x))
        hidden = self.relu(self.conv2(hidden))
        embedding = self.pool(hidden).squeeze(-1)   # drop the length-1 pooled axis
        logits = self.fc(embedding)
        return logits, embedding

# Instantiate the CNN on the selected device.
model = CNNBinaryClassifier().to(DEVICE)

# -------------------- 7. Training Loop -----------------------------
# Class-weighted cross-entropy (weights computed above) optimised with Adam.
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 5
print("\nðŸš€ Training Started...\n")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0   # sum of per-batch losses for this epoch

    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits, _ = model(x)   # the embedding output is not needed for training
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}/{EPOCHS}  Loss: {total_loss/len(train_loader):.4f}")

print("\nðŸŽ‰ Training Complete!")

# -------------------- 8. Extract & Save Embeddings -----------------------------

def extract_embeddings(model, df, context_length=None, feature_columns=None, device=None):
    """Run ``model`` over every sliding window of ``df`` and tabulate the embeddings.

    Windows are built per unit (rows sharing ``unit_number``, ordered by
    ``time_in_cycles``). Each window is fed to the model on its own as a
    ``(1, n_features, context_length)`` tensor, and the second element of the
    model's output is flattened into the ``emb_*`` columns.

    Args:
        model: Module whose call returns ``(logits, embedding)``; it is put
            into eval mode.
        df: DataFrame holding ``unit_number``, ``time_in_cycles``,
            ``RUL_binary`` and the feature columns.
        context_length: Window length in rows; defaults to the module-level
            ``CONTEXT_LENGTH``.
        feature_columns: Feature column names; defaults to the module-level
            ``feature_cols``.
        device: Device the inputs are moved to; defaults to the module-level
            ``DEVICE``.

    Returns:
        DataFrame with one row per window: ``unit``, ``cycle`` (last cycle of
        the window), ``label`` (``RUL_binary`` of the window's last row) and
        ``emb_0 .. emb_{d-1}``.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """
    if context_length is None:
        context_length = CONTEXT_LENGTH
    if context_length < 1:
        raise ValueError("context_length must be at least 1")
    if feature_columns is None:
        feature_columns = feature_cols
    if device is None:
        device = DEVICE
    feature_columns = list(feature_columns)

    rows = []
    model.eval()

    # Inference only: a single no_grad scope covers the whole extraction.
    with torch.no_grad():
        # One groupby pass instead of re-masking the whole frame for every
        # unit; sort=False keeps units in order of first appearance.
        for unit, unit_rows in df.groupby('unit_number', sort=False):
            unit_rows = unit_rows.sort_values('time_in_cycles')
            features = unit_rows[feature_columns].to_numpy(dtype=np.float32)
            cycles = unit_rows['time_in_cycles'].to_numpy()
            targets = unit_rows['RUL_binary'].to_numpy()

            for end in range(context_length, len(unit_rows) + 1):
                window = features[end - context_length:end]
                x = torch.tensor(window).permute(1, 0).unsqueeze(0).to(device)
                _, embed = model(x)
                embed = embed.cpu().numpy().flatten()

                row = {
                    "unit": unit,
                    "cycle": cycles[end - 1],
                    "label": int(targets[end - 1]),
                }
                row.update({f"emb_{j}": value for j, value in enumerate(embed)})
                rows.append(row)

    return pd.DataFrame(rows)

# Compute one embedding row per sliding window for both splits.
print("\nðŸ“Œ Extracting Train Embeddings...")
train_embeddings = extract_embeddings(model, df_train)

print("ðŸ“Œ Extracting Test Embeddings...")
test_embeddings = extract_embeddings(model, df_test)

# Write to the current working directory; the later cells read these files back.
train_embeddings.to_csv("train_embeddings.csv", index=False)
test_embeddings.to_csv("test_embeddings.csv", index=False)

print("\nâœ… Saved Embeddings: train_embeddings.csv, test_embeddings.csv")

  train_path = "D:\Desktop\POC\data\synthetic_balanced_data_20000_60_40 (1).csv"   # <-- change this to your TRAIN CSV path
  test_path = "D:\Desktop\POC\data\synthetic_balanced_test_data_7000_50_50 (1).csv"     # <-- change this to your TEST CSV path


âœ… TRAIN dataset shape: (20000, 28)
âœ… TEST dataset shape: (7000, 25)
Using device: cpu
Class distribution: {0: 8000, 1: 12000}

ðŸš€ Training Started...

Epoch 1/5  Loss: 0.6790
Epoch 2/5  Loss: 0.4653
Epoch 3/5  Loss: 0.1702
Epoch 4/5  Loss: 0.0708
Epoch 5/5  Loss: 0.0466

ðŸŽ‰ Training Complete!

ðŸ“Œ Extracting Train Embeddings...
ðŸ“Œ Extracting Test Embeddings...

âœ… Saved Embeddings: train_embeddings.csv, test_embeddings.csv


In [2]:
# Quick visual check of the saved training embeddings (first five rows).
df = pd.read_csv("train_embeddings.csv")
df.head()

Unnamed: 0,unit,cycle,label,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,1,15,1,0.085568,0.595105,0.560874,0.098581,0.704059,0.104998,0.101497,...,0.109752,0.0,0.05044,0.065817,0.098282,0.088516,0.134741,0.056025,0.105022,0.632593
1,1,16,1,0.077187,0.613869,0.577578,0.087007,0.731853,0.097004,0.087825,...,0.093898,0.0,0.043127,0.061173,0.087722,0.078893,0.123661,0.048601,0.094412,0.656915
2,1,17,1,0.073958,0.628032,0.595915,0.079446,0.755311,0.101598,0.076585,...,0.087929,0.0,0.039694,0.056636,0.08647,0.077106,0.119925,0.048228,0.096901,0.676797
3,1,18,1,0.090098,0.62674,0.599017,0.100359,0.752985,0.11859,0.089461,...,0.107307,0.0,0.060533,0.07612,0.104738,0.106392,0.129216,0.069394,0.1207,0.67681
4,1,19,1,0.081789,0.61699,0.59633,0.090944,0.742616,0.102805,0.087863,...,0.10371,0.0,0.044602,0.065744,0.09674,0.094204,0.124595,0.061623,0.103902,0.666932


In [3]:
# Notebook shell escape: install Streamlit with whichever `pip` is on PATH.
!pip install streamlit

In [4]:
import streamlit as st
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# NOTE: the Streamlit calls only render when this code is launched as a script
# via `streamlit run <file>.py`; executed inside a notebook kernel, Streamlit
# just prints a hint to use that command.
emb_df = pd.read_csv("train_embeddings.csv")
emb_cols = [c for c in emb_df.columns if c.startswith("emb_")]

if not emb_cols:
    st.error("No embedding columns found in the CSV!")
elif emb_df.empty:
    # PCA cannot be fitted with zero samples (n_components would be 0).
    st.error("The embeddings CSV contains no rows!")
else:
    n_components = min(2, len(emb_cols), emb_df.shape[0])
    if n_components < 2:
        st.warning(f"Not enough data/features for 2 components. Using n_components={n_components}")

    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(emb_df[emb_cols])

    # With a single component there is no second column to index, so the
    # points are drawn on a flat line instead of raising IndexError.
    xs = reduced[:, 0]
    ys = reduced[:, 1] if reduced.shape[1] > 1 else [0.0] * len(xs)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(xs, ys, c=emb_df["label"], cmap="coolwarm")
    ax.set_title("PCA of CNN Embeddings")
    st.pyplot(fig)

2025-12-12 14:58:35.477 
  command:

    streamlit run C:\Users\sjadhav\AppData\Roaming\Python\Python313\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [11]:
# Re-select the compute device: CUDA when available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

Using device: cpu


In [12]:
!pip install 'accelerate>=0.26.0'
!pip install transformers[torch]

In [None]:
# ============================================================
# 🔥 CNN embeddings -> T5-small hybrid training & evaluation
# ============================================================

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5EncoderModel, Trainer, TrainingArguments, DataCollatorWithPadding
import transformers
import accelerate

# Report library versions up front; Trainer needs a sufficiently recent
# `accelerate` (the ImportError it raises asks for accelerate>=0.26.0).
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# CUDA when available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ---------- 1) load embedding CSVs ----------
train_emb_path = "train_embeddings.csv"
test_emb_path  = "test_embeddings.csv"

train_df = pd.read_csv(train_emb_path)
test_df  = pd.read_csv(test_emb_path)

print("Train rows:", len(train_df), "Test rows:", len(test_df))

# ---------- 2) sanity checks ----------
emb_cols = [c for c in train_df.columns if c.startswith("emb_")]
# Explicit raise rather than `assert`: assertions are stripped under `python -O`.
if not emb_cols:
    raise ValueError("No embedding columns found (must be emb_0 ... emb_n).")
# The TEST prompts are built from this same column list, so fail early with a
# clear message if the two files disagree.
missing_in_test = [c for c in emb_cols if c not in test_df.columns]
if missing_in_test:
    raise ValueError(f"Test embeddings are missing columns: {missing_in_test}")
print("Embedding dim detected:", len(emb_cols))

# ---------- 3) Build text prompts from embeddings ----------
def embedding_row_to_prompt(row, emb_cols=emb_cols, prefix="Embedding:"):
    """Render one embedding row as a text prompt.

    The values of ``emb_cols`` are formatted with six decimals, joined with
    ", ", placed after ``prefix`` and followed by the fixed question suffix.
    """
    formatted = ", ".join(format(row[col], ".6f") for col in emb_cols)
    return f"{prefix} {formatted} -> Predict failure (1 or 0):"

# Add a text prompt and an integer label column to both splits
# (the loss used later takes integer class indices).
for frame in (train_df, test_df):
    frame["prompt"] = frame.apply(embedding_row_to_prompt, axis=1)
    frame["label_int"] = frame["label"].astype(int)

# ---------- 4) class weights (for loss) ----------
labels = train_df["label_int"]
class_counts = labels.value_counts().sort_index()  # ascending class id: 0 then 1
# Inverse-frequency weights: n_samples / (2 * class_count).
per_class = len(labels) / (2 * class_counts)
weights = torch.tensor(per_class.to_numpy(), dtype=torch.float).to(DEVICE)
print("Class counts:", class_counts.to_dict(), "Weights:", weights.tolist())

# ---------- 5) Tokenizer & Dataset ----------
MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

CONTEXT_MAX_LENGTH = 256


class EmbeddingTextDataset(Dataset):
    """Dataset that tokenizes the ``prompt`` column lazily, one item at a time.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (padded or
    truncated to ``max_length``) plus a long ``labels`` tensor taken from the
    ``label_int`` column.

    NOTE(review): a prompt listing 64 six-decimal values may tokenize to more
    than 256 tokens, in which case truncation drops its tail; confirm against
    the tokenizer in use.
    """

    def __init__(self, df, tokenizer, max_length=CONTEXT_MAX_LENGTH):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompts = df["prompt"].tolist()
        self.labels = df["label_int"].tolist()

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.prompts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap both splits; the test split doubles as the Trainer's evaluation set.
train_dataset = EmbeddingTextDataset(train_df, tokenizer)
eval_dataset  = EmbeddingTextDataset(test_df, tokenizer)

print("Train dataset size:", len(train_dataset), "Eval dataset size:", len(eval_dataset))

# ---------- 6) Model: T5 encoder + classifier (same pattern as your earlier T5BinaryClassifier) ----------
class T5EmbeddingClassifier(nn.Module):
    """T5 encoder with masked mean pooling and a 2-way linear head.

    ``forward`` returns a dict with ``"logits"`` and ``"loss"``. The loss is
    class-weighted cross-entropy using the module-level ``weights`` tensor and
    is ``None`` when no ``labels`` are passed.
    """

    def __init__(self, model_name=MODEL_NAME):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.encoder.config.d_model, 2)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        if attention_mask is None:
            # The signature allows omitting the mask; treat every position as
            # a real token instead of failing on `None.unsqueeze`.
            attention_mask = torch.ones_like(input_ids)
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Mean over non-padding positions. The mask is cast to the encoder's
        # dtype so the arithmetic stays in one floating type, and the token
        # count is clamped at 1: it is a whole number, so the only case to
        # guard against is an all-padding row (count 0).
        mask = attention_mask.unsqueeze(-1).to(enc.dtype)
        summed = (enc * mask).sum(dim=1)
        lengths = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / lengths
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            # Keep the class weights on the same device as the logits.
            loss_fct = nn.CrossEntropyLoss(weight=weights.to(logits.device))
            loss = loss_fct(logits, labels)
        return {"loss": loss, "logits": logits}

# Build the T5-based classifier on the selected device.
model = T5EmbeddingClassifier().to(DEVICE)

# ---------- 7) Training arguments & Trainer ----------
# No checkpoints are written (save_strategy="no"); evaluation runs once per
# epoch and mixed precision is enabled only when CUDA is available.
training_args = TrainingArguments(
    output_dir="t5_on_embeddings",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir="t5_on_embeddings_logs",
    logging_steps=50,                    # log every 50 optimizer steps
    save_strategy="no",
    eval_strategy="epoch",
    fp16=torch.cuda.is_available()
)

# The datasets already pad every item to max_length, so the collator mainly
# stacks the per-item tensors into batches.
data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version.
# Constructing Trainer is also where the accelerate>=0.26.0 ImportError arises.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ---------- 8) Train ----------
print("Starting T5 training on embeddings...")
trainer.train()
print("T5 training complete.")

# ---------- 9) Evaluation helper (sample-level) ----------
def evaluate_model_trainer(trainer, dataset):
    """Print sample-level classification metrics for ``trainer.model`` on ``dataset``.

    Runs the model in eval mode over ``dataset`` in batches of 32, takes the
    argmax of the logits as the prediction, and prints accuracy, precision,
    recall, F1 and the confusion matrix.

    Returns:
        Tuple ``(all_labels, all_preds)`` of flat lists.
    """
    net = trainer.model
    net.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=32):
            outputs = net(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
            )
            preds = torch.argmax(outputs["logits"], dim=1)
            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so no device transfer is needed.
            all_labels.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1 = (
        metric(all_labels, all_preds, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(all_labels, all_preds)
    print(f"Accuracy: {acc*100:.2f}%\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    return all_labels, all_preds

# Sample-level metrics on both splits; the returned label/prediction lists
# are discarded.
print("\n--- Train evaluation (T5 on embeddings) ---")
_ = evaluate_model_trainer(trainer, train_dataset)

print("\n--- Test evaluation (T5 on embeddings) ---")
_ = evaluate_model_trainer(trainer, eval_dataset)

# ---------- 10) Unit-wise evaluation (last cycle per unit) ----------
# Keep, for every unit, only the row with the highest cycle number
# (sort by unit and cycle, then take the last row of each group).
unit_last = test_df.sort_values(["unit", "cycle"]).groupby("unit").tail(1).reset_index(drop=True)
print("Unit-wise rows:", len(unit_last))

# Make a small dataset/wrapper for these last-cycle prompts
class SimplePromptDataset(Dataset):
    """Dataset over parallel lists of prompts and integer labels.

    Items have the same layout as those of ``EmbeddingTextDataset``:
    ``input_ids`` and ``attention_mask`` padded or truncated to
    ``max_length``, plus a long ``labels`` tensor.
    """

    def __init__(self, prompts, labels, tokenizer, max_length=CONTEXT_MAX_LENGTH):
        self.prompts = prompts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.prompts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        sample = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Evaluate on one prompt per unit (the last-cycle rows selected above).
unit_dataset = SimplePromptDataset(unit_last["prompt"].tolist(), unit_last["label_int"].tolist(), tokenizer)
print("\n--- Unit-wise evaluation (T5 on last-cycle embeddings) ---")
_ = evaluate_model_trainer(trainer, unit_dataset)

# ---------- 11) Save model (optional) ----------
# Saving is disabled; uncomment the next line to write the model to disk.
# trainer.save_model("t5_on_embeddings_saved")

print("\nAll done. You can compare T5 results with your CNN baseline.")

Device: cpu
Train rows: 6000 Test rows: 1554
Embedding dim detected: 64
Class counts: {0: 2400, 1: 3600} Weights: [1.25, 0.8333333134651184]
Train dataset size: 6000 Eval dataset size: 1554


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`