Model


In [None]:
# 1️⃣ Install & Import Libraries
# !pip install -q transformers accelerate datasets torch pandas scikit-learn tqdm

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, T5EncoderModel, Trainer, TrainingArguments, DataCollatorWithPadding

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_backend)
print("Device:", DEVICE)


# -------------------- 1. Load TRAIN & TEST CSV from path --------------------
import pandas as pd
import numpy as np

# Provide full path to your CSV files.
# Raw strings keep the Windows backslashes literal: sequences such as "\D",
# "\P" or "\d" are invalid escapes in a normal string literal and are reported
# as a DeprecationWarning/SyntaxWarning by current Python versions.
train_path = r"D:\Desktop\POC\data\synthetic_balanced_data_20000_60_40 (1).csv"   # <-- change this to your TRAIN CSV path
test_path = r"D:\Desktop\POC\data\synthetic_balanced_test_data_7000_50_50 (1).csv"     # <-- change this to your TEST CSV path

# Load the raw CSVs once, only to report their shapes; the structured frames
# used for training are built separately by load_and_structure_data().
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("✅ TRAIN dataset shape:", train_df.shape)
print("✅ TEST dataset shape:", test_df.shape)

# --- Column Mapping ---
# Translate the raw CSV headers (OpSet1-3, Sensor1-21, Label_RUL_30) into the
# column names used by the rest of the pipeline.
COLUMN_MAP = {
    **{f'OpSet{n}': f'op_setting_{n}' for n in range(1, 4)},
    **{f'Sensor{n}': f'sensor_measurement_{n}' for n in range(1, 22)},
    'Label_RUL_30': 'RUL_binary',
}

# --- Load & structure data ---
def load_and_structure_data(file_path, total_rows, fake_units, start_unit=1, column_map=None):
    """Read a CSV and tag its rows with synthetic unit and cycle identifiers.

    Rows are split, in file order, into ``fake_units`` consecutive groups of
    ``total_rows // fake_units`` rows. Each group receives its own
    ``unit_number`` and a 1-based ``time_in_cycles`` counter. When the row
    count is not an exact multiple of ``fake_units`` the leftover rows are
    appended to the last unit instead of raising a length-mismatch error.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.
        total_rows: Maximum number of rows to use. Extra rows in the file are
            dropped; a shorter file is used in full.
        fake_units: Number of synthetic units to spread the rows over.
        start_unit: Identifier given to the first unit.
        column_map: Optional ``{old_name: new_name}`` rename mapping. Defaults
            to the module-level ``COLUMN_MAP``.

    Returns:
        The renamed DataFrame with added ``unit_number``, ``time_in_cycles``
        and ``RUL`` (all NaN) columns.

    Raises:
        ValueError: If ``fake_units`` is not a positive integer.
    """
    if fake_units <= 0:
        raise ValueError(f"fake_units must be positive, got {fake_units}")
    if column_map is None:
        column_map = COLUMN_MAP

    df = pd.read_csv(file_path).rename(columns=column_map)

    # Never index past the rows that actually exist in the file.
    n_rows = max(0, min(total_rows, len(df)))
    df = df.iloc[:n_rows].copy()

    cycles_per_unit = max(1, n_rows // fake_units)
    position = np.arange(n_rows)
    # Clamp to the last unit so remainder rows extend it rather than
    # spilling into an extra, unrequested unit.
    unit_index = np.minimum(position // cycles_per_unit, fake_units - 1)

    df['unit_number'] = start_unit + unit_index
    df['time_in_cycles'] = position - unit_index * cycles_per_unit + 1
    df['RUL'] = np.nan
    print(f"Loaded {len(df)} rows, injected {df['unit_number'].nunique()} fake units starting from {start_unit}.")
    return df

# Build the structured frames. Test units are numbered from 2000 upwards, so
# their ids do not overlap the 1000 training units that start at 1.
df_train = load_and_structure_data(train_path, total_rows=20000, fake_units=1000)
df_test = load_and_structure_data(test_path, total_rows=7000, fake_units=500, start_unit=2000)
# NOTE(review): 7000 // 500 gives 14 cycles per test unit, which is fewer than
# the CONTEXT_LENGTH of 15 used when windowing the data further down.

# --- Scale Features ---
selected_sensors = [2, 3, 4, 7, 11, 12, 15, 20, 21]
feature_cols = [f'op_setting_{k}' for k in (1, 2, 3)]
feature_cols += [f'sensor_measurement_{k}' for k in selected_sensors]

# Learn the min/max ranges on the training split only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler().fit(df_train[feature_cols])
df_train[feature_cols] = scaler.transform(df_train[feature_cols])
df_test[feature_cols] = scaler.transform(df_test[feature_cols])

# --- Class weights ---
# Inverse-frequency weighting: each class gets n_samples / (2 * class_count),
# ordered by ascending label value.
labels = df_train['RUL_binary']
class_counts = labels.value_counts().sort_index()
per_class_weight = [len(labels) / (2 * count) for count in class_counts]
weights = torch.tensor(per_class_weight, dtype=torch.float).to(DEVICE)
print("Class counts:", class_counts.to_dict(), "Weights:", weights.tolist())

# --- Context Dataset ---
CONTEXT_LENGTH = 15

def create_prompt(context_df, feature_columns=None):
    """Render a window of cycles as one text line per cycle.

    Each line has the form ``Cycle (t-k): s2=0.123, s3=0.456, ...`` where
    ``k`` counts back from the last row of ``context_df`` (the last row is
    labelled ``t``) and every sensor is labelled with the numeric suffix of
    its column name (``sensor_measurement_7`` -> ``s7``).

    Args:
        context_df: DataFrame whose rows are ordered oldest to newest.
        feature_columns: Candidate column names; only those containing
            ``'sensor'`` are rendered. Defaults to the module-level
            ``feature_cols``.

    Returns:
        The prompt string, one newline-terminated line per row.
    """
    if feature_columns is None:
        feature_columns = feature_cols
    sensor_cols = [col for col in feature_columns if 'sensor' in col]
    # Label each value with its own sensor number rather than the row index.
    sensor_tags = [f"s{col.rsplit('_', 1)[-1]}" for col in sensor_cols]

    # Offsets are derived from the actual window length so the newest row is
    # always "t", even when the window is shorter than CONTEXT_LENGTH.
    n_rows = len(context_df)
    lines = []
    for pos, values in enumerate(context_df[sensor_cols].itertuples(index=False, name=None)):
        offset = n_rows - 1 - pos
        cycle_idx = f"t-{offset}" if offset else "t"
        sensors_str = ", ".join(f"{tag}={value:.3f}" for tag, value in zip(sensor_tags, values))
        lines.append(f"Cycle ({cycle_idx}): {sensors_str}\n")
    return "".join(lines)

class ContextDataset(Dataset):
    """Sliding-window dataset that turns per-unit cycle history into prompts.

    For every unit the rows are ordered by ``time_in_cycles`` and one sample
    is produced per window of ``context_length`` consecutive cycles. The
    sample label is the ``RUL_binary`` value of the window's last row. A unit
    with fewer cycles than ``context_length`` contributes a single sample made
    of all its cycles, mirroring what ``predict_unitwise`` does at inference
    time, so short units are not silently dropped.

    Args:
        df: DataFrame with ``unit_number``, ``time_in_cycles``, ``RUL_binary``
            and the feature columns consumed by ``create_prompt``.
        tokenizer: Callable tokenizer used in ``__getitem__``.
        context_length: Window size in cycles. Defaults to the module-level
            ``CONTEXT_LENGTH``.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """

    def __init__(self, df, tokenizer, context_length=None):
        if context_length is None:
            context_length = CONTEXT_LENGTH
        if context_length < 1:
            raise ValueError(f"context_length must be >= 1, got {context_length}")
        self.tokenizer = tokenizer
        self.context_length = context_length
        self.samples = []
        # sort=False keeps units in order of first appearance in df.
        for _, unit_df in df.groupby('unit_number', sort=False):
            unit_df = unit_df.sort_values('time_in_cycles')
            n_cycles = len(unit_df)
            first_end = min(context_length, n_cycles)
            for end in range(first_end, n_cycles + 1):
                ctx = unit_df.iloc[max(0, end - context_length):end]
                label = int(ctx['RUL_binary'].iloc[-1])
                self.samples.append((ctx, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return input ids, mask and label."""
        ctx, label = self.samples[idx]
        prompt = create_prompt(ctx)
        # Use the tokenizer given to this dataset, not a module-level global.
        # NOTE(review): prompts longer than 512 tokens are truncated; verify
        # that the most recent cycles survive the truncation.
        inputs = self.tokenizer(prompt, truncation=True, padding='max_length',
                                max_length=512, return_tensors='pt')
        return {'input_ids': inputs['input_ids'].squeeze(0),
                'attention_mask': inputs['attention_mask'].squeeze(0),
                'labels': torch.tensor(label)}

# --- Tokenizer & Model ---
# Checkpoint name shared by the tokenizer below and, by default, by
# T5BinaryClassifier.
MODEL_NAME = "t5-small"
# Module-level tokenizer for MODEL_NAME; it is passed to the datasets, the
# data collator, the Trainer and predict_unitwise.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
class T5BinaryClassifier(nn.Module):
    """T5 encoder followed by a linear head that emits two-class logits.

    Args:
        model_name: Checkpoint passed to ``T5EncoderModel.from_pretrained``.
            Defaults to the module-level ``MODEL_NAME``.
        class_weights: Optional 1-D tensor of per-class loss weights. Defaults
            to the module-level ``weights`` tensor.
    """

    def __init__(self, model_name=None, class_weights=None):
        super().__init__()
        # Resolve the defaults at construction time instead of at
        # class-definition time / on every forward pass.
        if model_name is None:
            model_name = MODEL_NAME
        if class_weights is None:
            class_weights = weights
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.encoder.config.d_model, 2)
        # Kept as a plain attribute (not a registered buffer) so the keys of
        # state_dict() stay exactly as before.
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``{'loss': ..., 'logits': ...}``; loss is None without labels."""
        hidden = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # The hidden state of the first token is used as the sequence summary.
        # NOTE(review): T5 inputs carry no dedicated [CLS]-style token, so
        # attention-masked mean pooling may be a better summary - confirm.
        logits = self.classifier(hidden[:, 0, :])
        loss = None
        if labels is not None:
            weight = self.class_weights
            if weight is not None:
                weight = weight.to(logits.device)
            loss = nn.functional.cross_entropy(logits, labels, weight=weight)
        return {'loss': loss, 'logits': logits}

model = T5BinaryClassifier().to(DEVICE)

# --- Prepare Datasets ---
train_dataset = ContextDataset(df_train, tokenizer)
eval_dataset  = ContextDataset(df_test, tokenizer)  # Or a validation split from train

# Per-epoch evaluation needs at least one evaluation sample; fall back to
# training without it instead of handing the Trainer an empty dataset.
has_eval_samples = len(eval_dataset) > 0
if not has_eval_samples:
    print("Warning: eval_dataset is empty; per-epoch evaluation is disabled.")

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir="D:/Desktop/POC/data/t5_rul_binary",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Mixed precision is only requested when a CUDA device is present; the
    # script otherwise runs on the CPU (see DEVICE).
    fp16=torch.cuda.is_available(),
    # Raw string: "\D", "\P", "\d" and "\l" are invalid escape sequences in a
    # normal string literal.
    logging_dir=r'D:\Desktop\POC\data\logs',
    logging_steps=50,
    save_strategy='no',
    eval_strategy='epoch' if has_eval_samples else 'no',
)

# Samples are already padded to max_length by ContextDataset; the collator
# stacks them into batches.
data_collator = DataCollatorWithPadding(tokenizer)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset if has_eval_samples else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- Start Training ---
print("Starting training...")
trainer.train()
print("Training complete!")

# --- Predictions & Metrics ---
def evaluate_model(trainer, dataset, batch_size=4):
    """Run the trainer's model over ``dataset`` and print binary metrics.

    Args:
        trainer: Object exposing the trained network as ``trainer.model``.
        dataset: Dataset whose items are dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        batch_size: Number of samples per inference batch.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``confusion_matrix``; an empty dict when ``dataset`` yields no samples.
    """
    model = trainer.model
    model.eval()
    # Follow the model's own device rather than a module-level constant.
    device = next(model.parameters()).device

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=batch_size):
            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_mask'].to(device))
            preds = torch.argmax(outputs['logits'], dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch['labels'].tolist())

    # The metric functions cannot work on empty inputs, so report and bail out.
    if not all_labels:
        print("No samples to evaluate; skipping metrics.")
        return {}

    # zero_division=0 matches the unit-wise evaluation and avoids warnings
    # when a class is never predicted.
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    # Fixing the label set keeps the matrix 2x2 even if one class is absent.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
    print(f"Accuracy: {acc*100:.2f}%\nPrecision: {prec:.2f}\nRecall: {rec:.2f}\nF1: {f1:.2f}")
    print("Confusion Matrix:\n", cm)
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1,
            'confusion_matrix': cm}

# Report window-level metrics on both splits, training split first.
for banner, split in (
    ("\n--- Train Evaluation ---", train_dataset),
    ("\n--- Test Evaluation ---", eval_dataset),
):
    print(banner)
    evaluate_model(trainer, split)


Device: cpu
✅ TRAIN dataset shape: (20000, 28)
✅ TEST dataset shape: (7000, 25)
Loaded 20000 rows, injected 1000 fake units starting from 1.
Loaded 7000 rows, injected 500 fake units starting from 2000.
Class counts: {0: 8000, 1: 12000} Weights: [1.25, 0.8333333134651184]
Starting training...


Epoch,Training Loss,Validation Loss


In [None]:
# --- 16️⃣ Unit-wise Predictions on Test Set ---
def predict_unitwise(model, tokenizer, test_df, feature_columns, context_length=15):
    """Predict one label per unit from that unit's most recent cycles.

    For each distinct ``unit_number`` the rows are ordered by
    ``time_in_cycles`` and the last ``context_length`` of them (or all rows if
    the unit has fewer) form the prompt. The reference label is the
    ``RUL_binary`` value of the newest row.

    Args:
        model: Network returning a dict with a ``'logits'`` entry.
        tokenizer: Tokenizer used to encode the prompt.
        test_df: DataFrame with ``unit_number``, ``time_in_cycles``,
            ``RUL_binary`` and the columns read by ``create_prompt``.
        feature_columns: Accepted for interface compatibility; not used in
            the body (``create_prompt`` selects its own columns).
        context_length: Maximum number of trailing cycles in the prompt.

    Returns:
        Tuple ``(true_labels, predictions)`` of NumPy arrays, one entry per unit.
    """
    model.eval()
    unit_ids = test_df['unit_number'].unique()
    actual, predicted = [], []

    for uid in tqdm(unit_ids, desc="Predicting per unit"):
        history = test_df.loc[test_df['unit_number'] == uid].sort_values('time_in_cycles')
        # tail() already returns every row when the unit is shorter than
        # the requested window.
        window = history.tail(context_length)

        actual.append(int(window['RUL_binary'].iloc[-1]))

        encoded = tokenizer(
            create_prompt(window),
            return_tensors='pt',
            truncation=True,
            max_length=512,
        ).to(DEVICE)

        with torch.no_grad():
            result = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
            predicted.append(torch.argmax(result['logits'], dim=1).item())

    return np.array(actual), np.array(predicted)

# --- 17️⃣ Evaluate Unit-wise ---
print("\n--- Unit-wise Test Set Evaluation ---")
true_ruls, predicted_ruls = predict_unitwise(model, tokenizer, df_test, feature_cols, CONTEXT_LENGTH)

# One prediction per unit; precision/recall/F1 fall back to 0 when undefined.
acc = accuracy_score(true_ruls, predicted_ruls)
prec, rec, f1 = (
    score(true_ruls, predicted_ruls, zero_division=0)
    for score in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(true_ruls, predicted_ruls)

for line in (
    f"Accuracy: {acc*100:.2f}%",
    f"Precision: {prec:.2f}",
    f"Recall: {rec:.2f}",
    f"F1 Score: {f1:.2f}",
):
    print(line)
print("Confusion Matrix (rows: true, cols: predicted):\n", cm)


--- Unit-wise Test Set Evaluation ---


Predicting per unit: 100%|██████████| 500/500 [00:05<00:00, 84.06it/s]

Accuracy: 100.00%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
Confusion Matrix (rows: true, cols: predicted):
 [[250   0]
 [  0 250]]





In [None]:
import os
import torch
# Persist the fine-tuned T5 classifier weights (state dict only).
model_path = "artifacts/models/llm_model.pt"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

print(f"✔ LLM model saved at {model_path}")