<a href="https://colab.research.google.com/github/ParasKanchan/Material_Science_Project/blob/main/Material_Science_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import argparse
import os
import numpy as np
import pandas as pd
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 64
MLP_HIDDEN = [128, 64]
DROPOUT = 0.2
LR = 1e-3
EPOCHS = 80
PATIENCE = 10
TARGET = "creep_time_h"

def load_and_prepare(csv_path: str, drop_cols: List[str] = None):
    df = pd.read_csv(csv_path)
    df = df.dropna(subset=[TARGET]).reset_index(drop=True)
    if drop_cols:
        for c in drop_cols:
            if c in df.columns:
                df = df.drop(columns=[c])
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    cat_cols = [c for c in cat_cols if c != TARGET]
    for c in cat_cols:
        df[c] = df[c].astype("category").cat.codes
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c != TARGET]
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    X = df[numeric_cols].values
    y = df[TARGET].values.astype(float)
    y = np.log1p(y)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    return Xs, y, numeric_cols, scaler

class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

class MLP(nn.Module):
    def __init__(self, n_in, hidden, p):
        super().__init__()
        layers = []
        prev = n_in
        for h in hidden:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p))
            prev = h
        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)
    def forward(self, x): return self.net(x)




In [41]:
def train_mlp(model, tr_loader, val_loader, epochs, lr, patience):
    model.to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    loss_fn = nn.MSELoss()
    best_loss = 1e9
    best_state = None
    no_imp = 0
    for ep in range(epochs):
        model.train()
        for xb, yb in tr_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            loss = loss_fn(model(xb), yb)
            opt.zero_grad(); loss.backward(); opt.step()
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                val_losses.append(loss_fn(model(xb), yb).item())
        val_loss = float(np.mean(val_losses))
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = model.state_dict()
            no_imp = 0
        else:
            no_imp += 1
            if no_imp >= patience:
                break
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [42]:
def evaluate(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    rmse_log = np.sqrt(mean_squared_error(y_true_log, y_pred_log))
    mae_raw = mean_absolute_error(y_true, y_pred)
    rel = mae_raw / np.median(y_true)
    return {"rmse_log": rmse_log, "mae_raw": mae_raw, "rel_mae_med": rel}

In [43]:
def train_xgb(X_train, y_train, X_val, y_val):
    params = {
        "objective": "reg:squarederror",
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "n_estimators": 500,
        "random_state": SEED,
        "verbosity": 0,
    }
    model = xgb.XGBRegressor(**params)

    try:
        early_stop_cb = xgb.callback.EarlyStopping(rounds=50, save_best=True)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stop_cb], verbose=False)
        return model
    except Exception:
        pass

    try:
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
        return model
    except Exception:
        pass

    model.fit(X_train, y_train, verbose=False)
    return model


In [44]:
def main(args):
    try:
        if not os.path.exists(args.data):
            raise FileNotFoundError(f"Data file not found: {args.data}")
        X, y, features, scaler = load_and_prepare(args.data)
        print("DATA SHAPE:", "X", X.shape, "y", y.shape, "features_count", len(features))
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=SEED)
        print("SPLITS:", X_tr.shape, X_val.shape if 'X_val' in globals() else "no X_val yet")
        X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=0.15, random_state=SEED)
        print("TRAIN/VAL shapes:", X_train.shape, X_val.shape)
        xgb_model = train_xgb(X_train, y_train, X_val, y_val)
        xgb_pred = xgb_model.predict(X_te)
        print("XGBoost:", evaluate(y_te, xgb_pred))
        ds_tr = TabularDataset(X_train, y_train)
        ds_val = TabularDataset(X_val, y_val)
        ds_te = TabularDataset(X_te, y_te)
        dl_tr = DataLoader(ds_tr, batch_size=BATCH_SIZE, shuffle=True)
        dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False)
        dl_te = DataLoader(ds_te, batch_size=BATCH_SIZE, shuffle=False)
        model = MLP(n_in=X.shape[1], hidden=MLP_HIDDEN, p=DROPOUT)
        model = train_mlp(model, dl_tr, dl_val, args.epochs, args.lr, args.patience)
        preds_list = []
        model.eval()
        with torch.no_grad():
            for xb, _ in dl_te:
                xb = xb.to(DEVICE)
                out = model(xb).cpu().numpy().ravel()
                preds_list.append(out)
        if len(preds_list) == 0:
            raise RuntimeError("No predictions produced: test DataLoader is empty.")
        preds = np.concatenate(preds_list)
        if preds.shape[0] != y_te.shape[0]:
            print("WARNING: prediction count", preds.shape[0], "!= test target count", y_te.shape[0])
            # try to align by trimming/padding if necessary
            min_len = min(preds.shape[0], y_te.shape[0])
            preds = preds[:min_len]
            y_te = y_te[:min_len]
        print("MLP:", evaluate(y_te, preds))
        os.makedirs(args.out_dir, exist_ok=True)
        xgb_model.save_model(os.path.join(args.out_dir, "xgb.json"))
        torch.save(model.state_dict(), os.path.join(args.out_dir, "mlp.pt"))
        import joblib
        joblib.dump(scaler, os.path.join(args.out_dir, "scaler.joblib"))
        joblib.dump(features, os.path.join(args.out_dir, "features.joblib"))
        print("Saved models and artifacts to:", args.out_dir)
    except Exception as e:
        import traceback, sys
        print("ERROR in main():", str(e))
        traceback.print_exc(file=sys.stdout)

if __name__ == "__main__":
    import sys
    p = argparse.ArgumentParser()
    p.add_argument("--data", required=True)
    p.add_argument("--out_dir", default="models")
    p.add_argument("--epochs", type=int, default=EPOCHS)
    p.add_argument("--lr", type=float, default=LR)
    p.add_argument("--patience", type=int, default=PATIENCE)
    if 'IPython' in sys.modules:
        args = p.parse_args(args=["--data", "realistic_synthetic_superalloy_dataset.csv", "--out_dir", "models"])
    else:
        args = p.parse_args()
    main(args)


DATA SHAPE: X (1200, 27) y (1200,) features_count 27
SPLITS: (960, 27) no X_val yet
TRAIN/VAL shapes: (816, 27) (144, 27)
XGBoost: {'rmse_log': np.float64(0.39832620132147456), 'mae_raw': 39.76926771589915, 'rel_mae_med': np.float64(0.8250371909611257)}
MLP: {'rmse_log': np.float64(0.5430716893051184), 'mae_raw': 52.76254599315325, 'rel_mae_med': np.float64(1.0945905025237692)}
Saved models and artifacts to: models
