In [None]:
#!/usr/bin/env python3
# Inference.py — loads model artifacts, schema.json, generates synthetic rows.

import argparse
import json
import pickle
import types
from pathlib import Path

import pandas as pd
import torch
from tqdm.auto import tqdm


def load_notebook_module(nb_path: Path, module_name: str = "finshield_nb") -> types.ModuleType:
    """Run every code cell of a notebook inside a fresh module and return it.

    The notebook at ``nb_path`` is read as nbformat v4. Each code cell is passed
    through IPython's input transformer and then executed, in notebook order,
    with the new module's ``__dict__`` as its global namespace. Names defined by
    the cells therefore become attributes of the returned module.

    NOTE(security): this ``exec``s the notebook's code; only point it at
    notebooks you trust.
    """
    import nbformat
    from IPython.core.interactiveshell import InteractiveShell

    notebook = nbformat.read(str(nb_path), as_version=4)
    module = types.ModuleType(module_name)
    transformer = InteractiveShell.instance().input_transformer_manager
    namespace = vars(module)

    code_cells = (cell for cell in notebook.cells if cell.cell_type == "code")
    for cell in code_cells:
        exec(transformer.transform_cell(cell.source), namespace)

    return module


class _RenameUnpickler(pickle.Unpickler):
    """Unpickler that maps any class named ``SimpleTokenizer`` to a given class.

    The module recorded in the pickle is ignored for that one class name; every
    other global is resolved by the standard ``pickle.Unpickler`` lookup.
    """

    def __init__(self, file, simple_tokenizer_cls):
        super().__init__(file)
        # Class handed back whenever the stream references "SimpleTokenizer".
        self._st_cls = simple_tokenizer_cls

    def find_class(self, module, name):
        """Return the replacement class for ``SimpleTokenizer``, else defer to pickle."""
        return self._st_cls if name == "SimpleTokenizer" else super().find_class(module, name)


def _load_tokenizer(path: Path, simple_tokenizer_cls):
    """Unpickle the tokenizer stored at ``path``.

    A plain ``pickle.load`` is tried first. If the class recorded in the pickle
    cannot be resolved, the file is re-read with ``_RenameUnpickler`` so that
    ``SimpleTokenizer`` resolves to ``simple_tokenizer_cls``.

    Two failure modes trigger the fallback:
    - ``AttributeError``: the recorded module imports fine but has no such class
      (e.g. the pickle was written from ``__main__``).
    - ``ImportError`` / ``ModuleNotFoundError``: the recorded module itself is
      not importable in this process.

    NOTE(security): unpickling executes code embedded in the file; only load
    tokenizer files from a trusted source.
    """
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except (AttributeError, ImportError):
        # Class lookup failed; retry with the SimpleTokenizer name remapped.
        with open(path, "rb") as f:
            return _RenameUnpickler(f, simple_tokenizer_cls).load()


def sync_globals(nbmod, schema: dict, meta: dict):
    """Copy schema lists and token maps onto ``nbmod`` as attributes.

    Sets ``token_maps`` (from ``meta``) and ``column_order``,
    ``categorical_columns``, ``numerical_columns`` (from ``schema``). A missing
    key yields an empty dict / list respectively.
    """
    # (attribute on nbmod, source mapping, key in source, factory for the default)
    bindings = (
        ("token_maps", meta, "token_maps", dict),
        ("column_order", schema, "columns", list),
        ("categorical_columns", schema, "categoricals", list),
        ("numerical_columns", schema, "numericals", list),
    )
    for attr_name, source, key, make_default in bindings:
        setattr(nbmod, attr_name, source.get(key, make_default()))


def build_token_constraints(tokenizer, meta):
    """Map each ``<CAT_col>`` tag to the vocabulary ids of that column's value tokens.

    For every column in ``meta["token_maps"]`` the value tokens are looked up in
    ``tokenizer.token2id``; tokens absent from the vocabulary are dropped. A
    column whose tokens are all unknown (or that has none) maps to ``[]``.
    """

    def known_ids(value_map):
        # Vocabulary ids of the value tokens, skipping any the tokenizer lacks.
        looked_up = (tokenizer.token2id.get(value_token) for value_token in value_map.values())
        return [token_id for token_id in looked_up if token_id is not None]

    return {
        f"<CAT_{column}>": known_ids(value_map)
        for column, value_map in meta.get("token_maps", {}).items()
    }


def postprocess_dataframe(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Reorder columns to the schema and coerce column dtypes.

    - Columns are reindexed to ``schema["columns"]`` (falling back to the
      frame's current columns); columns not listed are dropped and listed
      columns that are absent are created empty.
    - Categorical columns: the ``"__UNK__"`` placeholder becomes ``pd.NA`` and
      the column is cast to pandas' ``"string"`` dtype.
    - Numerical columns: parsed with ``pd.to_numeric``; unparseable entries
      become NaN.

    Returns the new frame produced by ``reindex``.
    """
    column_order = schema.get("columns", list(df.columns))
    result = df.reindex(columns=column_order)

    categorical = [name for name in schema.get("categoricals", []) if name in result.columns]
    for name in categorical:
        cleaned = result[name].replace({"__UNK__": pd.NA})
        result[name] = cleaned.astype("string")

    numerical = [name for name in schema.get("numericals", []) if name in result.columns]
    for name in numerical:
        result[name] = pd.to_numeric(result[name], errors="coerce")

    return result


def resolve_artifact_paths(model_dir: Path, dataset_name: str):
    """Return ``(tokenizer_path, weights_path, meta_path)`` under ``model_dir``.

    The tokenizer and meta paths are built without checking that they exist.
    The weights path is the first existing file among
    ``<dataset>_transformer_model.pt`` and ``<dataset>_transformer.pt`` (in that
    order), or ``None`` when neither is present.
    """
    weights_path = None
    for suffix in ("_transformer_model.pt", "_transformer.pt"):
        candidate = model_dir / f"{dataset_name}{suffix}"
        if candidate.exists():
            weights_path = candidate
            break

    tokenizer_path = model_dir / f"{dataset_name}_tokenizer.pkl"
    return tokenizer_path, weights_path, model_dir / "meta.json"


def main():
    """CLI entry point: load artifacts, sample synthetic rows, write them to CSV.

    Steps:
    1. Execute the training notebook (``--nb_path``) to obtain ``GPTMini``,
       ``SimpleTokenizer`` and ``generate_synthetic_row``.
    2. Load the tokenizer pickle, ``meta.json`` and model weights from
       ``<models_dir>/<dataset_name>/`` and ``schema.json`` from
       ``data/processed/<dataset_name>/``.
    3. Sample ``--n_rows`` rows; rows for which the generator returns ``None``
       or an empty dict are skipped.
    4. Post-process to the schema and write the CSV to ``--out``.

    Raises:
        FileNotFoundError: if the tokenizer, weights, meta or schema file is missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--models_dir", type=Path, default=Path("models"))
    ap.add_argument("--dataset_name", required=True)
    ap.add_argument("--nb_path", type=Path, required=True)
    ap.add_argument("--n_rows", type=int, default=1000)
    ap.add_argument("--max_len", type=int, default=1024)
    ap.add_argument("--out", type=Path, default=Path("data/processed/synthetic_output.csv"))
    args = ap.parse_args()

    nbmod = load_notebook_module(args.nb_path, module_name="finshield_nb")
    GPTMini = nbmod.GPTMini
    SimpleTokenizer = nbmod.SimpleTokenizer
    generate_synthetic_row = nbmod.generate_synthetic_row

    # Resolve artifact paths
    model_dir = args.models_dir / args.dataset_name
    tok_path, mdl_path, meta_path = resolve_artifact_paths(model_dir, args.dataset_name)

    if not tok_path.exists():
        raise FileNotFoundError(f"Tokenizer not found: {tok_path}")
    if mdl_path is None:
        raise FileNotFoundError(f"Model weights not found for {args.dataset_name}")
    if not meta_path.exists():
        raise FileNotFoundError(f"Meta not found: {meta_path}")

    # Load tokenizer & meta
    tokenizer = _load_tokenizer(tok_path, SimpleTokenizer)
    meta = json.loads(meta_path.read_text(encoding="utf-8"))

    # Load schema.json from processed dataset
    schema_path = Path("data/processed") / args.dataset_name / "schema.json"
    if not schema_path.exists():
        raise FileNotFoundError(f"Schema not found: {schema_path}")
    schema = json.loads(schema_path.read_text(encoding="utf-8"))

    # Sync schema+meta into notebook module (the generator reads them as globals)
    sync_globals(nbmod, schema, meta)

    # Load model; weights are read onto CPU and copied into the model on `device`
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPTMini(vocab_size=len(tokenizer.token2id)).to(device)
    state = torch.load(mdl_path, map_location="cpu")
    model.load_state_dict(state)
    model.eval()

    # The generator feeds the model a sequence that grows by one token per step,
    # up to max_len tokens. A model with a fixed-length positional table (exposed
    # as a 3-D `pos` tensor, sequence length on dim 1) cannot take more tokens
    # than that table holds, so cap the generation length instead of crashing on
    # a row that never terminates.
    max_len = args.max_len
    pos_table = getattr(model, "pos", None)
    if isinstance(pos_table, torch.Tensor) and pos_table.dim() == 3:
        context_len = pos_table.size(1)
        if max_len > context_len:
            print(f"--max_len {max_len} exceeds model context {context_len}; using {context_len}")
            max_len = context_len

    token_constraints = build_token_constraints(tokenizer, meta)

    # Generate rows
    rows = []
    for _ in tqdm(range(args.n_rows), desc=f"Generating {args.dataset_name} rows"):
        r = generate_synthetic_row(
            model=model,
            tokenizer=tokenizer,
            device=device,
            token_constraints=token_constraints,
            max_len=max_len,
        )
        if r:
            rows.append(r)

    df = pd.DataFrame(rows)
    df = postprocess_dataframe(df, schema)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.out, index=False)
    print(f"Saved synthetic data: {args.out} | rows={len(df)}")


# Start generation only when run as a script; when this code is imported or
# exec'd in a namespace whose __name__ is not "__main__", nothing runs.
if __name__ == "__main__":
    main()


HOME CREDIT — loaded.
columns: ['DEFAULT', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'] ...
categoricals: ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
numericals: ['TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'OWN_CAR_AGE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'BUREAU_COUNT', 'PREVAPP_COUNT', 'POS_COUNT', 'INST_COUNT', 'CC_COUNT']
example token_maps key: CODE_GENDER


In [3]:
import pandas as pd
import numpy as np

# Assign every distinct categorical value a token "T<k>". The counter is shared
# across columns, so token names are unique over the whole table.
token_counter = 0
for col in categorical_columns:
    unique_vals = df[col].unique()
    token_names = (f"T{index}" for index in range(token_counter, token_counter + len(unique_vals)))
    mapping = dict(zip(unique_vals, token_names))
    token_maps[col] = mapping
    token_counter += len(mapping)

BASE = 16        # radix of the digit tokens
SCALE = 100      # keep 2 decimals after log1p
NUM_DIGITS = 6   # digit tokens emitted per number (after the sign)
def number_to_tokens_fixed(value):
    """
    Turn one numeric value into a fixed-length token list: [sign] + NUM_DIGITS digits.

    - sign is 'P' for value >= 0, otherwise 'N'.
    - magnitude is round(log1p(|value|) * SCALE), written in base BASE,
      most-significant digit first, zero-padded to NUM_DIGITS digits.
    - each digit token is the decimal string of the digit ('0' .. str(BASE - 1)).

    Missing (pd.isna) -> ['P'] + ['NAN'] * NUM_DIGITS
    """
    if pd.isna(value):
        return ['P'] + ['NAN'] * NUM_DIGITS

    sign = 'P' if value >= 0 else 'N'

    # compress the magnitude in log-space, then quantise to an integer
    remaining = int(round(np.log1p(abs(float(value))) * SCALE))

    # fill digit slots from least- to most-significant
    digits = [''] * NUM_DIGITS
    for slot in range(NUM_DIGITS - 1, -1, -1):
        remaining, digit = divmod(remaining, BASE)
        digits[slot] = str(digit)

    return [sign, *digits]

def tokenize_row(row):
    """Flatten one table row into its token sequence, ending with ``<EOR>``.

    Columns are visited in ``column_order``. A categorical column contributes
    ``<CAT_col>`` followed by its value token from ``token_maps`` (``"T_NAN"``
    when the value has no mapping). Any other column contributes ``<NUM_col>``
    followed by the tokens from ``number_to_tokens_fixed``.
    """
    tokens = []
    for col in column_order:
        if col not in categorical_columns:
            tokens += [f"<NUM_{col}>", *number_to_tokens_fixed(row[col])]
            continue
        tokens += [f"<CAT_{col}>", token_maps[col].get(row[col], "T_NAN")]
    tokens.append("<EOR>")
    return tokens

class SimpleTokenizer:
    """Token <-> integer-id vocabulary with three reserved special tokens.

    Ids 0, 1, 2 are ``<PAD>``, ``<BOS>``, ``<EOS>``; further tokens receive
    consecutive ids in the order ``fit`` first sees them.
    """

    def __init__(self):
        specials = ("<PAD>", "<BOS>", "<EOS>")
        self.token2id = {token: index for index, token in enumerate(specials)}
        self.id2token = dict(enumerate(specials))
        self.next_id = len(specials)  # id the next unseen token will get

    def fit(self, sequences):
        """Add every not-yet-known token found in ``sequences`` to the vocabulary."""
        for seq in sequences:
            for token in seq:
                if token in self.token2id:
                    continue
                new_id = self.next_id
                self.token2id[token] = new_id
                self.id2token[new_id] = token
                self.next_id = new_id + 1

    def encode(self, seq):
        """Return ids for ``seq`` wrapped in ``<BOS>`` ... ``<EOS>``; unknown tokens raise KeyError."""
        lookup = self.token2id
        body = [lookup[token] for token in seq]
        return [lookup["<BOS>"], *body, lookup["<EOS>"]]

    def decode(self, ids):
        """Return the tokens for ``ids``, dropping the special ids 0, 1 and 2."""
        special_ids = (0, 1, 2)
        return [self.id2token[token_id] for token_id in ids if token_id not in special_ids]


In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pickle
from tqdm.auto import tqdm
class TabularDataset(Dataset):
    """Next-token-prediction dataset over tokenized rows.

    Every sequence is encoded once with ``tokenizer.encode`` at construction.
    Item ``idx`` is the pair ``(ids[:-1], ids[1:])``, i.e. the input and the
    same sequence shifted left by one position as the target.
    """

    def __init__(self, sequences, tokenizer):
        encode = tokenizer.encode
        self.data = [encode(seq) for seq in sequences]

    def __getitem__(self, idx):
        ids = self.data[idx]
        inputs = torch.tensor(ids[:-1])
        targets = torch.tensor(ids[1:])
        return inputs, targets

    def __len__(self):
        return len(self.data)

# --- Model ---
class CustomDecoderLayer(nn.Module):
    """Decoder block: self-attention then a 4x-wide feed-forward, each post-normed.

    Both sub-blocks add their (dropout-regularised) output to their input and
    apply LayerNorm to the sum. Inputs are passed to ``nn.MultiheadAttention``
    constructed without ``batch_first``, so ``tgt`` is expected in that module's
    default layout.
    """

    def __init__(self, d_model, nhead, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model * 4, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, tgt, tgt_mask=None):
        """Apply the block to ``tgt``; ``tgt_mask`` is forwarded as the attention mask."""
        # Self-attention block (query = key = value = tgt)
        attn_output, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
        tgt2 = self.norm1(tgt + self.dropout1(attn_output))

        # Feed-forward block. ReLU is reached through `nn`, which this cell
        # imports, rather than through an `F` alias defined elsewhere.
        ff_output = self.linear2(self.dropout(nn.functional.relu(self.linear1(tgt2))))
        tgt3 = self.norm2(tgt2 + self.dropout2(ff_output))

        return tgt3

class GPTMini(nn.Module):
    """Small decoder-only transformer over token ids with a learned positional table.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        d_model: embedding / hidden width.
        nhead: attention heads per layer.
        num_layers: number of ``CustomDecoderLayer`` blocks.
        dropout: dropout rate for the embedding and every layer.
        max_seq_len: longest sequence the positional table supports. The
            default (512) keeps the ``pos`` parameter shape of checkpoints
            saved before this argument existed.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=4, dropout=0.2, max_seq_len=512):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = nn.Parameter(torch.zeros(1, max_seq_len, d_model))
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList([
            CustomDecoderLayer(d_model, nhead, dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits of shape (batch, seq_len, vocab_size) for ids ``x`` of shape (batch, seq_len).

        Raises:
            RuntimeError: if ``seq_len`` exceeds the positional table. The type
                matches the broadcast error the unchecked addition would raise,
                but the message names the actual limit.
        """
        seq_len = x.size(1)
        context_len = self.pos.size(1)
        if seq_len > context_len:
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the model's maximum context length {context_len}"
            )

        x = self.embed(x) + self.pos[:, :seq_len]
        x = self.dropout(x)
        x = x.transpose(0, 1)  # seq_len, batch, d_model

        # Causal mask: position i may attend only to positions <= i.
        mask = nn.Transformer.generate_square_subsequent_mask(x.size(0)).to(x.device)

        for layer in self.layers:
            x = layer(x, tgt_mask=mask)

        x = x.transpose(0, 1)  # batch, seq_len, d_model
        return self.fc(x)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Numeric encoding constants; they must match the values used when the
# training data was tokenized.
BASE = 16
SCALE = 100        # 2-decimal precision in log-space
NUM_DIGITS = 6     # fixed digits after the sign
TOKENS_PER_NUMBER = 1 + NUM_DIGITS
import torch
import torch.nn.functional as F


def _sample_row_tokens(model, tokenizer, device, token_constraints, max_len):
    """Sample token strings until <EOS>/<EOR> or ``max_len`` steps; None if sampling is impossible."""
    # Ids allowed while a number is being emitted: everything except categorical
    # VALUE tokens. This does not depend on the step, so build it once.
    categorical_value_ids = set()
    for ids in token_constraints.values():
        categorical_value_ids.update(ids)
    numeric_phase_ids = [
        tid for tid in range(len(tokenizer.token2id)) if tid not in categorical_value_ids
    ]

    input_ids = [tokenizer.token2id["<BOS>"]]
    decoded_tokens = []
    numeric_remaining = 0  # numeric tokens (sign + digits) still to be emitted

    for _ in range(max_len):
        x = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(x)[0, -1, :]  # [V]

        allowed_ids = None
        if decoded_tokens:
            last_token = decoded_tokens[-1]

            if last_token.startswith("<CAT_"):
                # Only allow categorical VALUE tokens for that column
                numeric_remaining = 0
                allowed_ids = token_constraints.get(last_token, [])
                if not allowed_ids:
                    return None
            else:
                if last_token.startswith("<NUM_"):
                    # the next TOKENS_PER_NUMBER tokens are sign + digits
                    numeric_remaining = TOKENS_PER_NUMBER
                if numeric_remaining > 0:
                    # The mask covers the sign as well as every digit, and stops
                    # right after the last digit.
                    allowed_ids = numeric_phase_ids
                    numeric_remaining -= 1

        if allowed_ids is not None:
            masked_logits = torch.full_like(logits, float('-inf'))
            masked_logits[allowed_ids] = logits[allowed_ids]
            logits = masked_logits

        probs = F.softmax(logits, dim=-1)
        if torch.isnan(probs).any() or torch.isinf(probs).any():
            return None

        next_token = torch.multinomial(probs, num_samples=1).item()
        token_str = tokenizer.id2token.get(next_token, "")

        if token_str in ["<EOS>", "<EOR>"]:
            break

        input_ids.append(next_token)
        decoded_tokens.append(token_str)

    return decoded_tokens


def _decode_number_tokens(sign_tok, digit_toks):
    """Invert the log1p/base-BASE encoding; NaN for missing or malformed token groups."""
    if sign_tok not in ('P', 'N') or any(dt == 'NAN' for dt in digit_toks):
        return float("nan")
    acc = 0
    try:
        for d in digit_toks:
            acc = acc * BASE + int(d)
    except ValueError:
        # a non-digit token was sampled inside the number
        return float("nan")
    # inverse of: scaled = round(log1p(abs(x)) * SCALE)
    sign = 1 if sign_tok == 'P' else -1
    return sign * torch.expm1(torch.tensor(acc / float(SCALE))).item()


def _decode_row_tokens(decoded_tokens):
    """Turn sampled tokens into a {column: value} dict; None if the row was cut off mid-field."""
    row = {}
    total = len(decoded_tokens)
    i = 0
    while i < total:
        token = decoded_tokens[i]
        if token == "<EOR>":
            break

        if token.startswith("<CAT_"):
            current_col = token[5:-1]
            i += 1
            if i >= total:
                # generation stopped right after the tag: no value token to read
                return None
            value_token = decoded_tokens[i]
            # reverse-lookup categorical token -> original value
            row[current_col] = next(
                (k for k, v in token_maps[current_col].items() if v == value_token), None
            )

        elif token.startswith("<NUM_"):
            current_col = token[5:-1]
            i += 1
            # need: sign + NUM_DIGITS digits
            if i + NUM_DIGITS >= total:
                return None
            row[current_col] = _decode_number_tokens(
                decoded_tokens[i], decoded_tokens[i + 1 : i + 1 + NUM_DIGITS]
            )
            i += NUM_DIGITS  # consumed digits after the sign
        i += 1

    return row


def generate_synthetic_row(model, tokenizer, device, token_constraints, max_len=256):
    """Autoregressively sample one row from ``model`` and decode it to a dict.

    Sampling starts from ``<BOS>`` and draws one token per step from the
    softmax of the model's last-position logits, for at most ``max_len`` steps
    or until ``<EOS>`` / ``<EOR>`` is drawn. Two masks are applied:

    - right after a ``<CAT_col>`` tag only the ids in
      ``token_constraints["<CAT_col>"]`` can be drawn;
    - for the ``TOKENS_PER_NUMBER`` tokens after a ``<NUM_col>`` tag, ids that
      are categorical values of any column cannot be drawn.

    Categorical values are mapped back through the module-level ``token_maps``.

    Args:
        model: callable returning logits indexable as ``[0, -1, :]`` for a
            ``(1, seq_len)`` id tensor; must provide ``eval()``.
        tokenizer: object exposing ``token2id`` and ``id2token`` dicts.
        device: device the id tensor is moved to before calling the model.
        token_constraints: ``{"<CAT_col>": [allowed ids]}``.
        max_len: maximum number of sampling steps.

    Returns:
        ``{column: value}`` for the fields decoded (possibly empty), or ``None``
        when a categorical tag has no allowed ids, the distribution is not
        finite, or the sequence ends in the middle of a field.
    """
    model.eval()
    decoded_tokens = _sample_row_tokens(model, tokenizer, device, token_constraints, max_len)
    if decoded_tokens is None:
        return None
    return _decode_row_tokens(decoded_tokens)


In [10]:
def decode_generated_tokens(tokens):
    """
    Decode a flat list of tokens back into a row dict,
    using Option B (log1p + fixed NUM_DIGITS digits).

    - ``<CAT_col>`` is followed by one value token, mapped back to the original
      value through ``token_maps[col]`` (None when no value maps to the token).
    - ``<NUM_col>`` is followed by a sign token and NUM_DIGITS digit tokens; the
      value is sign * expm1(digits_in_base_BASE / SCALE). A 'NAN' digit or a
      non-numeric digit token yields NaN.
    - If the list ends in the middle of a field, that field is set to None
      (categorical) or NaN (numeric) and decoding stops.

    Returns the row dict only when it has exactly len(column_order) entries,
    otherwise None.
    """
    row = {}
    i = 0
    while i < len(tokens):
        token = tokens[i]

        if token.startswith("<CAT_"):
            col = token[5:-1] if token.endswith(">") else token[5:]
            i += 1
            if i >= len(tokens):
                # tag was the last token: there is no value token to read
                row[col] = None
                break
            row[col] = next((k for k, v in token_maps[col].items()
                             if v == tokens[i]), None)

        elif token.startswith("<NUM_"):
            col = token[5:-1] if token.endswith(">") else token[5:]
            i += 1
            # Expect: sign + NUM_DIGITS digits, i.e. indices i .. i + NUM_DIGITS
            # must all exist.
            if i + NUM_DIGITS >= len(tokens):
                row[col] = float("nan")
                break

            sign_tok = tokens[i]
            digit_toks = tokens[i+1 : i+1+NUM_DIGITS]

            if any(t == 'NAN' for t in digit_toks):
                row[col] = float("nan")
            else:
                try:
                    acc = 0
                    for d in digit_toks:
                        acc = acc * BASE + int(d)
                    sign = 1 if sign_tok == 'P' else -1
                    # inverse transform
                    val = sign * np.expm1(acc / float(SCALE))
                    row[col] = float(val)
                except (ValueError, TypeError):
                    # a digit slot held something int() cannot parse
                    row[col] = float("nan")

            i += NUM_DIGITS
        else:
            i += 1

    return row if len(row) == len(column_order) else None


In [None]:


# Training entry point: tokenizes `df`, trains GPTMini for 30 epochs and saves
# the weights and the fitted tokenizer. Runs only when __name__ == "__main__".
if __name__ == "__main__":
    # Tokenize every row and build the vocabulary from the resulting sequences.
    tokenized = df.apply(tokenize_row, axis=1).tolist()
    tokenizer = SimpleTokenizer()
    tokenizer.fit(tokenized)

    dataset = TabularDataset(tokenized, tokenizer)
    # print(dataset[0])  # Check the first item
    # 80/20 random train/validation split.
    train_len = int(0.8 * len(dataset))
    train_data, val_data = random_split(dataset, [train_len, len(dataset) - train_len])
    # Collate: pad the inputs and the targets of a batch separately
    # (pad_sequence's default fill value is 0, the <PAD> id).
    pad = lambda x: tuple(nn.utils.rnn.pad_sequence(t, batch_first=True) for t in zip(*x))
    train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=pad)
    val_loader = DataLoader(val_data, batch_size=8, collate_fn=pad)

    model = GPTMini(len(tokenizer.token2id)).to("cuda" if torch.cuda.is_available() else "cpu")
    # print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # ignore_index=0 keeps padded target positions out of the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for epoch in range(30):
        # --- training pass ---
        model.train()
        total = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            x, y = x.to(device), y.to(device)
            out = model(x)
            # Flatten (batch, seq, vocab) logits and (batch, seq) targets for the loss.
            loss = loss_fn(out.view(-1, out.size(-1)), y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += loss.item()
        # Reported value is the mean of the per-batch losses.
        print(f"Epoch {epoch+1} Train Loss: {total / len(train_loader):.4f}")

        # --- validation pass (no gradient tracking) ---
        model.eval()
        vtotal = 0
        with torch.no_grad():
            for x, y in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                x, y = x.to(device), y.to(device)
                out = model(x)
                loss = loss_fn(out.view(-1, out.size(-1)), y.view(-1))
                vtotal += loss.item()
        print(f"Epoch {epoch+1} Val Loss: {vtotal / len(val_loader):.4f}")

    # Persist the final-epoch weights and the tokenizer to the working directory.
    # NOTE(review): the tokenizer pickle records SimpleTokenizer by the name of
    # the module it was defined in; loaders running elsewhere must remap it.
    torch.save(model.state_dict(), "home_credit_transformer_model.pt")
    with open("home_credit_tokenizer.pkl", "wb") as f:
        pickle.dump(tokenizer, f)

# Names exposed by `from <module> import *`.
__all__ = ["GPTMini", "SimpleTokenizer", "token_maps", "column_order", "generate_synthetic_row"]


Epoch 1 [Train]: 100%|██████████| 4000/4000 [00:44<00:00, 90.37it/s] 


Epoch 1 Train Loss: 0.6454


Epoch 1 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 275.20it/s]


Epoch 1 Val Loss: 0.3673


Epoch 2 [Train]: 100%|██████████| 4000/4000 [00:46<00:00, 86.05it/s]


Epoch 2 Train Loss: 0.3734


Epoch 2 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 246.48it/s]


Epoch 2 Val Loss: 0.3484


Epoch 3 [Train]: 100%|██████████| 4000/4000 [00:46<00:00, 86.20it/s]


Epoch 3 Train Loss: 0.3558


Epoch 3 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 263.34it/s]


Epoch 3 Val Loss: 0.3399


Epoch 4 [Train]: 100%|██████████| 4000/4000 [00:47<00:00, 85.09it/s]


Epoch 4 Train Loss: 0.3468


Epoch 4 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 268.21it/s]


Epoch 4 Val Loss: 0.3343


Epoch 5 [Train]: 100%|██████████| 4000/4000 [00:47<00:00, 84.64it/s]


Epoch 5 Train Loss: 0.3415


Epoch 5 [Val]: 100%|██████████| 1000/1000 [00:05<00:00, 194.48it/s]


Epoch 5 Val Loss: 0.3306


Epoch 6 [Train]: 100%|██████████| 4000/4000 [00:54<00:00, 73.25it/s]


Epoch 6 Train Loss: 0.3377


Epoch 6 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 223.58it/s]


Epoch 6 Val Loss: 0.3284


Epoch 7 [Train]: 100%|██████████| 4000/4000 [00:53<00:00, 75.20it/s]


Epoch 7 Train Loss: 0.3350


Epoch 7 [Val]: 100%|██████████| 1000/1000 [00:05<00:00, 196.67it/s]


Epoch 7 Val Loss: 0.3261


Epoch 8 [Train]: 100%|██████████| 4000/4000 [00:56<00:00, 71.11it/s]


Epoch 8 Train Loss: 0.3329


Epoch 8 [Val]: 100%|██████████| 1000/1000 [00:05<00:00, 195.56it/s]


Epoch 8 Val Loss: 0.3244


Epoch 9 [Train]: 100%|██████████| 4000/4000 [00:53<00:00, 74.93it/s]


Epoch 9 Train Loss: 0.3313


Epoch 9 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 237.61it/s]


Epoch 9 Val Loss: 0.3238


Epoch 10 [Train]: 100%|██████████| 4000/4000 [00:53<00:00, 75.09it/s]


Epoch 10 Train Loss: 0.3301


Epoch 10 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 220.16it/s]


Epoch 10 Val Loss: 0.3228


Epoch 11 [Train]: 100%|██████████| 4000/4000 [00:51<00:00, 77.25it/s]


Epoch 11 Train Loss: 0.3288


Epoch 11 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 222.01it/s]


Epoch 11 Val Loss: 0.3221


Epoch 12 [Train]: 100%|██████████| 4000/4000 [00:51<00:00, 77.47it/s]


Epoch 12 Train Loss: 0.3279


Epoch 12 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 255.79it/s]


Epoch 12 Val Loss: 0.3214


Epoch 13 [Train]: 100%|██████████| 4000/4000 [00:50<00:00, 79.33it/s]


Epoch 13 Train Loss: 0.3270


Epoch 13 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 253.36it/s]


Epoch 13 Val Loss: 0.3208


Epoch 14 [Train]: 100%|██████████| 4000/4000 [00:51<00:00, 78.26it/s]


Epoch 14 Train Loss: 0.3262


Epoch 14 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 208.49it/s]


Epoch 14 Val Loss: 0.3204


Epoch 15 [Train]: 100%|██████████| 4000/4000 [00:49<00:00, 81.41it/s]


Epoch 15 Train Loss: 0.3255


Epoch 15 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 216.53it/s]


Epoch 15 Val Loss: 0.3197


Epoch 16 [Train]: 100%|██████████| 4000/4000 [00:50<00:00, 79.03it/s]


Epoch 16 Train Loss: 0.3249


Epoch 16 [Val]: 100%|██████████| 1000/1000 [00:04<00:00, 212.70it/s]


Epoch 16 Val Loss: 0.3192


Epoch 17 [Train]: 100%|██████████| 4000/4000 [43:46<00:00,  1.52it/s]  


Epoch 17 Train Loss: 0.3243


Epoch 17 [Val]: 100%|██████████| 1000/1000 [00:02<00:00, 334.30it/s]


Epoch 17 Val Loss: 0.3187


Epoch 18 [Train]: 100%|██████████| 4000/4000 [00:41<00:00, 97.23it/s] 


Epoch 18 Train Loss: 0.3238


Epoch 18 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 300.30it/s]


Epoch 18 Val Loss: 0.3182


Epoch 19 [Train]: 100%|██████████| 4000/4000 [00:41<00:00, 96.15it/s] 


Epoch 19 Train Loss: 0.3234


Epoch 19 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 279.11it/s]


Epoch 19 Val Loss: 0.3181


Epoch 20 [Train]: 100%|██████████| 4000/4000 [00:46<00:00, 85.97it/s]


Epoch 20 Train Loss: 0.3229


Epoch 20 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 281.03it/s]


Epoch 20 Val Loss: 0.3175


Epoch 21 [Train]: 100%|██████████| 4000/4000 [00:42<00:00, 94.70it/s] 


Epoch 21 Train Loss: 0.3224


Epoch 21 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 312.35it/s]


Epoch 21 Val Loss: 0.3176


Epoch 22 [Train]: 100%|██████████| 4000/4000 [00:43<00:00, 92.73it/s] 


Epoch 22 Train Loss: 0.3220


Epoch 22 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 261.68it/s]


Epoch 22 Val Loss: 0.3170


Epoch 23 [Train]: 100%|██████████| 4000/4000 [00:43<00:00, 91.56it/s] 


Epoch 23 Train Loss: 0.3217


Epoch 23 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 276.50it/s]


Epoch 23 Val Loss: 0.3166


Epoch 24 [Train]: 100%|██████████| 4000/4000 [00:42<00:00, 94.20it/s] 


Epoch 24 Train Loss: 0.3213


Epoch 24 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 263.25it/s]


Epoch 24 Val Loss: 0.3167


Epoch 25 [Train]: 100%|██████████| 4000/4000 [00:45<00:00, 88.40it/s]


Epoch 25 Train Loss: 0.3210


Epoch 25 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 264.57it/s]


Epoch 25 Val Loss: 0.3163


Epoch 26 [Train]: 100%|██████████| 4000/4000 [00:41<00:00, 97.21it/s] 


Epoch 26 Train Loss: 0.3207


Epoch 26 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 304.60it/s]


Epoch 26 Val Loss: 0.3167


Epoch 27 [Train]: 100%|██████████| 4000/4000 [00:42<00:00, 94.15it/s] 


Epoch 27 Train Loss: 0.3204


Epoch 27 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 268.02it/s]


Epoch 27 Val Loss: 0.3161


Epoch 28 [Train]: 100%|██████████| 4000/4000 [00:41<00:00, 96.85it/s] 


Epoch 28 Train Loss: 0.3201


Epoch 28 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 297.28it/s]


Epoch 28 Val Loss: 0.3158


Epoch 29 [Train]: 100%|██████████| 4000/4000 [00:42<00:00, 94.66it/s] 


Epoch 29 Train Loss: 0.3199


Epoch 29 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 314.63it/s]


Epoch 29 Val Loss: 0.3157


Epoch 30 [Train]: 100%|██████████| 4000/4000 [00:44<00:00, 90.76it/s] 


Epoch 30 Train Loss: 0.3197


Epoch 30 [Val]: 100%|██████████| 1000/1000 [00:03<00:00, 289.02it/s]

Epoch 30 Val Loss: 0.3154





In [1]:
# === Exports for Inference ===
# Declares the names the inference script relies on. `__all__` only controls
# what `from <module> import *` exposes; it does not create or validate the
# names, so each of them must already be defined by an earlier cell.
# NOTE(review): an inference script that reads attributes directly off the
# executed notebook module is unaffected by this list — confirm it is still needed.

__all__ = [
    "GPTMini",
    "SimpleTokenizer",
    "generate_synthetic_row",
    "token_maps",
    "column_order",
    "categorical_columns",
    "numerical_columns",
]

print("✅ Exported: GPTMini, SimpleTokenizer, generate_synthetic_row, schema globals")


✅ Exported: GPTMini, SimpleTokenizer, generate_synthetic_row, schema globals
