In [None]:
#include <iostream>
#include <filesystem>
#include <fstream>
#include <random>
#include <chrono>
#include <string>
#include <vector>
#include <unordered_map>
#include <sstream>
#include <iomanip>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <regex>

#pragma cling add_include_path("/home/warawreh/MakeCPPFunAgain/libtorch/include")
#pragma cling add_include_path("/home/warawreh/MakeCPPFunAgain/libtorch/include/torch/csrc/api/include")
#pragma cling add_include_path("/home/warawreh/MakeCPPFunAgain/include")
#pragma cling add_library_path("/home/warawreh/MakeCPPFunAgain/libtorch/lib")
#pragma cling load("/home/warawreh/MakeCPPFunAgain/libtorch/lib/libc10.so")
#pragma cling load("/home/warawreh/MakeCPPFunAgain/libtorch/lib/libtorch_cpu.so")
#pragma cling load("/home/warawreh/MakeCPPFunAgain/libtorch/lib/libtorch.so")

#include "mcppfa/huggingface.hpp"
#include "mcppfa/sentencepiece_lite.hpp"
#include "mcppfa/hf_dataset.hpp"
#include "mcppfa/hf_trainer.hpp"
#include "mcppfa/model_summary.hpp"
#include "mcppfa/torch_bert.hpp"   // use the T5 implementation in torch_bert.hpp

In [None]:
using namespace std;
using namespace mcppfa;

// Disable buffering on both the C stdio streams and the C++ streams so that
// progress output shows up while a cell is still running instead of at the end.
setvbuf(stdout, nullptr, _IONBF, 0);
setvbuf(stderr, nullptr, _IONBF, 0);
std::cout.setf(std::ios::unitbuf);
std::cerr.setf(std::ios::unitbuf);

// ---------------------------------------------------------------------------
// Notebook-wide state. Everything below is read and/or written by later cells.
// ---------------------------------------------------------------------------

// Hugging Face access: token plus the repositories to pull from.
// g_model_repo is used ONLY as the source of tokenizer assets
// (spiece.model + tokenizer_config.json).
std::string g_hf_token;
std::string g_model_repo{"XXXXX"};
std::string g_dataset_repo{"XXXXXX"};
std::string g_revision{"main"};

// Local paths of the downloaded tokenizer assets and the tokenizer itself.
std::string g_model_dir;
std::string g_spiece_path;
std::string g_tokenizer_config_path;
spm_lite::SentencePieceLite g_sp;
bool g_sp_loaded{false};

// Ids and sizes derived from the tokenizer assets.
int64_t g_base_sp_vocab_size{0};
int64_t g_vocab_size{0};
int64_t g_pad_id{0};

// Training hyper-parameters (defaults here; meant to be edited in Cell 3).
int64_t g_max_len{32};
double  g_lr{1e-3};
int     g_epochs{10};
int64_t g_batch_size{32};

// Number of rows to use from each dataset split.
size_t g_n_train{8};
size_t g_n_valid{4};
size_t g_n_test{4};

// Model handle, the device it lives on, and a flag set once it is constructed.
torchlm::T5Model g_t5 = nullptr;
torch::Device g_device = torch::kCPU;
bool g_model_ready{false};

// In-memory dataset splits and a flag set once they have been loaded.
hf_dataset::Table g_train_tbl;
hf_dataset::Table g_valid_tbl;
hf_dataset::Table g_test_tbl;
bool g_dataset_loaded{false};

// Setup cell body: reads the HF token, downloads and loads the tokenizer assets,
// merges the added tokens listed in tokenizer_config.json, derives pad id and
// vocabulary size, constructs the T5 model, and loads a small preview of each
// dataset split. Any std::exception is reported on stderr; the g_*_loaded /
// g_*_ready flags are only set after the corresponding step has succeeded.
try {
    // Load HF token from secrets.txt (do NOT print it)
    try {
        g_hf_token = hf::read_token_file("secrets.txt");
    } catch (const std::exception& e) {
        cerr << "Warning: could not read secrets.txt (" << e.what() << "); private downloads may fail." << endl;
        g_hf_token.clear();
    }
    if (!g_hf_token.empty()) {
        // Export the token under both environment variable names.
        setenv("HF_TOKEN", g_hf_token.c_str(), 1);
        setenv("HUGGINGFACE_HUB_TOKEN", g_hf_token.c_str(), 1);
    }

    // --- Load tokenizer assets from repo ---
    // Local cache directory is ".hf/<last path component of the repo id>".
    // find_last_of returns npos when there is no '/', and npos + 1 wraps to 0,
    // so a repo id without an owner prefix is used as-is.
    g_model_dir = std::string(".hf/") + g_model_repo.substr(g_model_repo.find_last_of('/') + 1);
    std::filesystem::create_directories(g_model_dir);
    g_spiece_path = g_model_dir + "/spiece.model";
    g_tokenizer_config_path = g_model_dir + "/tokenizer_config.json";

    cout << "Preparing tokenizer from repo: " << g_model_repo << endl;
    {
        // Download at g_revision (previously hard-coded to "main", which silently
        // ignored the configurable global). A failed download is tolerated when a
        // previously downloaded copy is still on disk.
        auto r = hf::download_file_http(g_model_repo, "spiece.model", g_spiece_path, hf::RepoType::model, g_revision, g_hf_token);
        if (r.exit_code != 0 && !std::filesystem::exists(g_spiece_path)) {
            throw std::runtime_error("Failed to download spiece.model (and no local copy present)");
        }
        // tokenizer_config.json is needed to get the *effective* vocab size (added tokens / extra ids).
        auto r2 = hf::download_file_http(g_model_repo, "tokenizer_config.json", g_tokenizer_config_path, hf::RepoType::model, g_revision, g_hf_token);
        if (r2.exit_code != 0 && !std::filesystem::exists(g_tokenizer_config_path)) {
            cerr << "Warning: could not download tokenizer_config.json; vocab size may be incomplete." << endl;
        }
    }

    // Base SentencePiece vocab comes only from spiece.model (often small in domain-specific tokenizers).
    g_sp.load_from_file(g_spiece_path);
    g_sp_loaded = (g_sp.vocab_size() > 0);
    g_base_sp_vocab_size = static_cast<int64_t>(g_sp.vocab_size());
    cout << "Loaded spiece.model base_vocab_size=" << g_base_sp_vocab_size << endl;

    // Merge the tokens listed in tokenizer_config.json (added_tokens_decoder) into
    // the tokenizer. The base size printed above only counts the pieces stored in
    // spiece.model; tokens listed in the config are registered on top of those.
    int64_t max_added_id = -1;   // largest id seen among added tokens, -1 if none
    int64_t extra_ids = 0;       // value of the "extra_ids" field, 0 if absent
    std::string pad_token_str;   // value of the "pad_token" field, empty if absent
    if (std::filesystem::exists(g_tokenizer_config_path)) {
        std::ifstream in(g_tokenizer_config_path);
        if (!in) {
            // The file exists but cannot be read; without this warning the merge
            // below would silently do nothing.
            cerr << "Warning: could not open tokenizer_config.json; cannot merge added tokens." << endl;
        }
        const std::string cfg((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());

        // The config is scanned with targeted regular expressions rather than a JSON parser.
        const std::regex extra_ids_re("\\\"extra_ids\\\"\\s*:\\s*(\\d+)");
        const std::regex pad_token_re("\\\"pad_token\\\"\\s*:\\s*\\\"([^\\\"]+)\\\"");
        // Matches entries like: "123": { "content": "TOKEN", ... }
        const std::regex entry_re("\\\"(\\d+)\\\"\\s*:\\s*\\{[^\\}]*?\\\"content\\\"\\s*:\\s*\\\"([^\\\"]*)\\\"");

        {
            std::smatch m;
            if (std::regex_search(cfg, m, extra_ids_re)) {
                extra_ids = std::stoll(m[1].str());
            }
        }
        {
            std::smatch m;
            if (std::regex_search(cfg, m, pad_token_re)) {
                pad_token_str = m[1].str();
            }
        }

        for (auto it = std::sregex_iterator(cfg.begin(), cfg.end(), entry_re); it != std::sregex_iterator(); ++it) {
            const int64_t id = std::stoll((*it)[1].str());
            const std::string content = (*it)[2].str();
            g_sp.add_piece_with_id(id, content);
            max_added_id = std::max(max_added_id, id);
        }
    } else {
        cerr << "Warning: tokenizer_config.json not found; cannot merge added tokens." << endl;
    }

    // Determine pad_id: first by looking up the configured pad token string, then by
    // assuming it sits right after the extra ids, and finally by falling back to 0.
    int64_t pad_id = -1;
    if (!pad_token_str.empty()) {
        pad_id = g_sp.id_for_piece(pad_token_str);
    }
    if (pad_id < 0 && extra_ids > 0) {
        pad_id = g_base_sp_vocab_size + extra_ids;
    }
    if (pad_id < 0) {
        pad_id = 0;
        cerr << "Warning: could not determine pad_id; defaulting to 0 (this may be wrong for T5 tokenizers)." << endl;
    }
    g_pad_id = pad_id;

    // Effective vocab size handed to the model. It must cover every id that can be
    // fed to it: all tokenizer pieces, the largest merged added-token id, and the
    // pad id (which, in the extra_ids fallback above, may lie beyond what the
    // tokenizer itself reports).
    g_vocab_size = std::max<int64_t>({
        static_cast<int64_t>(g_sp.vocab_size()),
        g_base_sp_vocab_size,
        max_added_id + 1,
        g_pad_id + 1,
    });
    cout << "Tokenizer effective_vocab_size=" << g_vocab_size << " (base=" << g_base_sp_vocab_size << ")" << endl;
    cout << "Tokenizer pad_id=" << g_pad_id << endl;

    // --- Prepare model (initialized here) ---
    const int64_t vocab_size = g_vocab_size;
    if (vocab_size <= 0) throw std::runtime_error("Invalid vocab_size from tokenizer assets");

    g_device = torch::kCPU;
    if (torch::cuda::is_available()) {
        cout << "CUDA available. Using GPU." << endl;
        g_device = torch::Device(torch::kCUDA);
    }

    // Model hyper-parameters passed to the T5Model constructor below.
    const int64_t d_model = 512;
    const int64_t num_heads = 8;
    const int64_t enc_layers = 1;
    const int64_t dec_layers = 1;
    const int64_t d_ff = 1024;
    const int64_t model_max_len = 512;

    g_t5 = torchlm::T5Model(vocab_size, d_model, num_heads, enc_layers, dec_layers, d_ff, model_max_len, 0.1);
    g_t5->to(g_device);
    g_model_ready = true;

    cout << "\nModel ready. Total params: " << model_summary::count_total_params(*g_t5) << "\n";
    model_summary::print_model_summary(*g_t5, model_summary::SummaryOptions{.print_each_param=false, .max_groups=60});

    // --- Load dataset splits (train/validation/test) ---
    cout << "\nLoading dataset splits from: " << g_dataset_repo << endl;
    const auto splits = hf_dataset::list_splits(g_dataset_repo, g_hf_token);
    cout << "Splits: ";
    for (size_t i = 0; i < splits.size(); ++i) cout << splits[i] << (i + 1 == splits.size() ? "\n" : ", ");

    // Only a small preview of each split is loaded here; the training cell
    // fetches the row counts it actually needs.
    const size_t preview_train = 32;
    const size_t preview_valid = 16;
    const size_t preview_test  = 16;
    g_train_tbl = hf_dataset::load_rows_split(g_dataset_repo, "train", 0, preview_train, g_hf_token);
    g_valid_tbl = hf_dataset::load_rows_split(g_dataset_repo, "validation", 0, preview_valid, g_hf_token);
    g_test_tbl  = hf_dataset::load_rows_split(g_dataset_repo, "test", 0, preview_test, g_hf_token);
    g_dataset_loaded = true;

    cout << "\nTrain preview:" << endl;
    hf_dataset::print_columns(g_train_tbl);
    hf_dataset::print_head(g_train_tbl, 5);

} catch (const std::exception& e) {
    cerr << "Error in Cell 2: " << e.what() << endl;
}

In [None]:
using namespace std;
using namespace mcppfa;

// Hyper-parameters for the training cell. Change the values here and re-run
// this cell before training. g_pad_id is not set here: it is derived from
// tokenizer_config.json in Cell 2.
g_lr         = 2e-5;
g_epochs     = 100;
g_batch_size = 64;
g_max_len    = 512;

// Number of rows to fetch from each dataset split.
g_n_test  = 4;
g_n_valid = 4;
g_n_train = 8;

// Echo the effective configuration so the run is self-describing.
cout << "Training params" << '\n'
     << "  max_len=" << g_max_len << " pad_id=" << g_pad_id << '\n'
     << "  lr=" << g_lr << " epochs=" << g_epochs << " batch_size=" << g_batch_size << '\n'
     << "  n_train=" << g_n_train << " n_valid=" << g_n_valid << " n_test=" << g_n_test << '\n'
     << std::flush;

In [None]:
using namespace std;
using namespace mcppfa;

// Training cell: validates the notebook state, fetches the requested number of
// rows for each split (paged), fine-tunes g_t5 through the Trainer helper and
// saves the resulting weights to t5_dataset_finetuned.pt. Any std::exception is
// reported on stderr.
try {
    // Tokenizer and model are created in Cell 2; sizes are configured in Cell 3.
    if (!g_sp_loaded || !g_model_ready) {
        throw std::runtime_error("Tokenizer/model not ready. Run Cell 2 first.");
    }
    if (g_n_train == 0 || g_n_valid == 0 || g_n_test == 0) {
        throw std::runtime_error("Invalid dataset sizes. Set g_n_train/g_n_valid/g_n_test in Cell 3.");
    }
    if (g_max_len <= 0) throw std::runtime_error("g_max_len must be > 0");
    if (g_batch_size <= 0) throw std::runtime_error("g_batch_size must be > 0");

    // datasets-server often enforces a max page size; fetch in pages to be safe.
    // Returns up to `total` rows of `split`. The offset advances by the number of
    // rows actually received, so a page the server returns short does not cause
    // rows to be skipped, and an empty page (end of the split) stops the loop
    // instead of issuing further requests past the end.
    auto fetch_paged = [&](const std::string& split, size_t total) -> hf_dataset::Table {
        const size_t page = 64;
        hf_dataset::Table out;
        size_t offset = 0;
        while (offset < total) {
            const size_t want = std::min(page, total - offset);
            auto chunk = hf_dataset::load_rows_split(g_dataset_repo, split, offset, want, g_hf_token);
            if (out.columns.empty()) out.columns = chunk.columns;

            size_t taken = 0;
            for (auto& r : chunk.rows) {
                if (taken == want) break;  // never keep more than was asked for
                out.rows.push_back(std::move(r));
                ++taken;
            }
            if (taken == 0) break;         // split exhausted
            offset += taken;
        }
        if (offset < total) {
            std::fprintf(stderr, "Warning: split '%s' returned %zu of %zu requested rows.\n",
                         split.c_str(), offset, total);
        }
        return out;
    };

    // Avoid re-downloading if the requested sizes are already loaded in-memory.
    // The statics remember the sizes requested by the last fetch done by this cell.
    // NOTE(review): re-running Cell 2 replaces the tables with its preview rows
    // without resetting these statics — confirm whether that case needs a refetch.
    static size_t s_loaded_train = 0;
    static size_t s_loaded_valid = 0;
    static size_t s_loaded_test  = 0;
    const bool need_fetch = (!g_dataset_loaded) || (s_loaded_train != g_n_train) || (s_loaded_valid != g_n_valid) || (s_loaded_test != g_n_test);
    if (need_fetch) {
        std::printf("Fetching dataset rows for training (paged)...\n");
        g_train_tbl = fetch_paged("train", g_n_train);
        g_valid_tbl = fetch_paged("validation", g_n_valid);
        g_test_tbl  = fetch_paged("test", g_n_test);
        g_dataset_loaded = true;
        s_loaded_train = g_n_train;
        s_loaded_valid = g_n_valid;
        s_loaded_test  = g_n_test;
    } else {
        std::printf("Dataset already loaded in-memory; skipping fetch.\n");
    }

    // Copy the notebook-level settings into the trainer configuration.
    TrainingArguments args;
    args.max_len = g_max_len;
    args.pad_id = g_pad_id;
    args.batch_size = g_batch_size;
    args.epochs = g_epochs;
    args.lr = g_lr;
    args.input_col = "state";
    args.label_col = "very_short_result";
    args.one_token_target = true;
    args.device = g_device;

    Trainer<torchlm::T5Model> trainer(g_t5, g_sp, args);
    trainer.set_splits(g_train_tbl, g_valid_tbl, g_test_tbl);

    std::printf("Tokenizing splits once (this saves lots of time per epoch)...\n");
    trainer.tokenize_splits_once();
    trainer.train();

    // Persist the fine-tuned weights next to the notebook.
    const string out_model = "t5_dataset_finetuned.pt";
    torch::serialize::OutputArchive archive;
    g_t5->save(archive);
    archive.save_to(out_model);
    std::printf("Saved fine-tuned model to %s\n", out_model.c_str());

} catch (const std::exception& e) {
    std::fprintf(stderr, "Error in training cell: %s\n", e.what());
}

In [None]:
string text = "STATE PLAY CARD ROUND 2 BID HISTORY e2 b7 me b9 e1 b0 mp b0 e2 b0 SCORE US 9 THEM 9 LEADING THEM POINTS me 3 e1 2 mp 0 e2 0 US 3 THEM 2 PLAYERS CARDS PC me C r2 C r4 C r10 C r13 C r14 PC e1 D r4 C r6 C r7 C r8 C r9 C r12 PC mp H r14 S r6 S r9 D r3 D r7 D r10 PC e2 S r3 S r7 D r2 D r6 D r14 C r5 SUITS COUNT H 1 S 4 D 7 C 11 BID COUNT bc5 CURRENT BID me b9 BID SUIT C TRUMP COUNT 11 STATE PLAY CARD TABLE COUNT d3 TABLE CARDS U r0 D r4 D r10 D r14 TABLE SUIT D MY CARDS H r2 H r6 S r4 S r13 D r5 D r13 C r3 C r11";

// Inference cell: runs the in-memory model (g_t5) on `text` and prints the
// single most likely output token. Any std::exception is reported on stderr.
try {
    // Tokenizer and model are created in Cell 2.
    if (!g_sp_loaded || !g_model_ready) {
        throw std::runtime_error("Tokenizer/model not ready. Run Cell 2 first.");
    }
    const int64_t max_len = g_max_len;
    const int64_t pad_id = g_pad_id;
    // A non-positive length cannot be turned into a {1, max_len} input tensor.
    if (max_len <= 0) throw std::runtime_error("g_max_len must be > 0");

    // Encodes `input` into a [1, max_len] int64 tensor on g_device: longer
    // sequences are truncated, shorter ones are right-padded with pad_id.
    auto encode_fixed = [&](const std::string& input) -> torch::Tensor {
        std::vector<int64_t> ids = g_sp.encode(input);
        ids.resize(static_cast<size_t>(max_len), pad_id);
        // from_blob does not own the buffer, so clone before `ids` goes out of scope.
        auto t = torch::from_blob(ids.data(), {1, max_len}, torch::TensorOptions().dtype(torch::kInt64)).clone();
        return t.to(g_device);
    };

    g_t5->eval();
    torch::NoGradGuard ng;

    auto input_ids = encode_fixed(text);
    // Decoder length = 1, since we only want one output token
    auto decoder_input_ids = torch::full({1, 1}, pad_id, torch::TensorOptions().dtype(torch::kInt64).device(g_device));

    auto logits = g_t5->forward(input_ids, decoder_input_ids); // [1, 1, V]
    auto pred_ids = logits.argmax(-1); // [1, 1]

    // Decode ONLY the first predicted token
    auto pred_cpu = pred_ids.to(torch::kCPU).contiguous();
    const int64_t first_id = pred_cpu[0][0].item<int64_t>();
    const std::vector<int64_t> one_token{first_id};
    auto decoded = g_sp.decode(one_token);

    cout << "\nInput text:\n" << text << endl;
    cout << "\nPredicted output (1 token):\n" << decoded << endl;

} catch (const std::exception& e) {
    cerr << "Error in inference cell: " << e.what() << endl;
}

In [None]:
string text = "A40";

// Tokenizer round-trip cell: encodes `text`, prints the resulting token ids and
// then the text obtained by decoding those ids again. Any std::exception is
// reported on stderr.
try {
    // The tokenizer is loaded in Cell 2.
    if (!g_sp_loaded) {
        throw std::runtime_error("Tokenizer not ready. Run Cell 2 first.");
    }

    auto ids = g_sp.encode(text);
    cout << "Input text: " << text << endl;
    cout << "Token IDs: ";
    for (size_t i = 0; i < ids.size(); ++i) {
        if (i != 0) cout << ", ";
        cout << ids[i];
    }
    // Terminate the line unconditionally so an empty encoding does not glue the
    // next line onto "Token IDs: ".
    cout << "\n";

    auto decoded = g_sp.decode(ids);
    cout << "Decoded text: " << decoded << endl;

} catch (const std::exception& e) {
    cerr << "Error in tokenizer round-trip cell: " << e.what() << endl;
}

In [None]:
// Vocabulary cell: prints the first pieces of the tokenizer vocabulary (raw
// SentencePiece pieces + HF added tokens), at most 200 of them, followed by the
// tokenizer's vocab size, the vocab size the model was built with, and the pad id.
// Any std::exception is reported on stderr.
try {
    if (!g_sp_loaded) {
        throw std::runtime_error("Tokenizer not ready. Run Cell 2 first.");
    }

    // Cap the listing so large vocabularies do not flood the output.
    const int64_t limit = std::min<int64_t>(200, static_cast<int64_t>(g_sp.vocab_size()));
    cout << "\nTokenizer vocabulary (first " << limit << " pieces):" << endl;
    for (int64_t i = 0; i < limit; ++i) {
        const string piece = g_sp.piece_for_id(i);
        cout << "ID " << setw(4) << i << ": '" << piece << "'" << endl;
    }
    cout << "\nTokenizer effective vocab_size=" << g_sp.vocab_size() << " (model uses " << g_vocab_size << ")" << endl;
    cout << "pad_id=" << g_pad_id << endl;

} catch (const std::exception& e) {
    cerr << "Error in vocabulary cell: " << e.what() << endl;
}