# AG News Classification (Pure C++ / LibTorch / xcpp17)

This notebook trains a **BiLSTM** text classifier on the **AG News** dataset using **pure C++ (LibTorch)** inside the **xcpp17** Jupyter kernel (xeus-cling).

Kaggle link: https://www.kaggle.com/code/ishandutta/ag-news-classification-lstm

**Dataset**: `dataset/train.csv` (120,000 rows) and `dataset/test.csv` (7,600 rows).

**Columns**: `Class Index` (1..4), `Title`, `Description`. We combine `Title + " " + Description` into a single text field and map labels to **0..3**.

## What you get

- Training + evaluation in C++ (no Python ML stack)
- Tokenization + padding (word-level) using `mcppfa::text::WordVocab`
- Model checkpoint saving (`.pt`) + vocab export for reuse
- Simple demo predictions + confusion matrix

In [None]:
// --- LibTorch (repo-local, Linux/WSL build) ---
#pragma cling add_include_path("./libtorch/include")
#pragma cling add_include_path("./libtorch/include/torch/csrc/api/include")
#pragma cling add_library_path("./libtorch/lib")
#pragma cling load("./libtorch/lib/libc10.so")
#pragma cling load("./libtorch/lib/libtorch_cpu.so")
#pragma cling load("./libtorch/lib/libtorch.so")

// --- Repo headers ---
#pragma cling add_include_path("./include")

#include <torch/torch.h>

#include <bits/stdc++.h>

// Reusable helpers moved into the library
// (header-only, safe in xeus-cling)
#include <mcppfa/csv.hpp>
#include <mcppfa/word_vocab.hpp>
#include <mcppfa/torch_lstm.hpp>

using namespace std;
namespace fs = filesystem;

// Unbuffered output (statements are OK in this cell)
cout.setf(ios::unitbuf);
cerr.setf(ios::unitbuf);
setvbuf(stdout, nullptr, _IONBF, 0);
setvbuf(stderr, nullptr, _IONBF, 0);

// Repo root used throughout the notebook
static fs::path root = fs::current_path();
cout << "Repo root (cwd): " << root << "\n";
cout << "LibTorch version: " << TORCH_VERSION << "\n";
cout << "CUDA available: " << (torch::cuda::is_available() ? "yes" : "no") << "\n";


## 1) Load AG News CSVs

We load the repo-local CSVs from `dataset/`. Each row has:
- class index (1..4)
- title
- description

We combine `title + " " + description` and shift labels to `0..3`.

In [None]:
// One parsed dataset example: a zero-based class id plus its combined text.
// Kept as a plain aggregate so it can be brace-initialized as {label, text}.
struct ExampleRow {
    int64_t label = 0;   // class id in 0..3
    std::string text;    // title followed by the description
};

const fs::path& g_repo_root = root;
if (g_repo_root.empty()) {
    throw std::runtime_error("Repo root not found. Run Cell 2 then Cell 3, and ensure you started Jupyter from inside WSL repo.");
}

// Build paths without `operator/` (some cling builds choke on it).
fs::path g_train_csv = "/home/warawreh/MakeCPPFunAgain/notebooks/AG News Classification/archive/train.csv";

fs::path g_test_csv = "/home/warawreh/MakeCPPFunAgain/notebooks/AG News Classification/archive/test.csv";

// Optional: limit rows to iterate faster while you tune hyperparams.
// If your kernel crashes, keep these small first (e.g. 20000 / 2000), then increase.
int64_t g_limit_train = 120000;
int64_t g_limit_test  = 8000;

// Stream-parse an AG News CSV line by line (avoids allocating a giant
// rows[][] first).
//
// Parameters:
//   path         - CSV file to read; the first line is treated as a header.
//   limit        - stop after this many accepted rows; <= 0 means no limit.
//   reserve_hint - capacity to reserve up front when no limit is given.
// Returns the accepted rows with labels shifted from 1..4 to 0..3 and text set
// to `title + " " + description`.
// Throws std::runtime_error if the file cannot be opened.
//
// Malformed rows (fewer than 3 fields, non-numeric class, class outside 1..4)
// are skipped rather than aborting the load; a summary is written to stderr
// when any were skipped.
// NOTE(review): parsing is per physical line, so this assumes no quoted field
// contains an embedded newline -- confirm against the dataset.
auto load_ag_news_csv = [&](const fs::path& path, int64_t limit, size_t reserve_hint) -> std::vector<ExampleRow> {
    std::ifstream in(path.string());
    if (!in) throw std::runtime_error("Failed to open CSV: " + path.string());

    std::vector<ExampleRow> out;
    out.reserve(limit > 0 ? static_cast<size_t>(limit) : reserve_hint);

    std::string line;
    bool header_pending = true;
    int64_t skipped = 0;
    while (std::getline(in, line)) {
        // Tolerate CRLF line endings.
        if (!line.empty() && line.back() == '\r') line.pop_back();
        if (header_pending) {
            header_pending = false;
            continue; // header
        }
        if (line.empty()) continue;

        auto r = mcppfa::csv::split_csv_line(line);
        if (r.size() < 3) {
            ++skipped;
            continue;
        }

        // std::stoll throws std::invalid_argument / std::out_of_range (both
        // derive from std::logic_error) on a bad class field; treat that like
        // any other malformed row instead of losing the whole load.
        long long cls = 0;
        try {
            cls = std::stoll(r[0]);
        } catch (const std::logic_error&) {
            ++skipped;
            continue;
        }
        // Range-check before subtracting so an extreme value cannot overflow.
        if (cls < 1 || cls > 4) {
            ++skipped;
            continue;
        }
        const int64_t label = static_cast<int64_t>(cls) - 1; // 1..4 -> 0..3

        std::string text = r[1];
        text.push_back(' ');
        text += r[2];

        out.push_back(ExampleRow{label, std::move(text)});
        if (limit > 0 && static_cast<int64_t>(out.size()) >= limit) break;
    }

    if (skipped > 0) {
        std::cerr << "Warning: skipped " << skipped << " malformed row(s) in " << path.string() << "\n";
    }
    return out;
};

// Load both splits. The third argument is only a capacity hint used when the
// corresponding limit is disabled.
std::vector<ExampleRow> g_train_rows = load_ag_news_csv(g_train_csv, g_limit_train, 120000u);
std::vector<ExampleRow> g_test_rows  = load_ag_news_csv(g_test_csv,  g_limit_test,  7600u);

cout << "Train rows: " << g_train_rows.size() << "\n"
     << "Test rows : " << g_test_rows.size() << "\n";

// Per-class tallies as a quick sanity check on label balance. Labels were
// already restricted to 0..3 by the loader, so they index the arrays directly.
std::array<int64_t, 4> train_counts{};
std::array<int64_t, 4> test_counts{};
for (const ExampleRow& row : g_train_rows) ++train_counts[static_cast<size_t>(row.label)];
for (const ExampleRow& row : g_test_rows)  ++test_counts[static_cast<size_t>(row.label)];

cout << "Train label counts (0..3): ";
for (const int64_t count : train_counts) cout << count << " ";
cout << "\n";

cout << "Test  label counts (0..3): ";
for (const int64_t count : test_counts) cout << count << " ";
cout << "\n";

## 2) Tokenize + build vocabulary + pad sequences

We use the repo’s simple word tokenizer (`mcppfa::text::word_tokenize_lower_ascii`) and a frequency-based vocab (`mcppfa::text::WordVocab`).

Then we encode each example into a fixed-length `max_len` vector of token ids (padding with `<pad>`).

In [None]:
// Hyperparameters (roughly matching the Python baseline)
int64_t g_vocab_size = 10000;
int64_t g_embed_size = 32;
int64_t g_max_len    = 200;  // capped for practicality (Python example used dataset max, which can get huge)

// Build vocab from training texts only (string_view avoids duplicating all texts in memory)
std::vector<std::string_view> g_train_text_views;
g_train_text_views.reserve(g_train_rows.size());
for (const auto& r : g_train_rows) g_train_text_views.emplace_back(r.text);

mcppfa::text::WordVocab g_vocab(g_vocab_size);
g_vocab.build_from_texts(g_train_text_views);
cout << "Built vocab. actual_size=" << g_vocab.size() << " (requested=" << g_vocab_size << ")\n";

// Token-length statistics over the first (up to) 200 training texts only;
// a full scan can be slow/fragile in xeus-cling.
{
    const size_t sample_count = std::min<size_t>(200, g_train_text_views.size());
    int64_t longest = 0;
    int64_t total_tokens = 0;
    for (size_t i = 0; i < sample_count; ++i) {
        const auto tokens = mcppfa::text::word_tokenize_lower_ascii(g_train_text_views[i]);
        const auto token_count = static_cast<int64_t>(tokens.size());
        if (token_count > longest) longest = token_count;
        total_tokens += token_count;
    }

    // Guard the division so an empty sample reports an average of 0.
    double average = 0.0;
    if (sample_count != 0) {
        average = static_cast<double>(total_tokens) / static_cast<double>(sample_count);
    }

    cout << "Train token length (sampled " << sample_count << "): max=" << longest
         << ", avg=" << average
         << "\n";
}

// Build tensors inline (avoid defining helper functions; cling can get picky if scope is corrupted).
// The train and test sections below are deliberately duplicated for that reason.
//
// Result: g_x_train / g_x_test are int64 tensors of size [rows, g_max_len] holding
// token ids, and g_y_train / g_y_test are int64 tensors of size [rows] holding labels.
const int64_t n_train = static_cast<int64_t>(g_train_rows.size());
// torch::empty leaves the contents uninitialized; every element is written in the loop below.
auto g_x_train = torch::empty({n_train, g_max_len}, torch::TensorOptions().dtype(torch::kInt64));
auto g_y_train = torch::empty({n_train},          torch::TensorOptions().dtype(torch::kInt64));
{
    // Accessors give typed element access without per-element tensor ops.
    auto xa = g_x_train.accessor<int64_t, 2>();
    auto ya = g_y_train.accessor<int64_t, 1>();
    for (int64_t i = 0; i < n_train; ++i) {
        const auto& ex = g_train_rows[static_cast<size_t>(i)];
        ya[i] = ex.label;
        // &xa[i][0] is the start of row i.
        // NOTE(review): presumably encode_padded_to writes exactly g_max_len ids there
        // (truncating or padding as needed) -- confirm against mcppfa/word_vocab.hpp.
        g_vocab.encode_padded_to(ex.text, &xa[i][0], g_max_len);
    }
}

// Same encoding for the test split, using the vocab built from training texts.
const int64_t n_test = static_cast<int64_t>(g_test_rows.size());
auto g_x_test = torch::empty({n_test, g_max_len}, torch::TensorOptions().dtype(torch::kInt64));
auto g_y_test = torch::empty({n_test},          torch::TensorOptions().dtype(torch::kInt64));
{
    auto xa = g_x_test.accessor<int64_t, 2>();
    auto ya = g_y_test.accessor<int64_t, 1>();
    for (int64_t i = 0; i < n_test; ++i) {
        const auto& ex = g_test_rows[static_cast<size_t>(i)];
        ya[i] = ex.label;
        g_vocab.encode_padded_to(ex.text, &xa[i][0], g_max_len);
    }
}

// Shape report for a quick sanity check.
cout << "X_train: " << g_x_train.sizes() << ", y_train: " << g_y_train.sizes() << "\n";
cout << "X_test : " << g_x_test.sizes()  << ", y_test : " << g_y_test.sizes()  << "\n";

## 3) Define the BiLSTM classifier (LibTorch)

This matches the architecture used in your C++ CLI implementation (`src/ag_news_lstm.cpp`):
- Embedding
- BiLSTM(128) + BiLSTM(64)
- GlobalMaxPool over time
- MLP + Dropout
- 4-class logits

In [None]:
// AG News classifier defined in the notebook on top of the library's reusable
// mcppfa::LSTMBlock wrapper, so the model can be tweaked here without touching
// the library.
//
// Pipeline: embedding -> LSTM block 1 -> LSTM block 2 -> max over the time
// axis -> three ReLU+dropout linear layers -> linear layer producing logits.

struct AgNewsLSTMClassifierImpl : torch::nn::Module {
    // Sub-modules start as empty holders and are registered in the constructor.
    torch::nn::Embedding emb{nullptr};
    mcppfa::LSTMBlock lstm1{nullptr};
    mcppfa::LSTMBlock lstm2{nullptr};

    torch::nn::Linear fc1{nullptr};
    torch::nn::Linear fc2{nullptr};
    torch::nn::Linear fc3{nullptr};
    torch::nn::Dropout drop{nullptr};
    torch::nn::Linear out{nullptr};

    // Builds all layers. Throws std::runtime_error when any size is
    // non-positive, dropout is outside [0,1), or num_classes is <= 1.
    AgNewsLSTMClassifierImpl(
        int64_t vocab_size,
        int64_t embed_size,
        int64_t lstm1_hidden = 128,
        int64_t lstm2_hidden = 64,
        int64_t fc1_dim = 256,
        int64_t fc2_dim = 128,
        int64_t fc3_dim = 64,
        double dropout = 0.25,
        int64_t num_classes = 4,
        bool bidirectional = true) {

        // Fail fast on invalid hyperparameters; checks run in declaration order.
        const auto require = [](bool ok, const char* message) {
            if (!ok) throw std::runtime_error(message);
        };
        require(vocab_size > 0, "AgNewsLSTMClassifier: vocab_size must be > 0");
        require(embed_size > 0, "AgNewsLSTMClassifier: embed_size must be > 0");
        require(lstm1_hidden > 0 && lstm2_hidden > 0, "AgNewsLSTMClassifier: LSTM hidden sizes must be > 0");
        require(fc1_dim > 0 && fc2_dim > 0 && fc3_dim > 0, "AgNewsLSTMClassifier: FC dims must be > 0");
        // Written as a positive range test so a NaN dropout is rejected too.
        require(dropout >= 0.0 && dropout < 1.0, "AgNewsLSTMClassifier: dropout must be in [0,1)");
        require(num_classes > 1, "AgNewsLSTMClassifier: num_classes must be > 1");

        emb = register_module(
            "emb",
            torch::nn::Embedding(torch::nn::EmbeddingOptions(vocab_size, embed_size)));

        // First recurrent block consumes the embeddings.
        lstm1 = register_module("lstm1", mcppfa::LSTMBlock(mcppfa::LSTMConfig{
            .input_size = embed_size,
            .hidden_size = lstm1_hidden,
            .num_layers = 1,
            .batch_first = true,
            .bidirectional = bidirectional,
            .dropout = 0.0
        }));

        // Second recurrent block is sized from the first block's reported output width.
        lstm2 = register_module("lstm2", mcppfa::LSTMBlock(mcppfa::LSTMConfig{
            .input_size = lstm1->output_size(),
            .hidden_size = lstm2_hidden,
            .num_layers = 1,
            .batch_first = true,
            .bidirectional = bidirectional,
            .dropout = 0.0
        }));

        // Classifier head, sized from the second block's reported output width.
        const int64_t pooled_width = lstm2->output_size();
        fc1  = register_module("fc1",  torch::nn::Linear(pooled_width, fc1_dim));
        fc2  = register_module("fc2",  torch::nn::Linear(fc1_dim, fc2_dim));
        fc3  = register_module("fc3",  torch::nn::Linear(fc2_dim, fc3_dim));
        drop = register_module("drop", torch::nn::Dropout(dropout));
        out  = register_module("out",  torch::nn::Linear(fc3_dim, num_classes));
    }

    // input_ids: [B, T] int64 token ids. Returns logits of size [B, C].
    torch::Tensor forward(torch::Tensor input_ids) {
        torch::Tensor embedded = emb->forward(input_ids);   // [B, T, E]
        torch::Tensor seq1 = lstm1->forward(embedded);      // [B, T, H1*dir]
        torch::Tensor seq2 = lstm2->forward(seq1);          // [B, T, H2*dir]

        // Global max pooling over the time dimension; max() also returns the
        // argmax indices, which are not needed here.
        torch::Tensor hidden = std::get<0>(seq2.max(1));    // [B, H2*dir]

        // Three linear layers, each followed by ReLU and the shared dropout.
        hidden = drop->forward(torch::relu(fc1->forward(hidden)));
        hidden = drop->forward(torch::relu(fc2->forward(hidden)));
        hidden = drop->forward(torch::relu(fc3->forward(hidden)));

        return out->forward(hidden);                        // logits [B, C]
    }
};
TORCH_MODULE(AgNewsLSTMClassifier);

## 4) Train + evaluate (with checkpoint saving)

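We hold out 10% of the training set as a validation split, train on the remaining 90%, and evaluate on the validation split after each epoch. The provided test set stays untouched until the final evaluation in section 6.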

Artifacts written to the notebook working directory:
- `ag_news_lstm_xcpp17.pt` (best model weights)
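- `ag_news_vocab.txt` (one token per line; id==line index) — note: none of the cells shown in this notebook write this file, so export the vocab separately if you need it for reuse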

In [None]:
// Device
torch::Device device(torch::kCPU);
cout << "Using device: " << (device.is_cuda() ? "CUDA" : "CPU") << "\n";

// Model (notebook-local, but built from reusable mcppfa::LSTMBlock)
auto model = AgNewsLSTMClassifier(
    g_vocab.size(),
    g_embed_size,
    /*lstm1_hidden=*/128,
    /*lstm2_hidden=*/64,
    /*fc1_dim=*/256,
    /*fc2_dim=*/128,
    /*fc3_dim=*/64,
    /*dropout=*/0.25,
    /*num_classes=*/4,
    /*bidirectional=*/true);
model->to(device);

// Optimizer / loss
torch::optim::Adam optimizer(model->parameters(), torch::optim::AdamOptions(1e-3));
auto criterion = torch::nn::CrossEntropyLoss();

// Training settings
int64_t epochs = 2;
int64_t batch_size = 256;
int64_t log_every = 100;

// Split settings (train -> train/val)
const double val_fraction = 0.10;
const int64_t split_seed = 123;

// Checkpoint path + best tracker (persist for later cells)
fs::path best_model_path = fs::current_path();
best_model_path /= "ag_news_lstm_xcpp17.pt";
double best_val_acc = -1.0;

In [None]:
// Prepare tensors
auto Xall = g_x_train.to(device);
auto Yall = g_y_train.to(device);
auto Xtest = g_x_test.to(device);
auto Ytest = g_y_test.to(device);

const int64_t n_all = Xall.size(0);
const int64_t n_val = std::max<int64_t>(1, static_cast<int64_t>(std::llround((double)n_all * val_fraction)));
const int64_t n_tr  = n_all - n_val;
cout << "Train/Val split: train=" << n_tr << ", val=" << n_val << " (from " << n_all << ")\n";

// Deterministic split
torch::manual_seed(split_seed);
auto perm_split = torch::randperm(n_all, torch::TensorOptions().dtype(torch::kInt64).device(device));
auto idx_tr  = perm_split.slice(0, 0, n_tr);
auto idx_val = perm_split.slice(0, n_tr, n_all);

auto Xtr = Xall.index_select(0, idx_tr);
auto Ytr = Yall.index_select(0, idx_tr);
auto Xval = Xall.index_select(0, idx_val);
auto Yval = Yall.index_select(0, idx_val);

cout << "Test samples (held out): " << Xtest.size(0) << "\n";

// Training loop. Each epoch: shuffle the training split, run one optimizer
// step per mini-batch, then score the validation split and save a checkpoint
// whenever validation accuracy improves.
//
// Metrics are sample-weighted (sum over samples / number of samples) so a
// short final batch does not count as much as a full one. The loss sums assume
// `criterion` returns the batch mean, which is the default reduction.
for (int64_t epoch = 1; epoch <= epochs; ++epoch) {
    const auto epoch_t0 = std::chrono::steady_clock::now();

    model->train();
    double train_loss_sum = 0.0;   // sum of per-sample losses
    int64_t train_correct = 0;     // correctly classified training samples
    int64_t train_seen = 0;        // training samples processed so far
    int64_t steps = 0;

    // Shuffle positions within the already-materialized training split; this
    // selects the same rows as indexing the full set through idx_tr, with one
    // gather per batch instead of two.
    const int64_t n_train_local = Xtr.size(0);
    auto perm = torch::randperm(n_train_local, torch::TensorOptions().dtype(torch::kInt64).device(device));
    for (int64_t start = 0; start < n_train_local; start += batch_size) {
        const int64_t end = std::min<int64_t>(start + batch_size, n_train_local);
        auto local = perm.slice(0, start, end);
        auto xb = Xtr.index_select(0, local);
        auto yb = Ytr.index_select(0, local);

        optimizer.zero_grad();
        auto logits = model->forward(xb);
        auto loss = criterion(logits, yb);
        loss.backward();
        optimizer.step();

        const int64_t batch_n = yb.size(0);
        train_loss_sum += loss.item<double>() * static_cast<double>(batch_n);
        train_correct  += logits.detach().argmax(1).eq(yb).sum().item<int64_t>();
        train_seen     += batch_n;

        ++steps;
        if (steps % log_every == 0) {
            cout << "epoch " << epoch << " step " << steps
                 << " loss=" << (train_loss_sum / static_cast<double>(train_seen))
                 << " acc=" << (static_cast<double>(train_correct) / static_cast<double>(train_seen))
                 << "\n";
        }
    }

    // Validation
    model->eval();
    double val_loss_sum = 0.0;
    int64_t val_correct = 0;
    int64_t val_seen = 0;
    {
        // Autograd is disabled only for this scope, so the next epoch trains normally.
        torch::NoGradGuard no_grad;

        const int64_t n_val_local = Xval.size(0);
        for (int64_t start = 0; start < n_val_local; start += batch_size) {
            const int64_t end = std::min<int64_t>(start + batch_size, n_val_local);
            auto xb = Xval.slice(0, start, end);
            auto yb = Yval.slice(0, start, end);

            auto logits = model->forward(xb);
            auto loss = criterion(logits, yb);

            const int64_t batch_n = yb.size(0);
            val_loss_sum += loss.item<double>() * static_cast<double>(batch_n);
            val_correct  += logits.argmax(1).eq(yb).sum().item<int64_t>();
            val_seen     += batch_n;
        }
    }

    // Epoch-level averages; report 0 when a split is empty instead of dividing by zero.
    const double train_loss_epoch = (train_seen == 0) ? 0.0 : (train_loss_sum / static_cast<double>(train_seen));
    const double train_acc_epoch  = (train_seen == 0) ? 0.0 : (static_cast<double>(train_correct) / static_cast<double>(train_seen));
    const double val_loss_epoch   = (val_seen == 0) ? 0.0 : (val_loss_sum / static_cast<double>(val_seen));
    const double val_acc_epoch    = (val_seen == 0) ? 0.0 : (static_cast<double>(val_correct) / static_cast<double>(val_seen));

    const auto epoch_t1 = std::chrono::steady_clock::now();
    const double epoch_sec = std::chrono::duration_cast<std::chrono::duration<double>>(epoch_t1 - epoch_t0).count();

    cout << "Epoch " << epoch
         << " | train_loss=" << train_loss_epoch
         << " train_acc=" << train_acc_epoch
         << " | val_loss=" << val_loss_epoch
         << " val_acc=" << val_acc_epoch
         << " | time_sec=" << epoch_sec
         << "\n";

    // Keep only the best model by validation accuracy (strict improvement).
    if (val_seen > 0 && val_acc_epoch > best_val_acc) {
        best_val_acc = val_acc_epoch;
        torch::save(model, best_model_path.string());
        cout << "Saved best model to: " << best_model_path
             << " (best_val_acc=" << best_val_acc << ")\n";
    }
}

cout << "Training done. Best validation accuracy: " << best_val_acc
     << " (checkpoint: " << best_model_path << ")\n";

## 5) Load best checkpoint + run a small demo

This mirrors the Python notebook’s demo but runs entirely in C++: we encode the input text with the same vocab, run the model, and print the predicted label.

In [None]:
// Cell 13: Load the best checkpoint from Cell 11, then run demo predictions.
// Requires running Cell 11 first (it defines `best_model_path`).

if (fs::exists(best_model_path)) {
    torch::load(model, best_model_path.string());
    model->to(device);
    cout << "Loaded best model from: " << best_model_path << "\n";
} else {
    cout << "Warning: best model file not found at: " << best_model_path
         << " (using current in-memory weights)\n";
}

// Inference mode for the module (e.g. dropout off).
model->eval();

// Human-readable names for class ids 0..3, in label order.
static const std::vector<std::string> k_labels = {
    "World News",
    "Sports News",
    "Business News",
    "Science-Technology News",
};

// Encodes one text with the training vocab, runs the model, and prints the
// predicted label name.
//
// The NoGradGuard lives inside the lambda on purpose: a guard declared at cell
// top level becomes a persistent variable in cling, which would keep autograd
// disabled for the rest of the session and break re-running the training cell.
auto predict_one = [&](const std::string& text) {
    torch::NoGradGuard no_grad;

    auto ids = g_vocab.encode_padded(text, g_max_len);
    // from_blob does not own `ids`; clone() copies the data before `ids` goes away.
    auto x = torch::from_blob(ids.data(), {1, g_max_len}, torch::TensorOptions().dtype(torch::kInt64))
                 .clone()
                 .to(device);
    auto logits = model->forward(x);
    auto pred = logits.argmax(1).to(torch::kCPU).item<int64_t>();
    // .at() bounds-checks the class id against the label table.
    cout << "\"" << text << "\" => " << k_labels.at(static_cast<size_t>(pred)) << "\n";
};

predict_one("New evidence of virus risks from wildlife trade");
predict_one("Coronavirus: Bank pumps £100bn into UK economy to aid recovery");
predict_one("Trump's bid to end Obama-era immigration policy ruled unlawful");
predict_one("David Luiz’s future with Arsenal to be decided this week");
predict_one("Indian Economic budget supports the underprivileged sections of society");

## 6) Full evaluation: confusion matrix + micro metrics

This reproduces the Python notebook’s evaluation section (confusion matrix + micro precision/recall/accuracy).

In [None]:
// Cell 15: Confusion matrix + accuracy on the test set (true held-out test.csv).
// Loads the best checkpoint (saved from validation in Cell 11) before evaluation.

if (fs::exists(best_model_path)) {
    torch::load(model, best_model_path.string());
    model->to(device);
    cout << "Loaded best model from: " << best_model_path << "\n";
} else {
    cout << "Warning: best model file not found at: " << best_model_path
         << " (using current in-memory weights)\n";
}

model->eval();
torch::NoGradGuard ng3;

// Confusion matrix: rows = true label, cols = predicted label
std::array<std::array<int64_t, 4>, 4> cm{};
for (auto& row : cm) row.fill(0);

const int64_t batch_size_eval = 256;
const int64_t num_test = g_x_test.size(0);
int64_t correct = 0;
int64_t total = 0;

for (int64_t start = 0; start < num_test; start += batch_size_eval) {
    const int64_t end = std::min<int64_t>(start + batch_size_eval, num_test);
    auto xb = g_x_test.slice(0, start, end).to(device);
    auto yb = g_y_test.slice(0, start, end).to(device);

    auto logits = model->forward(xb);
    auto preds = logits.argmax(1);

    auto preds_cpu = preds.to(torch::kCPU);
    auto y_cpu = yb.to(torch::kCPU);

    auto pa = preds_cpu.accessor<int64_t, 1>();
    auto ya = y_cpu.accessor<int64_t, 1>();

    for (int64_t i = 0; i < preds_cpu.size(0); ++i) {
        const int64_t yt = ya[i];
        const int64_t yp = pa[i];
        if (0 <= yt && yt < 4 && 0 <= yp && yp < 4) {
            cm[static_cast<size_t>(yt)][static_cast<size_t>(yp)]++;
        }
        if (yt == yp) correct++;
        total++;
    }
}

cout << "Confusion matrix (rows=true, cols=pred):\n";
cout << "      0      1      2      3\n";
for (int r = 0; r < 4; ++r) {
    cout << r << " ";
    for (int c = 0; c < 4; ++c) {
        cout << std::setw(6) << cm[static_cast<size_t>(r)][static_cast<size_t>(c)] << " ";
    }
    cout << "\n";
}

const double acc = total == 0 ? 0.0 : (double)correct / (double)total;
cout << "Test accuracy: " << acc << "\n";

// Micro precision/recall equal accuracy for single-label multi-class, but we print both for clarity.
cout << "Micro precision: " << acc << "\n";
cout << "Micro recall   : " << acc << "\n";