In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# /content/drive/MyDrive and copy the exported models back under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import ast
import numpy as np
import torch
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Load & preprocess
# String cells in "sentiment" are parsed as Python literals; anything that is
# not a string is passed through untouched. The parsed value is then indexed
# with "class" to obtain the label string.
news_df = pd.read_csv("/content/drive/MyDrive/cryptonews.csv")
news_df["sentiment_dict"] = news_df["sentiment"].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)
news_df["label_str"] = news_df["sentiment_dict"].apply(lambda entry: entry["class"])

# 2) HF Dataset & split
# 90/10 split with a fixed seed; the "test" part is used as validation.
hf_ds = Dataset.from_pandas(news_df[["text", "label_str"]], preserve_index=False)
hf_ds = hf_ds.train_test_split(test_size=0.1, seed=42)
hf_ds = DatasetDict({"train": hf_ds["train"], "validation": hf_ds["test"]})

# 3) Cast & rename label
# The order of `names` fixes the integer ids: negative=0, neutral=1, positive=2.
hf_ds = hf_ds.cast_column(
    "label_str", ClassLabel(names=["negative", "neutral", "positive"])
)
hf_ds = hf_ds.rename_column("label_str", "labels")

# 4) Tokenization fn
def preprocess(batch, tokenizer, max_length=2048):
    """Tokenize the "text" field of a batch, truncating to `max_length`.

    Args:
        batch: Mapping with a "text" entry holding the raw texts.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Upper bound on tokens per example (default 2048).

    Returns:
        Whatever the tokenizer returns; no padding is requested here.
    """
    texts = batch["text"]
    encode_options = {"truncation": True, "max_length": max_length}
    return tokenizer(texts, **encode_options)

# 5) Metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an eval pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, or a tuple whose first item
            holds them).

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1_macro".
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return (logits, extras...); only the logits are scored.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=1)
    acc = accuracy_score(labels, preds)
    # zero_division=0 yields the same 0.0 score the default produces for a
    # class that is never predicted, but without emitting a warning on every
    # evaluation while the model still predicts a single class.
    p, r, f, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {"accuracy": acc, "precision": p, "recall": r, "f1_macro": f}

# 6) Custom collator that only ever returns the three keys we want
def data_collator(features):
    """Pad a list of tokenized examples into one batch of tensors.

    Only "input_ids", "attention_mask" and "labels" are read from each
    example; any other key is ignored. Padding is delegated to the
    module-level `tokenizer`, so that name must be bound before this runs.

    Args:
        features: List of dicts with "input_ids", "attention_mask", "labels".

    Returns:
        The padded batch from `tokenizer.pad` with a long "labels" tensor added.
    """
    id_rows, mask_rows, label_ids = [], [], []
    for example in features:
        id_rows.append(example["input_ids"])
        mask_rows.append(example["attention_mask"])
        label_ids.append(example["labels"])

    unpadded = {"input_ids": id_rows, "attention_mask": mask_rows}
    batch = tokenizer.pad(unpadded, padding=True, return_tensors="pt")
    batch["labels"] = torch.tensor(label_ids, dtype=torch.long)
    return batch

# 7) Loop through backbones
# Class names in ClassLabel id order. They are stored in each model's config
# so exported checkpoints map ids to readable labels instead of LABEL_0/1/2.
label_names = hf_ds["train"].features["labels"].names

for model_name, backbone in [
    ("longformer-2048", "allenai/longformer-base-4096"),
    ("bigbird-2048",    "google/bigbird-roberta-base"),
]:
    print(f"\n=== Training {model_name} ===")
    # NOTE: `tokenizer` is rebound on every iteration and is read as a global
    # by data_collator, so the collator always pads with the current backbone's
    # tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    model     = AutoModelForSequenceClassification.from_pretrained(
        backbone,
        num_labels=len(label_names),
        id2label=dict(enumerate(label_names)),
        label2id={name: idx for idx, name in enumerate(label_names)},
    )

    # memory saving: gradient checkpointing recomputes activations during the
    # backward pass, trading extra compute for a smaller memory footprint
    model.gradient_checkpointing_enable()
    # -- don't torch.compile here; it can break forward() signature introspection --

    # a) tokenize & drop raw text
    tokenized = hf_ds.map(
        lambda b: preprocess(b, tokenizer),
        batched=True,
        remove_columns=["text"],
    )
    tokenized.set_format(type="python")  # so our collator sees lists

    # b) training args — keep remove_unused_columns=False so HF won't strip our labels
    args = TrainingArguments(
        output_dir=f"./{model_name}-finetuned",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,

        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        report_to=[],

        # Check is_available() first: querying bf16 support without a CUDA
        # device can raise on some torch versions.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        dataloader_num_workers=4,
        remove_unused_columns=False,  # don't auto-prune input_ids/attention_mask/labels
    )

    # c) Trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # d) train & eval
    # With load_best_model_at_end=True the trainer holds the best-scoring
    # checkpoint after train(), so evaluate() reports that model's metrics.
    trainer.train()
    print(trainer.evaluate())


Casting the dataset:   0%|          | 0/27933 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3104 [00:00<?, ? examples/s]


=== Training longformer-2048 ===


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/27933 [00:00<?, ? examples/s]

Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Initializing global attention on CLS token...
Input ids are automaticall

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
500,1.0667,1.073859,0.431701,0.1439,0.333333,0.20102
1000,1.0511,1.079147,0.431701,0.1439,0.333333,0.20102
1500,1.0588,1.06515,0.431701,0.1439,0.333333,0.20102
2000,1.0511,1.070808,0.431701,0.1439,0.333333,0.20102
2500,1.0643,1.063552,0.431701,0.1439,0.333333,0.20102
3000,1.0657,1.063891,0.431701,0.1439,0.333333,0.20102
3500,1.0475,1.067414,0.431701,0.1439,0.333333,0.20102
4000,1.064,1.061569,0.431701,0.1439,0.333333,0.20102
4500,1.0629,1.050696,0.431701,0.1439,0.333333,0.20102
5000,0.83,0.833435,0.618235,0.430598,0.534914,0.464272


You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.8334345817565918, 'eval_accuracy': 0.6182345360824743, 'eval_precision': 0.43059791633780126, 'eval_recall': 0.5349139765752654, 'eval_f1_macro': 0.4642717328000339, 'eval_runtime': 57.3636, 'eval_samples_per_second': 54.111, 'eval_steps_per_second': 13.528, 'epoch': 3.0}

=== Training bigbird-2048 ===


tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/27933 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Attention type 'block_sparse' is not possible if sequence_length: 40 <= num global t

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
500,0.6216,0.567907,0.802513,0.789403,0.783966,0.786138
1000,0.4538,0.482061,0.846327,0.856339,0.811761,0.823596
1500,0.394,0.379213,0.884343,0.879254,0.876258,0.87519


You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenize

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
500,0.6216,0.567907,0.802513,0.789403,0.783966,0.786138
1000,0.4538,0.482061,0.846327,0.856339,0.811761,0.823596
1500,0.394,0.379213,0.884343,0.879254,0.876258,0.87519
2000,0.3178,0.384062,0.89433,0.892365,0.880725,0.885312
2500,0.3262,0.334262,0.899162,0.89005,0.893062,0.890648
3000,0.3277,0.287562,0.917204,0.910332,0.91328,0.911563
3500,0.2683,0.303945,0.921392,0.914848,0.921153,0.917434
4000,0.1984,0.293516,0.923647,0.921794,0.919066,0.920056
4500,0.2569,0.252598,0.928802,0.925196,0.926898,0.925806
5000,0.2387,0.244856,0.932345,0.930278,0.929437,0.929626


You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BigBirdTokenizerFast tokenizer. Please note that with a fast tokenize

{'eval_loss': 0.24485577642917633, 'eval_accuracy': 0.9323453608247423, 'eval_precision': 0.9302777767687266, 'eval_recall': 0.9294367875042121, 'eval_f1_macro': 0.9296262946090188, 'eval_runtime': 15.2645, 'eval_samples_per_second': 203.348, 'eval_steps_per_second': 50.837, 'epoch': 3.0}


# Save

In [None]:
import json
import os

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# map your model_name ↔ backbone
model_backbones = {
    "longformer-2048": "allenai/longformer-base-4096",
    "bigbird-2048":    "google/bigbird-roberta-base",
}

for model_name, backbone in model_backbones.items():
    root = f"./{model_name}-finetuned"
    # 1) find all checkpoint dirs under your output_dir
    ckpts = [
        d for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d)) and d.startswith("checkpoint-")
    ]
    if not ckpts:
        raise RuntimeError(f"No checkpoint-* dirs found in {root}")

    # 2) start from the largest-numbered (latest) checkpoint ...
    latest_ckpt = max(ckpts, key=lambda x: int(x.split("-")[-1]))
    best_ckpt_path = os.path.join(root, latest_ckpt)

    # ... and prefer the checkpoint the Trainer recorded as best, if any.
    # The latest checkpoint is not necessarily the best-scoring one: training
    # ran with load_best_model_at_end=True, and the final step may not have
    # been evaluated at all. trainer_state.json in the latest checkpoint
    # stores the path of the best checkpoint seen so far.
    state_file = os.path.join(best_ckpt_path, "trainer_state.json")
    if os.path.isfile(state_file):
        with open(state_file, encoding="utf-8") as fh:
            recorded = json.load(fh).get("best_model_checkpoint")
        if recorded:
            # Re-anchor on `root` so the lookup does not depend on the working
            # directory that was current when the state file was written.
            candidate = os.path.join(root, os.path.basename(os.path.normpath(recorded)))
            if os.path.isdir(candidate):
                best_ckpt_path = candidate
    best_ckpt = os.path.basename(best_ckpt_path)
    print(f"→ Loading model from {best_ckpt_path}")

    # 3) reload model + original tokenizer
    model     = AutoModelForSequenceClassification.from_pretrained(best_ckpt_path)
    tokenizer = AutoTokenizer.from_pretrained(backbone)

    # 4) save into a fresh "final" folder
    final_dir = f"./{model_name}-final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"✅ Saved both to {final_dir}\n")


→ Loading model from ./longformer-2048-finetuned/checkpoint-5238
✅ Saved both to ./longformer-2048-final

→ Loading model from ./bigbird-2048-finetuned/checkpoint-5238
✅ Saved both to ./bigbird-2048-final



# Load

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the exported Longformer artifacts (tokenizer first, then weights)
# from the "final" directory written by the save step.
tokenizer = AutoTokenizer.from_pretrained("./longformer-2048-final")
model = AutoModelForSequenceClassification.from_pretrained("./longformer-2048-final")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1) Directory holding the exported model + tokenizer
model_dir = "./longformer-2048-final"

# 2) Choose the device up front: GPU when one is visible, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 3) Load tokenizer & model, move the model to the device, switch to eval mode
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

# 4) A single headline to classify
sample_text = (
    "Bitcoin surges 10% after major exchange announces new institutional custody offering."
)

# 5) Tokenize (truncated to 2048 tokens; a single example needs no padding)
#    and move every tensor to the model's device
inputs = tokenizer(sample_text, truncation=True, max_length=2048, return_tensors="pt")
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# 6) Forward pass without gradient tracking, then softmax over the class axis
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits                     # shape (1, 3)
    probs = logits.softmax(dim=-1).squeeze()    # shape (3,)

# 7) Map the arg-max class id to its name (same order as the training labels)
label_names = ["negative", "neutral", "positive"]
pred_id = int(probs.argmax())
pred_label = label_names[pred_id]
pred_prob = float(probs[pred_id])

print(f"Text:   {sample_text}")
print(f"Predicted sentiment: {pred_label} (p={pred_prob:.2f})")


Text:   Bitcoin surges 10% after major exchange announces new institutional custody offering.
Predicted sentiment: positive (p=0.90)


# Save to drive

In [None]:
import shutil, os

# Destination folder on the mounted Drive; created if it does not exist yet.
DRIVE_FOLDER = '/content/drive/MyDrive/ColabModels'
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Copy each exported "final" directory into Drive under the same name.
# dirs_exist_ok=True lets a re-run copy into an already existing target.
shutil.copytree(
    src='/content/longformer-2048-final',
    dst=os.path.join(DRIVE_FOLDER, 'longformer-2048-final'),
    dirs_exist_ok=True,
)

shutil.copytree(
    src='/content/bigbird-2048-final',
    dst=os.path.join(DRIVE_FOLDER, 'bigbird-2048-final'),
    dirs_exist_ok=True,
)


Error: [('/content/bigbird-2048-finetuned/checkpoint-1000/optimizer.pt', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/optimizer.pt', '[Errno 2] No such file or directory'), ('/content/bigbird-2048-finetuned/checkpoint-1000/config.json', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/config.json', "[Errno 2] No such file or directory: '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/config.json'"), ('/content/bigbird-2048-finetuned/checkpoint-1000/scheduler.pt', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/scheduler.pt', "[Errno 2] No such file or directory: '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/scheduler.pt'"), ('/content/bigbird-2048-finetuned/checkpoint-1000/trainer_state.json', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/trainer_state.json', "[Errno 2] No such file or directory: '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/trainer_state.json'"), ('/content/bigbird-2048-finetuned/checkpoint-1000/training_args.bin', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/training_args.bin', "[Errno 2] No such file or directory: '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000/training_args.bin'"), (<DirEntry 'checkpoint-1000'>, '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned/checkpoint-1000', '[Errno 2] No such file or directory'), ('/content/bigbird-2048-finetuned', '/content/drive/MyDrive/ColabModels/bigbird-2048-finetuned', '[Errno 2] No such file or directory')]

In [None]:
# 1) Load & preprocess into news_df_finetuning
# String cells in "sentiment" are parsed as Python literals (non-strings pass
# through); the label string is the parsed value's "class" entry.
news_df_finetuning = pd.read_csv("/content/drive/MyDrive/cryptonews.csv")
news_df_finetuning["sentiment_dict"] = news_df_finetuning["sentiment"].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)
news_df_finetuning["label_str"] = news_df_finetuning["sentiment_dict"].apply(
    lambda entry: entry["class"]
)

# 2) HuggingFace Dataset & split
# Fixed-seed 90/10 split; the "test" part serves as the validation set.
hf_ds = (
    Dataset.from_pandas(
        news_df_finetuning[["text", "label_str"]],
        preserve_index=False,
    )
    .train_test_split(test_size=0.1, seed=42)
)
hf_ds = DatasetDict({"train": hf_ds["train"], "validation": hf_ds["test"]})

# 3) Cast & rename label
# Order of `names` fixes the ids: negative=0, neutral=1, positive=2.
hf_ds = hf_ds.cast_column(
    "label_str", ClassLabel(names=["negative", "neutral", "positive"])
).rename_column("label_str", "labels")

# 4) Preprocess fn
def preprocess(batch, tokenizer, max_length):
    """Tokenize the "text" field of a batch to exactly `max_length` tokens.

    Every example is truncated and padded to `max_length`, so all encoded
    rows have the same length.

    Args:
        batch: Mapping with a "text" entry holding the raw texts.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Target token length for truncation and padding.

    Returns:
        Whatever the tokenizer returns.
    """
    texts = batch["text"]
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(texts, **encode_options)

# 5) Metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an eval pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, or a tuple whose first item
            holds them).

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1_macro".
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return (logits, extras...); only the logits are scored.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds  = np.argmax(scores, axis=1)
    acc    = accuracy_score(labels, preds)
    # zero_division=0 yields the same 0.0 score the default produces for a
    # class that is never predicted, but without emitting a warning on every
    # evaluation.
    p, r, f, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {"accuracy":acc, "precision":p, "recall":r, "f1_macro":f}

# 6) Collator
def collate_fn(batch):
    """Stack already equal-length examples into long tensors.

    Args:
        batch: List of dicts, each with "input_ids", "attention_mask" and
            "labels". The id and mask rows must share one length, since
            they are converted to tensors without any padding.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" as torch.long
        tensors, in that key order.
    """
    def _stack(field):
        # Gather one field across the batch and convert it in a single call.
        return torch.tensor([example[field] for example in batch], dtype=torch.long)

    return {
        field: _stack(field)
        for field in ("input_ids", "attention_mask", "labels")
    }

# 7) Training loop
# Class names in ClassLabel id order. They are stored in each model's config
# so saved checkpoints map ids to readable labels instead of LABEL_0/1/2.
label_names = hf_ds["train"].features["labels"].names

for model_name, backbone, max_len in [
    ("longformer-2048", "allenai/longformer-base-4096", 2048),
    ("bigbird-2048",    "google/bigbird-roberta-base",   2048),
]:
    print(f"\n=== Training {model_name} ===")
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    model     = AutoModelForSequenceClassification.from_pretrained(
        backbone,
        num_labels=len(label_names),
        id2label=dict(enumerate(label_names)),
        label2id={name: idx for idx, name in enumerate(label_names)},
    )

    # a) Tokenize, dropping only "text"
    # preprocess pads every row to max_len, which is what lets collate_fn
    # build tensors without any further padding.
    tokenized = hf_ds.map(
        lambda batch: preprocess(batch, tokenizer, max_length=max_len),
        batched=True,
        remove_columns=["text"]
    )

    # b) Prepare TrainingArguments: evaluate and save every 500 steps, and
    #    restore the checkpoint with the best eval f1_macro when training ends
    args = TrainingArguments(
        output_dir=f"./{model_name}-finetuned",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        fp16=torch.cuda.is_available(),
        report_to=[],  # disable W&B
    )

    # c) Trainer with custom collator
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=`; confirm against the installed
    # version before changing it.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
    )

    # d) Train & evaluate
    trainer.train()
    print(trainer.evaluate())