In [None]:

from google.colab import drive
import nbformat, os

drive.mount('/content/drive', force_remount=True)
SRC = '/content/drive/MyDrive/Colab Notebooks/Untitled20.ipynb'
DST = '/content/drive/MyDrive/colab_saves/L0RA for Owen 1.5B.ipynb'

with open(SRC, 'r', encoding='utf-8') as f:
    nb = nbformat.read(f, as_version=4)


nb.metadata.pop('widgets', None)
for cell in nb.cells:
    cell.metadata.pop('widgets', None)
os.makedirs(os.path.dirname(DST), exist_ok=True)
with open(DST, 'w', encoding='utf-8') as f:
    nbformat.write(nb, f)

print("✅ Clean copy saved to", DST)

In [1]:
import os, math, numpy as np, pandas as pd, torch, torch.nn as nn
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/colab_saves/byday_with_diff_clean.pkl"
df = pd.read_pickle(file_path)
df = df[['titles', 'difference']].dropna()
df = df[df['titles'].astype(str).str.len() > 0]
df = df.drop_duplicates(subset=['titles']).reset_index(drop=True)

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
len(train_df), len(val_df), train_df.head(2)
train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_NAME = "Qwen/Qwen2.5-1.5B"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


encoder = AutoModel.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,
    use_cache=False
)


if hasattr(encoder, 'gradient_checkpointing_disable'):
    encoder.gradient_checkpointing_disable()

encoder.to(device).eval()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Qwen2Model(
  (embed_tokens): Embedding(151936, 1536)
  (layers): ModuleList(
    (0-27): 28 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((1536,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [3]:
import torch.nn as nn

def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    return (last_hidden_state * mask).sum(dim=1) / torch.clamp(mask.sum(dim=1), 1e-9)

class EncoderWithRegHead(nn.Module):
    def __init__(self, encoder, hidden_size):
        super().__init__()
        self.encoder = encoder
        self.reg_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden = out.last_hidden_state
        pooled = mean_pool(last_hidden, attention_mask)
        pred = self.reg_head(pooled).squeeze(-1)
        loss = None
        if labels is not None:
            loss = nn.MSELoss()(pred, labels)
        return {"loss": loss, "logits": pred}

model_core = EncoderWithRegHead(encoder, encoder.config.hidden_size)


In [4]:
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="FEATURE_EXTRACTION",
)


model_core.encoder = get_peft_model(model_core.encoder, lora_cfg)


if hasattr(model_core.encoder.config, "use_cache"):
    model_core.encoder.config.use_cache = False


model_core.encoder.print_trainable_parameters()


model_core = model_core.to(device)

trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705


In [5]:
from dataclasses import dataclass
from transformers.data.data_collator import DataCollatorMixin
from typing import Any, Dict, List
import torch

@dataclass
class DataCollatorForRegression(DataCollatorMixin):
    tokenizer: Any
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:

        labels = [f.pop("labels") for f in features] if "labels" in features[0] else None

        batch = self.tokenizer.pad(
            features,
            padding=True,
            return_tensors=self.return_tensors,
        )


        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.float)

        return batch

data_collator = DataCollatorForRegression(tokenizer=tokenizer)

In [8]:
from torch.utils.data import Dataset
import torch

class TitlesRegDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64, text_col="titles", y_col="difference"):
        df = df[[text_col, y_col]].copy()
        df[y_col] = pd.to_numeric(df[y_col], errors="coerce")
        df = df.dropna(subset=[y_col]).reset_index(drop=True)

        self.df = df
        self.tk = tokenizer
        self.max_len = max_len
        self.text_col = text_col
        self.y_col = y_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row[self.text_col])
        y = float(row[self.y_col])

        enc = self.tk(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(y, dtype=torch.float)
        return item


train_ds = TitlesRegDataset(train_df, tokenizer, max_len=64)
val_ds = TitlesRegDataset(val_df, tokenizer, max_len=64)

print(f"Training dataset: {len(train_ds)} samples")
print(f"Validation dataset: {len(val_ds)} samples")

Training dataset: 413 samples
Validation dataset: 104 samples


In [9]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import Subset

args = TrainingArguments(
    output_dir="./out_single",
    max_steps=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    logging_steps=1,
    logging_first_step=True,
    save_strategy="no",
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    fp16=True,
    gradient_checkpointing=False,
)


small_train = Subset(train_ds, range(min(8, len(train_ds))))

trainer = Trainer(
    model=model_core,
    args=args,
    train_dataset=small_train,
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Train
trainer.train()

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,4.1494
2,23.8899
3,0.2062


TrainOutput(global_step=3, training_loss=9.415194789568583, metrics={'train_runtime': 959.4888, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'total_flos': 0.0, 'train_loss': 9.415194789568583, 'epoch': 0.375})

In [10]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


predictions = trainer.predict(val_ds)
preds = predictions.predictions
labels = predictions.label_ids


mse = mean_squared_error(labels, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(labels, preds)
r2 = r2_score(labels, preds)

print("=" * 50)
print("VALIDATION SET METRICS")
print("=" * 50)
print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"MAE (Mean Absolute Error):      {mae:.4f}")
print(f"R² (R-squared):                 {r2:.4f}")
print("=" * 50)

print("\nSample Predictions:")
print("-" * 50)
for i in range(min(5, len(val_df))):
    print(f"Title: {val_df.iloc[i]['titles'][:60]}...")
    print(f"  Actual: {labels[i]:.2f} | Predicted: {preds[i]:.2f}")
    print()

VALIDATION SET METRICS
RMSE (Root Mean Squared Error): 7.1924
MAE (Mean Absolute Error):      4.7791
R² (R-squared):                 -0.2572

Sample Predictions:
--------------------------------------------------
Title: ['Hyperscaler demand remains strong despite tariff concerns: Wedbush', 'Some sellers back out of Amazon Prime Day on Trump tariff concerns - report', 'Anthropic joins OpenAI, forms group looking into economic impact of AI', 'Retail blockbuster: Amazon earnings this week could reset expectations across the retail sector', 'What to do about AI, according to BlackRock']...
  Actual: -3.71 | Predicted: 2.52

Title: ['SoftBank & OpenAI set up an AI focused joint venture in Japan']...
  Actual: 1.77 | Predicted: 3.14

Title: ['Bill Gates to donate 99% of his remaining fortune to the Gates Foundation; org to close in 2045', 'Silicon Valley execs to push for softer AI regulatory approach in Senate hearing', 'FDA to speed up AI rollout to accelerate drug reviews']...
  Actual: 1