In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# ============================================================
# Qwen2-1.5B Multitask Fine-tuning with LoRA (Corrected)
# ============================================================
# Run in a single Colab cell. Adjust DATA_ROOT and OUT_DIR as needed.

!pip install -q -U transformers accelerate peft bitsandbytes datasets safetensors

import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# -------------------------
# User config
# -------------------------
DATA_ROOT = "/content/drive/MyDrive/humAID_dataset"   # change if needed
OUT_DIR = "/content/drive/MyDrive/humAID_Qwen2_lora_fixed"
os.makedirs(OUT_DIR, exist_ok=True)

BASE_MODEL = "Qwen/Qwen2-1.5B"
MAX_LENGTH = 128
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 0
SEED = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
torch.manual_seed(SEED)

# ============================================================
# Data loading (same as your original pipeline)
# ============================================================
def read_messy_tsv(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    split_lines = [line.strip().split('\t')[-3:] for line in lines]
    data = split_lines[1:]
    df = pd.DataFrame(data, columns=['tweet_text', 'class_label', 'disaster_type'])
    df = df[~df['tweet_text'].str.contains('tweet_text', case=False, na=False)]
    df = df[~df['class_label'].str.contains('class_label', case=False, na=False)]
    df = df[~df['disaster_type'].str.contains('disaster_type', case=False, na=False)]
    df.reset_index(drop=True, inplace=True)
    return df

def load_all_splits(base_folder):
    train_list, dev_list, test_list = [], [], []
    for subfolder in os.listdir(base_folder):
        subpath = os.path.join(base_folder, subfolder)
        if not os.path.isdir(subpath):
            continue
        for split in ['train', 'dev', 'test']:
            file_path = os.path.join(subpath, f"{subfolder}_{split}.tsv")
            if os.path.exists(file_path):
                df = read_messy_tsv(file_path)
                if split == 'train':
                    train_list.append(df)
                elif split == 'dev':
                    dev_list.append(df)
                else:
                    test_list.append(df)
    train_df = pd.concat(train_list, ignore_index=True) if train_list else pd.DataFrame()
    dev_df = pd.concat(dev_list, ignore_index=True) if dev_list else pd.DataFrame()
    test_df = pd.concat(test_list, ignore_index=True) if test_list else pd.DataFrame()
    print(f"✅ Loaded: {len(train_df)} train, {len(dev_df)} dev, {len(test_df)} test samples.")
    return train_df, dev_df, test_df

train_df, dev_df, test_df = load_all_splits(DATA_ROOT)

for df in [train_df, dev_df, test_df]:
    df.rename(columns={'tweet_text': 'text', 'class_label': 'text_humanitarian'}, inplace=True)
    df.dropna(subset=['text', 'text_humanitarian', 'disaster_type'], inplace=True)

train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Label encoding
disaster_encoder = LabelEncoder()
human_encoder = LabelEncoder()
train_df['disaster_label'] = disaster_encoder.fit_transform(train_df['disaster_type'])
train_df['human_label'] = human_encoder.fit_transform(train_df['text_humanitarian'])

test_df['disaster_label'] = test_df['disaster_type'].map(
    lambda x: disaster_encoder.transform([x])[0] if x in disaster_encoder.classes_ else -1
)
test_df['human_label'] = test_df['text_humanitarian'].map(
    lambda x: human_encoder.transform([x])[0] if x in human_encoder.classes_ else -1
)
test_df = test_df[(test_df['disaster_label'] != -1) & (test_df['human_label'] != -1)]

num_labels_disaster = len(disaster_encoder.classes_)
num_labels_human = len(human_encoder.classes_)
print("Disaster types:", list(disaster_encoder.classes_))
print("Humanitarian types:", list(human_encoder.classes_))

# ============================================================
# Tokenizer and dataset
# ============================================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class CrisisDataset(Dataset):
    def __init__(self, df):
        self.texts = df['text'].tolist()
        self.disaster = df['disaster_label'].tolist()
        self.human = df['human_label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'disaster_label': torch.tensor(self.disaster[idx], dtype=torch.long),
            'human_label': torch.tensor(self.human[idx], dtype=torch.long)
        }

train_dataset = CrisisDataset(train_df)
test_dataset = CrisisDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# ============================================================
# Load base model in 4-bit + prepare for LoRA
# ============================================================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

print("Loading base model (this may take a while)...")
base_peft_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare for k-bit training
base_peft_model = prepare_model_for_kbit_training(base_peft_model)

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(base_peft_model, lora_config)
print("PEFT/LoRA modules injected.")
peft_model.print_trainable_parameters()

# ============================================================
# Multitask wrapper (robust attribute access for wrapped model)
# ============================================================
class QwenForMultiTask(nn.Module):
    def __init__(self, peft_model, hidden_size, n_disaster, n_human):
        super().__init__()
        # store peft model (this contains the quantized base and LoRA adapters)
        self.peft_model = peft_model
        self.dropout = nn.Dropout(0.2)
        self.disaster_head = nn.Linear(hidden_size, n_disaster)
        self.human_head = nn.Linear(hidden_size, n_human)

    def _get_base_transformer(self):
        """
        Return the actual transformer module inside the HF wrapper.
        This is robust across different wrappers: try common attribute names.
        """
        # peft_model may have attributes like .model, .base_model, .transformer, etc.
        cand = getattr(self.peft_model, "model", None)
        if cand is None:
            cand = getattr(self.peft_model, "base_model", None)
        if cand is None:
            cand = getattr(self.peft_model, "transformer", None)
        if cand is None:
            # fallback to peft_model itself
            cand = self.peft_model
        return cand

    def forward(self, input_ids, attention_mask):
        base = self._get_base_transformer()

        # Ensure we disable cache and request hidden states
        outputs = base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            use_cache=False,
            return_dict=True
        )

        # For decoder-only models, take last layer's hidden states and pool last token
        hidden_states = outputs.hidden_states  # tuple of layers
        last_hidden = hidden_states[-1]        # [B, seq_len, hidden]
        pooled = last_hidden[:, -1, :]         # last-token embedding
        pooled = self.dropout(pooled)

        d_logits = self.disaster_head(pooled)
        h_logits = self.human_head(pooled)
        return d_logits, h_logits

# find hidden size robustly
# Many wrappers: peft_model.base_model.model.config.hidden_size or peft_model.config.hidden_size
if hasattr(peft_model, "base_model") and hasattr(peft_model.base_model, "model"):
    hidden_size = peft_model.base_model.model.config.hidden_size
elif hasattr(peft_model, "base_model") and hasattr(peft_model.base_model, "config"):
    hidden_size = peft_model.base_model.config.hidden_size
elif hasattr(peft_model, "model") and hasattr(peft_model.model, "config"):
    hidden_size = peft_model.model.config.hidden_size
else:
    hidden_size = peft_model.config.hidden_size

multi_model = QwenForMultiTask(peft_model, hidden_size, num_labels_disaster, num_labels_human).to(device)

# ============================================================
# Optimizer (only trainable params: LoRA + heads)
# ============================================================
trainable_params = [p for p in multi_model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

# Use new torch.amp API if CUDA
if device.type == "cuda":
    scaler = torch.amp.GradScaler("cuda")
else:
    scaler = None  # no scaler for CPU

criterion = nn.CrossEntropyLoss()

# ============================================================
# Training loop
# ============================================================
multi_model.train()
for epoch in range(EPOCHS):
    print(f"\n=== Epoch {epoch+1}/{EPOCHS} ===")
    epoch_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        d_labels = batch['disaster_label'].to(device)
        h_labels = batch['human_label'].to(device)

        if device.type == "cuda":
            with torch.amp.autocast("cuda"):
                d_logits, h_logits = multi_model(input_ids=input_ids, attention_mask=attention_mask)
                loss_d = criterion(d_logits, d_labels)
                loss_h = criterion(h_logits, h_labels)
                loss = loss_d + loss_h

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            # CPU fallback
            d_logits, h_logits = multi_model(input_ids=input_ids, attention_mask=attention_mask)
            loss_d = criterion(d_logits, d_labels)
            loss_h = criterion(h_logits, h_labels)
            loss = loss_d + loss_h
            loss.backward()
            optimizer.step()

        scheduler.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1} avg loss: {avg_loss:.4f}")

# ============================================================
# Evaluation
# ============================================================
multi_model.eval()
true_d_all, pred_d_all = [], []
true_h_all, pred_h_all = [], []
results = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        d_labels = batch['disaster_label'].to(device)
        h_labels = batch['human_label'].to(device)

        d_logits, h_logits = multi_model(input_ids=input_ids, attention_mask=attention_mask)
        d_preds = torch.argmax(d_logits, dim=1)
        h_preds = torch.argmax(h_logits, dim=1)

        true_d_all.extend(d_labels.cpu().tolist())
        pred_d_all.extend(d_preds.cpu().tolist())
        true_h_all.extend(h_labels.cpu().tolist())
        pred_h_all.extend(h_preds.cpu().tolist())

        for i in range(len(d_preds)):
            results.append({
                "text": tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True),
                "true_disaster": disaster_encoder.inverse_transform([d_labels[i].cpu().item()])[0],
                "pred_disaster": disaster_encoder.inverse_transform([d_preds[i].cpu().item()])[0],
                "true_human": human_encoder.inverse_transform([h_labels[i].cpu().item()])[0],
                "pred_human": human_encoder.inverse_transform([h_preds[i].cpu().item()])[0],
            })

# ============================================================
# Save artifacts: LoRA adapters + heads + tokenizer + label maps
# ============================================================
# Save PEFT adapters (LoRA)
peft_model.save_pretrained(OUT_DIR)

# Save the small heads' weights separately
heads_state = {
    "disaster_head": multi_model.disaster_head.state_dict(),
    "human_head": multi_model.human_head.state_dict()
}
torch.save(heads_state, os.path.join(OUT_DIR, "task_heads.pth"))

# Save tokenizer and label maps
tokenizer.save_pretrained(OUT_DIR)
with open(os.path.join(OUT_DIR, "label_maps.json"), "w") as f:
    json.dump({
        "disaster_labels": dict(enumerate(disaster_encoder.classes_)),
        "human_labels": dict(enumerate(human_encoder.classes_))
    }, f)

pd.DataFrame(results).to_csv(os.path.join(OUT_DIR, "test_predictions.csv"), index=False)

# Reports
print("\n===== Disaster Classification =====")
print(classification_report(true_d_all, pred_d_all, target_names=disaster_encoder.classes_))
print("Accuracy:", accuracy_score(true_d_all, pred_d_all))

print("\n===== Humanitarian Classification =====")
print(classification_report(true_h_all, pred_h_all, target_names=human_encoder.classes_))
print("Accuracy:", accuracy_score(true_h_all, pred_h_all))

print(f"\n✅ Training complete. Artifacts saved to: {OUT_DIR}")


Device: cuda
✅ Loaded: 28812 train, 4194 dev, 8161 test samples.
Disaster types: ['Cyclone', 'Earthquake', 'Flood', 'Hurricane', 'Wildfire']
Humanitarian types: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'missing_or_found_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
Loading base model (this may take a while)...
PEFT/LoRA modules injected.
trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705

=== Epoch 1/3 ===


  return fn(*args, **kwargs)


Epoch 1 avg loss: 1.7164

=== Epoch 2/3 ===
Epoch 2 avg loss: 0.7690

=== Epoch 3/3 ===
Epoch 3 avg loss: 0.6927

===== Disaster Classification =====
              precision    recall  f1-score   support

     Cyclone       1.00      0.98      0.99       779
  Earthquake       0.95      0.96      0.96      1384
       Flood       0.94      0.89      0.91       259
   Hurricane       0.98      0.98      0.98      5438
    Wildfire       0.99      0.98      0.99       301

    accuracy                           0.98      8161
   macro avg       0.97      0.96      0.97      8161
weighted avg       0.98      0.98      0.98      8161

Accuracy: 0.976718539394682

===== Humanitarian Classification =====
                                        precision    recall  f1-score   support

                    caution_and_advice       0.73      0.61      0.66       433
      displaced_people_and_evacuations       0.91      0.92      0.92       347
     infrastructure_and_utility_damage       0.84  