In [2]:
# Mount Google Drive at /content/drive; the dataset folder and the results CSV
# used further down both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# =============================
# Qwen Prompt-Based Multi-Class Disaster Tweet Classification (humAID)
# =============================
# Author: Vidushi Goyal
# =============================

# STEP 1: Imports
# =============================
import pandas as pd
import torch
import os
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# =============================
# STEP 2: Load dataset (same as before)
# =============================

def read_messy_tsv(file_path):
    """Load one TSV split into a DataFrame of its last three columns.

    The first line of the file is treated as the header and skipped. Each
    remaining line is whitespace-stripped, split on tabs, and only its last
    three fields are kept (so any leading id column(s) are ignored).

    Lines that yield fewer than three fields (blank lines, truncated rows) are
    skipped. Previously such lines either produced rows padded with ``None``
    or, when no line had three fields, made the DataFrame constructor raise.

    Args:
        file_path: Path to a UTF-8 encoded, tab-separated file.

    Returns:
        pandas.DataFrame with string columns ``tweet_text``, ``class_label``
        and ``disaster_type`` and a fresh 0..n-1 index.
    """
    expected_columns = ['tweet_text', 'class_label', 'disaster_type']

    rows = []
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            fields = line.strip().split('\t')[-3:]
            if len(fields) < len(expected_columns):
                # Blank or malformed line: cannot be aligned to the three columns.
                continue
            rows.append(fields)

    df = pd.DataFrame(rows, columns=expected_columns)

    # Drop stray header rows repeated inside the body: any row whose cell
    # contains its own column name (case-insensitive) is discarded.
    for column in expected_columns:
        df = df[~df[column].str.contains(column, case=False, na=False)]

    return df.reset_index(drop=True)


# Root folder on the mounted Drive; load_all_splits() scans each of its
# sub-folders for "<subfolder>_train.tsv", "<subfolder>_dev.tsv" and "<subfolder>_test.tsv".
data_folder = "/content/drive/MyDrive/humAID_dataset/"

def load_all_splits(base_folder):
    """Read every event sub-folder and concatenate its train/dev/test TSVs.

    For each directory ``<base_folder>/<name>/`` the files
    ``<name>_train.tsv``, ``<name>_dev.tsv`` and ``<name>_test.tsv`` are loaded
    with ``read_messy_tsv`` when they exist. Non-directory entries are ignored.

    Sub-folders are visited in sorted order: ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the row order of the returned
    frames (and therefore any seeded ``DataFrame.sample`` taken from them)
    vary between sessions.

    Args:
        base_folder: Directory containing one sub-folder per event.

    Returns:
        Tuple ``(train_df, dev_df, test_df)``. A split with no files is an
        empty ``pandas.DataFrame``.
    """
    frames_by_split = {'train': [], 'dev': [], 'test': []}

    for subfolder in sorted(os.listdir(base_folder)):
        subpath = os.path.join(base_folder, subfolder)
        if not os.path.isdir(subpath):
            continue
        for split, frames in frames_by_split.items():
            file_path = os.path.join(subpath, f"{subfolder}_{split}.tsv")
            if os.path.exists(file_path):
                frames.append(read_messy_tsv(file_path))

    train_df, dev_df, test_df = [
        pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        for frames in frames_by_split.values()
    ]

    print(f"✅ Loaded: {len(train_df)} train, {len(dev_df)} dev, {len(test_df)} test samples.")
    return train_df, dev_df, test_df


train_df, dev_df, test_df = load_all_splits(data_folder)

# Clean each split in place, one frame at a time:
#   1. drop a header row that slipped through as the first record,
#   2. renumber the index,
#   3. rename the text/label columns to the names used by the rest of the notebook,
#   4. discard rows missing any of the three required fields.
for split_df in (train_df, dev_df, test_df):
    first_row_is_header = len(split_df) > 0 and split_df.iloc[0]['tweet_text'] == 'tweet_text'
    if first_row_is_header:
        split_df.drop(index=0, inplace=True)
    split_df.reset_index(drop=True, inplace=True)

    split_df.rename(
        columns={'tweet_text': 'text', 'class_label': 'text_humanitarian'},
        inplace=True,
    )
    split_df.dropna(
        subset=['text', 'text_humanitarian', 'disaster_type'],
        inplace=True,
    )

# The dev rows are appended to the training frame; test_df stays separate.
train_df = pd.concat([train_df, dev_df], ignore_index=True)

# =============================
# STEP 3: Encode Labels
# =============================
# One encoder per task; both are fitted on the (train + dev) frame only.
disaster_encoder = LabelEncoder()
human_encoder = LabelEncoder()

train_df['disaster_label'] = disaster_encoder.fit_transform(train_df['disaster_type'])
train_df['human_label'] = human_encoder.fit_transform(train_df['text_humanitarian'])

# LabelEncoder assigns each class its position in `classes_`, so a plain dict
# lookup gives the same ids as calling `transform` once per row, without the
# per-row overhead. Labels never seen during fitting become -1.
disaster_to_id = {label: idx for idx, label in enumerate(disaster_encoder.classes_)}
human_to_id = {label: idx for idx, label in enumerate(human_encoder.classes_)}

test_df['disaster_label'] = test_df['disaster_type'].map(disaster_to_id).fillna(-1).astype('int64')
test_df['human_label'] = test_df['text_humanitarian'].map(human_to_id).fillna(-1).astype('int64')

# Keep only test rows whose labels are both known to the encoders. `.copy()`
# makes the result an independent frame rather than a view of the original.
test_df = test_df[(test_df['disaster_label'] != -1) & (test_df['human_label'] != -1)].copy()

print("\nDisaster types:", list(disaster_encoder.classes_))
print("Humanitarian types:", list(human_encoder.classes_))

# =============================
# STEP 4: Setup Qwen model (prompt-based, memory optimized)
# =============================
# Hugging Face model id of the instruction-tuned checkpoint used for prompting.
model_name = "Qwen/Qwen2.5-7B-Instruct"

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Detect device: half precision with automatic placement on GPU,
# full precision on CPU otherwise.
if torch.cuda.is_available():
    device_map = "auto"
    torch_dtype = torch.float16
else:
    device_map = "cpu"
    torch_dtype = torch.float32

print(f"⚙️ Loading {model_name} with memory optimization...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights directly onto the chosen device(s).
# NOTE: recent transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; the old keyword is kept here so older releases still work.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True
)

# Create generator pipeline around the already-loaded model (no second copy).
# Decoding is greedy (`do_sample=False`), so no `temperature` is passed:
# sampling parameters are ignored in that mode and only trigger a
# "generation flags are not valid" warning.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    do_sample=False
)

print("✅ Model loaded successfully with optimized memory settings.")


# =============================
# STEP 5: Define Prompt Function
# =============================

def build_prompt(tweet):
    """Return the classification prompt for a single tweet.

    The prompt lists every disaster type and humanitarian category known to
    the module-level ``disaster_encoder`` / ``human_encoder``, quotes the
    tweet, and asks for a JSON object with the keys ``disaster_type`` and
    ``humanitarian_category``. The text begins and ends with a newline.
    """
    disaster_options = ', '.join(disaster_encoder.classes_)
    category_options = ', '.join(human_encoder.classes_)

    prompt_lines = [
        "",  # leading newline
        "You are an expert disaster response analyst.",
        "",
        "Task 1: Identify the type of disaster in the tweet.",
        "Task 2: Identify the humanitarian category of the tweet.",
        "",
        "Here are the possible disaster types:",
        disaster_options,
        "",
        "Here are the possible humanitarian categories:",
        category_options,
        "",
        f'Tweet: "{tweet}"',
        "",
        "Give your answer strictly in the following JSON format:",
        '{"disaster_type": "...", "humanitarian_category": "..."}',
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)

import os
import json
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

def _extract_prediction(generated_text):
    """Parse a model completion into ``(disaster_type, humanitarian_category)``.

    The last ``{...}`` span in the text is taken as the answer. It is first
    parsed as-is; if that fails, curly/single quotes are normalised to double
    quotes and a trailing comma before ``}`` is removed, then parsing is
    retried. Any field that is missing, empty, or not a string yields
    ``"unknown"`` so that every stored prediction is a non-empty string.
    """
    import re

    json_candidates = re.findall(r'\{.*?\}', generated_text, re.DOTALL)
    if not json_candidates:
        return "unknown", "unknown"

    raw_json = json_candidates[-1]
    repaired_json = raw_json.replace("“", "\"").replace("”", "\"").replace("'", "\"")
    repaired_json = re.sub(r",\s*}", "}", repaired_json)

    for candidate in (raw_json, repaired_json):
        try:
            parsed = json.loads(candidate)
            pred_disaster = parsed.get("disaster_type", "").strip()
            pred_human = parsed.get("humanitarian_category", "").strip()
        except (ValueError, AttributeError, TypeError):
            # Invalid JSON, a non-object payload, or non-string field values.
            continue
        return pred_disaster or "unknown", pred_human or "unknown"

    return "unknown", "unknown"


def run_inference_and_save(generator, test_df, disaster_encoder, human_encoder, save_path, n_samples=10, random_state=42):
    """
    Runs prompt-based inference on n_samples from test_df, appends results to CSV,
    and evaluates on the cumulative data (all runs so far).

    Args:
        generator: Callable used as ``generator(prompt, **generation_kwargs)``
            returning a list whose first item has a ``'generated_text'`` key
            (e.g. a transformers text-generation pipeline).
        test_df: DataFrame with ``text``, ``disaster_type`` and
            ``text_humanitarian`` columns.
        disaster_encoder, human_encoder: Objects exposing ``classes_``; used
            as the label lists of the classification reports.
        save_path: CSV file holding the cumulative results. Rows whose
            ``text`` is already present there are not processed again.
        n_samples: Maximum number of new rows to process in this call.
        random_state: Seed for choosing the new rows.

    Returns:
        None. Results are written to ``save_path`` and reports are printed.

    Notes:
        * The prompt echoed back by the generator is stripped before parsing,
          so the JSON template inside the prompt is never mistaken for the
          model's answer.
        * Results are written in a ``finally`` block: if inference is
          interrupted, the samples finished so far are still saved before the
          exception propagates.
    """
    result_columns = ["text", "true_disaster", "pred_disaster", "true_human", "pred_human"]

    # --- Load existing results if present ---
    if os.path.exists(save_path):
        results_df = pd.read_csv(save_path)
        # Empty predictions are written as empty cells and read back as NaN;
        # restore them to a string so the label lists stay homogeneous.
        pred_columns = ["pred_disaster", "pred_human"]
        results_df[pred_columns] = results_df[pred_columns].fillna("unknown")
        done_texts = set(results_df["text"].tolist())
        print(f"📂 Found existing results: {len(results_df)} samples already processed.")
    else:
        results_df = pd.DataFrame(columns=result_columns)
        done_texts = set()
        print("🆕 No existing results found. Starting fresh.")

    # --- Pick new samples not yet processed ---
    remaining_df = test_df[~test_df["text"].isin(done_texts)]
    if len(remaining_df) == 0:
        print("✅ All samples already processed!")
        return

    subset = remaining_df.sample(n=min(n_samples, len(remaining_df)), random_state=random_state)
    print(f"🧪 Running inference on {len(subset)} new samples...")

    # --- Run inference for selected samples ---
    new_results = []
    try:
        for _, row in subset.iterrows():
            tweet = row['text']

            prompt = build_prompt(tweet)
            # Greedy decoding; no temperature is passed because it is ignored
            # when do_sample=False.
            response = generator(prompt, max_new_tokens=100, do_sample=False)[0]['generated_text']
            print("\n---RAW RESPONSE---\n", response)

            # Parse only what the model added after the prompt.
            completion = response[len(prompt):] if response.startswith(prompt) else response
            pred_disaster, pred_human = _extract_prediction(completion)

            new_results.append({
                "text": tweet,
                "true_disaster": row['disaster_type'],
                "pred_disaster": pred_disaster,
                "true_human": row['text_humanitarian'],
                "pred_human": pred_human
            })
    finally:
        # --- Append new results to existing CSV (also on interruption) ---
        new_df = pd.DataFrame(new_results, columns=result_columns)
        non_empty = [frame for frame in (results_df, new_df) if not frame.empty]
        combined_df = pd.concat(non_empty, ignore_index=True) if non_empty else results_df
        combined_df.to_csv(save_path, index=False)
        print(f"\n💾 Saved cumulative results to: {save_path} (Total {len(combined_df)} samples)")

    if combined_df.empty:
        print("⚠️ No results to evaluate yet.")
        return

    # --- Compute cumulative evaluation ---
    true_disaster_all = combined_df["true_disaster"].tolist()
    pred_disaster_all = combined_df["pred_disaster"].tolist()
    true_human_all = combined_df["true_human"].tolist()
    pred_human_all = combined_df["pred_human"].tolist()

    print("\n===== Disaster Type Classification (Cumulative) =====")
    print(classification_report(true_disaster_all, pred_disaster_all, labels=disaster_encoder.classes_))
    print(f"🎯 Overall Disaster Type Accuracy: {accuracy_score(true_disaster_all, pred_disaster_all):.4f}")

    print("\n===== Humanitarian Type Classification (Cumulative) =====")
    print(classification_report(true_human_all, pred_human_all, labels=human_encoder.classes_))
    print(f"🎯 Overall Humanitarian Type Accuracy: {accuracy_score(true_human_all, pred_human_all):.4f}")


✅ Loaded: 28812 train, 4194 dev, 8161 test samples.

Disaster types: ['Cyclone', 'Earthquake', 'Flood', 'Hurricane', 'Wildfire']
Humanitarian types: ['caution_and_advice', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'missing_or_found_people', 'not_humanitarian', 'other_relevant_information', 'requests_or_urgent_needs', 'rescue_volunteering_or_donation_effort', 'sympathy_and_support']
⚙️ Loading Qwen/Qwen2.5-7B-Instruct with memory optimization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model loaded successfully with optimized memory settings.


In [7]:
# =============================
# STEP 6: Batched Inference Runner
# =============================
# CSV on Drive that accumulates predictions across runs; run_inference_and_save
# reads it first and skips tweets whose text is already present.
save_path = "/content/drive/MyDrive/humAID_Qwen_Prompt_Results.csv"

# Run inference on up to 200 test samples that are not yet in the results CSV
run_inference_and_save(generator, test_df, disaster_encoder, human_encoder, save_path, n_samples=200)

# Later, you can run more in new cells:
# run_inference_and_save(generator, test_df, disaster_encoder, human_encoder, save_path, n_samples=50)

📂 Found existing results: 245 samples already processed.
🧪 Running inference on 200 new samples...

---RAW RESPONSE---
 
You are an expert disaster response analyst.

Task 1: Identify the type of disaster in the tweet.
Task 2: Identify the humanitarian category of the tweet.

Here are the possible disaster types:
Cyclone, Earthquake, Flood, Hurricane, Wildfire

Here are the possible humanitarian categories:
caution_and_advice, displaced_people_and_evacuations, infrastructure_and_utility_damage, injured_or_dead_people, missing_or_found_people, not_humanitarian, other_relevant_information, requests_or_urgent_needs, rescue_volunteering_or_donation_effort, sympathy_and_support

Tweet: ""Houston after #Harvey Sadly, at least 18 people have died""

Give your answer strictly in the following JSON format:
{"disaster_type": "...", "humanitarian_category": "..."}
{"disaster_type": "Flood", "humanitarian_category": "injured_or_dead_people"}

---RAW RESPONSE---
 
You are an expert disaster respons