In [None]:
# 1. Install kagglehub (if not already installed)
# NOTE: the leading "!" is notebook shell syntax; this line is not valid in a plain .py script.
!pip install kagglehub


import kagglehub

# Download latest version
# The value returned by dataset_download is bound to `path` and printed below.
path = kagglehub.dataset_download("mikolajbabula/disaster-images-dataset-cnn-model")

# Show the returned location so it can be copied into later cells.
print("Path to dataset files:", path)

In [None]:
# Install gTTS, which the audio-generation cell that follows imports (`from gtts import gTTS`).
# NOTE: the leading "!" is notebook shell syntax; this line is not valid in a plain .py script.
!pip install gTTS

In [None]:
# save this as generate_hazard_audios.py
# Reads hazard descriptions from a CSV and writes one MP3 per non-empty row.
import os
import pandas as pd
from gtts import gTTS   # pip install gTTS

# Path to your CSV
CSV_PATH = "labels.csv"   # update if needed
OUTPUT_DIR = "hazard_audios"
MAX_ROWS = 10000          # only the first 10,000 rows are considered
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the CSV
df = pd.read_csv(CSV_PATH)

generated = 0
# head() bounds the loop up front, so a skipped row can never bypass the limit.
for i, row in df.head(MAX_ROWS).iterrows():
    text = row.get("text")            # column name holding hazard description
    # Empty CSV cells arrive as NaN; str(NaN) would otherwise be spoken as "nan".
    if pd.isna(text) or not str(text).strip():
        continue
    tts = gTTS(str(text), lang="en")  # change lang if needed
    out_file = os.path.join(OUTPUT_DIR, f"hazard_{i}.mp3")
    tts.save(out_file)
    generated += 1

# Report the number of files actually written, not the number of rows seen.
print(f"Generated {generated} audio files in {OUTPUT_DIR}")


In [None]:
"""
Multimodal Hazard Detection & Fusion Pipeline (Enhanced with dataset path options)
- Handles separate dataset files for text (required) and optional image, video, audio.
- Uses location (latitude/longitude) when available; extracts from text if present.
- Computes a wave-altitude proxy using monocular depth estimation for images.
- Produces combined prediction: hazard_type, urgency_label, prior_hazard_at_location (true/false), review_tag, duplicate_flag (true/false), probability (0-1).
- Clusters duplicate/similar posts and gives priority to posts with multiple user reports.

Usage examples:
    python pipeline.py --text_csv /path/to/text.csv
    # or with optional modalities:
    python pipeline.py --text_csv /path/to/text.csv --image_csv /path/to/images.csv --audio_csv /path/to/audio.csv --video_csv /path/to/video.csv

Dependencies: torch, transformers, torchvision, torchaudio, sentence-transformers, opencv-python, scikit-learn, pandas, numpy, pillow, matplotlib
"""

import os
import re
import json
import argparse
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image

from sentence_transformers import SentenceTransformer
import torchaudio
import cv2

# Matches two optionally signed decimal numbers (1-3 integer digits, fractional
# part required) separated by commas and/or whitespace. extract_latlon_from_text
# reads group 1 as latitude and group 2 as longitude. The pattern itself does
# not restrict the values to valid latitude/longitude ranges.
LATLON_REGEX = re.compile(r"([-+]?\d{1,3}\.\d+)[,\s]+([-+]?\d{1,3}\.\d+)")

# ---- Data Loading ----

def load_dataset(path: str) -> List[Dict]:
    """Load dataset records from a CSV or JSON file.

    Args:
        path: File path; the extension (case-insensitive) selects the parser.

    Returns:
        A list of dicts, one per record. Empty CSV cells are returned as
        ``None`` rather than NaN, and every record has a ``'video_frames'``
        entry (an empty list when the source had none).

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.json``, or if
            a JSON file does not contain a list of objects.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == '.csv':
        df = pd.read_csv(path)
        # pandas marks empty cells with NaN, which is truthy and is not None;
        # normalise to None so `r.get(key)` / `is None` checks behave downstream.
        records = [
            {key: (None if pd.isna(value) else value) for key, value in row.items()}
            for row in df.to_dict(orient='records')
        ]
    elif ext == '.json':
        with open(path, 'r', encoding='utf-8') as f:
            records = json.load(f)
        if not isinstance(records, list) or not all(isinstance(r, dict) for r in records):
            raise ValueError("JSON dataset must be a list of record objects.")
    else:
        raise ValueError("Unsupported file type. Use CSV or JSON.")
    for r in records:
        # Covers both a missing key and an explicit empty/None value.
        if r.get('video_frames') is None:
            r['video_frames'] = []
    return records

class MultimodalDataset(Dataset):
    """Dataset that turns raw records into per-modality features.

    Each item is a dict with a text embedding, an image (transformed if a
    transform was given), a 64-value audio feature vector, and the record's
    location / label / user metadata. Missing or unreadable media are replaced
    by zero tensors/arrays so that every item has the same keys.
    """

    def __init__(self, records: List[Dict], transform=None, text_embedder=None):
        # records: list of dicts as produced by load_dataset
        # transform: optional callable applied to the loaded PIL image
        # text_embedder: optional object exposing encode() and
        #                get_sentence_embedding_dimension()
        self.records = records
        self.transform = transform
        self.text_embedder = text_embedder

    def __len__(self):
        return len(self.records)

    @staticmethod
    def _is_nonempty_str(value) -> bool:
        """Return True only for a non-empty ``str`` (rejects None, NaN, numbers)."""
        return isinstance(value, str) and bool(value)

    def _load_image(self, path):
        """Load ``path`` as RGB and apply the transform; zeros(3,224,224) on any failure."""
        try:
            # The context manager closes the underlying file once the pixels are copied.
            with Image.open(path) as handle:
                img = handle.convert('RGB')
            if self.transform:
                img = self.transform(img)
            return img
        except Exception:
            # Deliberate best-effort: an unreadable image must not abort the run.
            return torch.zeros(3,224,224)

    def _embed_text(self, text):
        """Embed ``text``; return a zero vector when text or embedder is missing.

        The zero vector has the embedder's dimension, or 768 when there is no
        embedder.
        """
        # Empty CSV cells arrive as float NaN, which is truthy; treat as no text.
        if text is None or (isinstance(text, float) and text != text):
            text = ''
        elif not isinstance(text, str):
            text = str(text)
        if not text or not self.text_embedder:
            return np.zeros(self.text_embedder.get_sentence_embedding_dimension() if self.text_embedder else 768)
        return self.text_embedder.encode(text)

    def _load_audio(self, path):
        """Return a 64-value log-mel summary of the audio file; zeros(64) on any failure."""
        try:
            waveform, sr = torchaudio.load(path)
            # Average the channels down to one.
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
            if sr != 16000:
                resampler = torchaudio.transforms.Resample(sr, 16000)
                waveform = resampler(waveform)
            mel = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64)(waveform)
            log_mel = torch.log1p(mel)
            # Mean over the last axis leaves one value per mel band.
            return log_mel.mean(dim=-1).squeeze().numpy()
        except Exception:
            # Deliberate best-effort: an unreadable clip must not abort the run.
            return np.zeros(64)

    def __getitem__(self, idx):
        r = self.records[idx]
        item = {}
        item['text_embedding'] = self._embed_text(r.get('text',''))
        image_path = r.get('image_path')
        if self._is_nonempty_str(image_path):
            item['image'] = self._load_image(image_path)
        else:
            item['image'] = torch.zeros(3,224,224)
        item['video_frame_paths'] = r.get('video_frames', [])
        audio_path = r.get('audio_path')
        if self._is_nonempty_str(audio_path):
            item['audio_embedding'] = self._load_audio(audio_path)
        else:
            item['audio_embedding'] = np.zeros(64)
        item['lat'] = r.get('lat', None)
        item['lon'] = r.get('lon', None)
        item['label'] = r.get('label', None)
        item['user_id'] = r.get('user_id', None)
        item['raw_record'] = r
        return item

class ImageEmbeddingModel(nn.Module):
    """ImageNet-pretrained ResNet-18 used as a fixed image feature extractor.

    The final fully connected layer is replaced by ``nn.Identity``, so
    ``forward`` returns the backbone's pooled features instead of class logits.
    """

    def __init__(self, device='cpu'):
        super().__init__()
        self.device = device
        # Newer torchvision deprecates `pretrained=True` in favour of the
        # weights enum; fall back to the old flag where the enum is absent.
        weights_enum = getattr(models, 'ResNet18_Weights', None)
        if weights_enum is not None:
            self.backbone = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.backbone = models.resnet18(pretrained=True)
        self.backbone.fc = nn.Identity()
        self.to(device)

    def forward(self, x):
        """Return backbone features for the image batch ``x``."""
        return self.backbone(x)

def extract_latlon_from_text(text: str) -> Tuple[float,float]:
    """Return the first plausible ``(lat, lon)`` pair found in ``text``.

    Candidates are the matches of ``LATLON_REGEX``. A candidate is accepted
    only if its latitude lies in [-90, 90] and its longitude in [-180, 180],
    so unrelated decimal pairs such as "234.56, 78.9" are skipped.

    Returns:
        ``(lat, lon)`` as floats, or ``(None, None)`` when ``text`` is empty,
        is not a string, or holds no valid pair.
    """
    # Non-string input (e.g. NaN from an empty CSV cell) cannot be searched.
    if not isinstance(text, str) or not text:
        return None, None
    for m in LATLON_REGEX.finditer(text):
        # The pattern only admits well-formed decimals, so float() cannot fail.
        lat, lon = float(m.group(1)), float(m.group(2))
        if -90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0:
            return lat, lon
    return None, None

# Holds the MiDaS model and its input transform after the first successful load.
_MIDAS_CACHE = {}


def _load_midas():
    """Load MiDaS_small and its matching transform once; reuse them afterwards."""
    if 'model' not in _MIDAS_CACHE:
        import torch.hub
        model = torch.hub.load('intel-isl/MiDaS', 'MiDaS_small')
        model.eval()
        transform = torch.hub.load('intel-isl/MiDaS', 'transforms').small_transform
        # Store both only after both loads succeeded, so a failure leaves no partial state.
        _MIDAS_CACHE['transform'] = transform
        _MIDAS_CACHE['model'] = model
    return _MIDAS_CACHE['model'], _MIDAS_CACHE['transform']


def compute_wave_altitude_from_image(image_path: str) -> float:
    """Return a wave-altitude proxy: the standard deviation of the MiDaS depth map.

    Args:
        image_path: Path to an image file.

    Returns:
        ``float(depth.std())`` of the predicted map, or ``0.0`` if the image
        cannot be read or the model cannot be loaded or run.
    """
    try:
        # Read the image first so an unreadable file fails before the model loads.
        with Image.open(image_path) as handle:
            rgb = np.asarray(handle.convert('RGB'))
        midas, transform = _load_midas()
        # The MiDaS transform takes an HxWx3 RGB array and already returns a
        # batched tensor, so no extra unsqueeze is applied.
        input_batch = transform(rgb)
        with torch.no_grad():
            prediction = midas(input_batch)
        depth = prediction.squeeze().cpu().numpy()
        return float(depth.std())
    except Exception:
        # Deliberate best-effort: callers treat 0.0 as "no estimate".
        return 0.0

def train_text_model(text_embeddings: np.ndarray, labels: np.ndarray, num_classes: int):
    """Fit and return a logistic-regression classifier on the text embeddings.

    ``num_classes`` is accepted for the caller's benefit but is not read here.
    """
    from sklearn.linear_model import LogisticRegression
    # fit() hands back the estimator itself, so it can be returned directly.
    return LogisticRegression(max_iter=1000).fit(text_embeddings, labels)

def train_image_model(image_features: np.ndarray, labels: np.ndarray):
    """Fit and return a 100-tree random forest on the image feature matrix."""
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=100)
    # fit() hands back the estimator itself.
    return forest.fit(image_features, labels)

def train_audio_model(audio_features: np.ndarray, labels: np.ndarray):
    """Fit and return a 100-tree random forest on the audio feature matrix."""
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=100)
    # fit() hands back the estimator itself.
    return forest.fit(audio_features, labels)

def combine_models_stack(stack_X: np.ndarray, labels: np.ndarray):
    """Fit and return the logistic-regression meta-model on the stacked branch outputs."""
    from sklearn.linear_model import LogisticRegression
    # fit() hands back the estimator itself, so it can be returned directly.
    return LogisticRegression(max_iter=1000).fit(stack_X, labels)

def cluster_duplicates(embeddings: np.ndarray, eps=0.5, min_samples=2):
    """Standardise the embeddings and cluster them with cosine-metric DBSCAN.

    Returns the array produced by ``DBSCAN.fit_predict`` (one cluster id per row).
    """
    standardized = StandardScaler().fit_transform(embeddings)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    return clusterer.fit_predict(standardized)

URGENCY_THRESHOLDS = {'danger': 0.8, 'review': 0.5}


def urgency_label_from_prob(p: float) -> str:
    """Translate a probability into one of three urgency labels.

    Thresholds are read from ``URGENCY_THRESHOLDS`` and checked from the
    highest tier down; the first tier whose threshold ``p`` reaches wins.
    """
    tiers = (
        ('danger', 'danger/urgent action needed'),
        ('review', 'urgent review needed'),
    )
    for tier_key, tier_label in tiers:
        if p >= URGENCY_THRESHOLDS[tier_key]:
            return tier_label
    return 'low urgency'

def _is_missing(value) -> bool:
    """Return True for ``None`` and for float NaN (how pandas marks an empty CSV cell)."""
    return value is None or (isinstance(value, float) and value != value)


def _as_coord(value):
    """Coerce a latitude/longitude field to ``float``; ``None`` if missing or unparseable."""
    if _is_missing(value):
        return None
    try:
        coord = float(value)
    except (TypeError, ValueError):
        return None
    return None if coord != coord else coord


def build_pipeline(text_csv: str,
                   image_csv: str = None,
                   audio_csv: str = None,
                   video_csv: str = None,
                   text_model=None,
                   image_model=None,
                   audio_model=None,
                   meta_model=None,
                   text_embedder_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
                   device='cpu') -> List[Dict]:
    """Extract features, classify per modality, fuse, cluster and report per record.

    Args:
        text_csv: Path to the main dataset (CSV or JSON) read by ``load_dataset``.
        image_csv, audio_csv, video_csv: Accepted for interface compatibility;
            this function does not read them. Image and audio paths come from
            the ``image_path`` / ``audio_path`` fields of each record.
        text_model, image_model, audio_model, meta_model: Optional fitted
            classifiers exposing ``predict_proba`` (and ``predict`` for the
            meta-model). Any that is ``None`` is trained here from the
            records' ``label`` field.
        text_embedder_name: SentenceTransformer model name.
        device: Torch device for the image backbone.

    Returns:
        One output dict per record (see the keys built at the end of the
        function); an empty list when the dataset has no records.

    Raises:
        ValueError: If a model is not supplied and cannot be trained because
            at least one record has no label.

    Note:
        Models trained here are fitted and applied on the same records, so
        the reported probabilities are in-sample.
    """
    records = load_dataset(text_csv)
    if not records:
        return []

    text_embedder = SentenceTransformer(text_embedder_name)
    img_transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
    ])
    img_backbone = ImageEmbeddingModel(device=device)
    img_backbone.eval()

    # ---- Feature extraction ----
    ds = MultimodalDataset(records, transform=img_transform, text_embedder=text_embedder)
    text_feats, image_feats, audio_feats, user_ids = [], [], [], []
    for i in range(len(ds)):
        item = ds[i]
        text_feats.append(item['text_embedding'])
        with torch.no_grad():
            img_tensor = item['image'].unsqueeze(0).to(device)
            img_feat = img_backbone(img_tensor).cpu().numpy().squeeze()
        image_feats.append(img_feat)
        audio_feats.append(item['audio_embedding'])
        user_ids.append(item['user_id'])

    text_feats = np.vstack(text_feats)
    image_feats = np.vstack(image_feats)
    audio_feats = np.vstack(audio_feats)

    # ---- Per-modality models ----
    raw_labels = [r.get('label') for r in records]
    # NaN counts as missing too: that is what an empty CSV cell turns into.
    label_present = not any(_is_missing(label) for label in raw_labels)
    labels = np.array(raw_labels)
    if not label_present:
        untrained = [name for name, model in (('text_model', text_model),
                                              ('image_model', image_model),
                                              ('audio_model', audio_model),
                                              ('meta_model', meta_model))
                     if model is None]
        if untrained:
            raise ValueError(
                "Cannot train " + ", ".join(untrained) + ": at least one record has no "
                "'label'. Supply labels for every record or pass pre-trained models."
            )
    if text_model is None:
        text_model = train_text_model(text_feats, labels, num_classes=len(np.unique(labels)))
    if image_model is None:
        image_model = train_image_model(image_feats, labels)
    if audio_model is None:
        audio_model = train_audio_model(audio_feats, labels)

    text_probs = text_model.predict_proba(text_feats)
    image_probs = image_model.predict_proba(image_feats)
    audio_probs = audio_model.predict_proba(audio_feats)

    # ---- Fusion: the meta-model sees each branch's top class probability ----
    combined_stack = np.vstack([text_probs.max(axis=1), image_probs.max(axis=1), audio_probs.max(axis=1)]).T
    if meta_model is None:
        meta_model = combine_models_stack(combined_stack, labels)

    meta_proba = meta_model.predict_proba(combined_stack)
    # Label 1 is treated as "hazard" throughout this function, so read that
    # class's column when it exists; otherwise keep the second column.
    meta_classes = list(getattr(meta_model, 'classes_', []))
    positive_col = meta_classes.index(1) if 1 in meta_classes else 1
    final_probs = meta_proba[:, positive_col]
    pred_labels = meta_model.predict(combined_stack)  # one batched call, not one per record

    # ---- Duplicate clustering ----
    concat_emb = np.hstack([text_feats, image_feats[:, :128], audio_feats])
    cluster_ids = cluster_duplicates(concat_emb)
    cluster_members = {}
    for idx, cid in enumerate(cluster_ids):
        cluster_members.setdefault(int(cid), []).append(idx)

    # ---- Locations: explicit fields first, else coordinates found in the text ----
    locs = []
    for r in records:
        lat, lon = _as_coord(r.get('lat')), _as_coord(r.get('lon'))
        text = r.get('text')
        if (lat is None or lon is None) and isinstance(text, str) and text:
            ext_lat, ext_lon = extract_latlon_from_text(text)
            if ext_lat is not None:
                lat, lon = ext_lat, ext_lon
        locs.append((lat, lon))

    # ---- Prior hazard within 0.01 degrees (Euclidean on lat/lon) ----
    coords = np.array([[np.nan if lat is None else lat, np.nan if lon is None else lon]
                       for lat, lon in locs], dtype=float)
    hazard_mask = np.array([bool(r.get('label') == 1) for r in records], dtype=bool)
    prior_reported = []
    for i, (lat, lon) in enumerate(locs):
        flag = False
        if lat is not None and lon is not None:
            # Plain sum keeps NaN for records without coordinates; nansum would
            # collapse them to distance 0 and report them as neighbours.
            sq_dists = np.sum((coords - np.array([lat, lon])) ** 2, axis=1)
            with np.errstate(invalid='ignore'):
                nearby = np.sqrt(sq_dists) < 0.01  # NaN compares False
            nearby[i] = False  # a record is not its own prior report
            flag = bool(np.any(nearby & hazard_mask))
        prior_reported.append(flag)

    # ---- Assemble outputs ----
    outputs = []
    for i, r in enumerate(records):
        prob = float(final_probs[i])
        urgency = urgency_label_from_prob(prob)

        cid = int(cluster_ids[i])
        # -1 is DBSCAN's noise label: such a record forms a group of its own.
        same_cluster_idxs = cluster_members[cid] if cid != -1 else [i]
        duplicate_flag = cid != -1 and len(same_cluster_idxs) > 1
        unique_users = {user_ids[idx] for idx in same_cluster_idxs
                        if not _is_missing(user_ids[idx])}
        high_priority = len(unique_users) > 1

        lat, lon = locs[i]

        image_path = r.get('image_path')
        if isinstance(image_path, str) and image_path:
            wave_alt = compute_wave_altitude_from_image(image_path)
        else:
            wave_alt = None

        declared_type = r.get('hazard_type')
        if not _is_missing(declared_type) and declared_type:
            hazard_type = declared_type
        else:
            hazard_type = "hazard" if pred_labels[i] == 1 else "no_hazard"

        if prob >= URGENCY_THRESHOLDS['danger']:
            review_tag = 'confirm'
        elif prob >= URGENCY_THRESHOLDS['review']:
            review_tag = 'review_needed'
        else:
            review_tag = 'low_confidence'

        outputs.append({
            'record_id': r.get('id', i),
            'hazard_type': hazard_type,
            'urgency_label': urgency,
            'prior_hazard_at_location': prior_reported[i],
            'review_tag': review_tag,
            'duplicate_flag': bool(duplicate_flag),
            'probability': prob,
            'lat': lat,
            'lon': lon,
            'wave_altitude_proxy': wave_alt,
            'cluster_id': cid,
            'high_priority_due_to_multiple_users': bool(high_priority)
        })
    return outputs

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Multimodal Hazard Detection Pipeline")
    # help= carries a description; the concrete paths are supplied in the
    # argument list below.
    parser.add_argument('--text_csv', required=True, help='Path to text CSV/JSON dataset (required)')
    parser.add_argument('--image_csv', help='Path to image dataset (optional)')
    parser.add_argument('--audio_csv', help='Path to audio dataset (optional)')
    parser.add_argument('--video_csv', help='Path to video CSV dataset (optional)')
    # Arguments are passed explicitly rather than read from sys.argv --
    # presumably because this cell runs inside a notebook (TODO confirm).
    args = parser.parse_args([
        "--text_csv", "/content/hazard_dataset.csv",
        "--image_csv", "/root/.cache/kagglehub/datasets/mikolajbabula/disaster-images-dataset-cnn-model/versions/1",
        "--audio_csv", "/content/hazard_audios",
        "--video_csv", "/content/path/to/video.csv",
    ])

    outputs = build_pipeline(args.text_csv, args.image_csv, args.audio_csv, args.video_csv, device='cpu')
    print(json.dumps(outputs, indent=2))

    # Sample output from a previous run. It is kept as a comment because the
    # JSON literals `false` and `null` are undefined names in Python and
    # raised NameError when this text was left as code.
    #
    # [
    #   {
    #     "record_id": 0,
    #     "hazard_type": "no_hazard",
    #     "urgency_label": "low urgency",
    #     "prior_hazard_at_location": false,
    #     "review_tag": "low_confidence",
    #     "duplicate_flag": true,
    #     "probability": 0.3321818695776968,
    #     "lat": null,
    #     "lon": null,
    #     "wave_altitude_proxy": null,
    #     "cluster_id": 0,
    #     "high_priority_due_to_multiple_users": false
    #   }
    # ]


In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def evaluate_pipeline(dataset_path,
                      text_model,
                      image_model,
                      audio_model,
                      meta_model,
                      device='cpu',
                      text_embedder_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """
    Computes overall accuracy and detailed metrics for the fused model.

    Args:
        dataset_path: CSV/JSON dataset read by ``load_dataset``; every record
            must carry a ``label``.
        text_model, image_model, audio_model: Fitted classifiers exposing
            ``predict_proba``.
        meta_model: Fitted fusion classifier exposing ``predict``.
        device: Torch device for the image backbone.
        text_embedder_name: SentenceTransformer model name. Use the same model
            the branch classifiers were trained with.

    Returns:
        The accuracy of the meta-model's predictions. The accuracy and a
        classification report are also printed.

    Raises:
        ValueError: If any record has no label.
    """
    # Same feature extraction steps as build_pipeline.
    records = load_dataset(dataset_path)

    text_embedder = SentenceTransformer(text_embedder_name)
    img_transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
    ])
    img_backbone = ImageEmbeddingModel(device=device).eval()
    ds = MultimodalDataset(records, transform=img_transform,
                           text_embedder=text_embedder)

    text_feats, image_feats, audio_feats, y_true = [], [], [], []
    for i in range(len(ds)):
        item = ds[i]
        text_feats.append(item['text_embedding'])
        with torch.no_grad():
            img_feat = img_backbone(item['image'].unsqueeze(0).to(device)).cpu().numpy().squeeze()
        image_feats.append(img_feat)
        audio_feats.append(item['audio_embedding'])
        y_true.append(item['label'])

    # Fail early with a clear message; the metric functions would otherwise
    # reject None/NaN labels with a less helpful error.
    if any(y is None or (isinstance(y, float) and y != y) for y in y_true):
        raise ValueError("Evaluation requires a 'label' for every record.")

    text_feats  = np.vstack(text_feats)
    image_feats = np.vstack(image_feats)
    audio_feats = np.vstack(audio_feats)
    y_true      = np.array(y_true)

    # probability outputs from each branch
    text_probs  = text_model.predict_proba(text_feats)
    image_probs = image_model.predict_proba(image_feats)
    audio_probs = audio_model.predict_proba(audio_feats)

    # stack highest class probabilities (one column per branch)
    stack_X = np.vstack([
        text_probs.max(axis=1),
        image_probs.max(axis=1),
        audio_probs.max(axis=1)
    ]).T

    y_pred = meta_model.predict(stack_X)

    acc = accuracy_score(y_true, y_pred)
    print(f"\nOverall fused model accuracy: {acc:.4f}\n")
    print(classification_report(y_true, y_pred))
    return acc
