In [None]:
import copy
import math
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


# Run on CUDA when it is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed torch's global RNG before any model is built.
torch.manual_seed(64)

# Pretrained tokenizer and base BERT encoder, loaded from the same checkpoint name.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = transformers.BertModel.from_pretrained('bert-base-uncased')

# ------------------ UTILITY FUNCTIONS ------------------

def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Return the great-circle distance between two points given in degrees.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
        radius: Radius of the sphere; the result is expressed in the same unit.
            The default, 3959, is approximately the Earth's mean radius in miles.

    Returns:
        float: Haversine distance, in the unit of ``radius``.
    """
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))
    dphi, dlam = phi2 - phi1, lam2 - lam1
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

def get_box_area(lat1, lon1, lat2, lon2):
    """Approximate the area of a lat/lon bounding box as width times height.

    The width is the haversine distance between the two longitudes measured
    along latitude ``lat1``; the height is the haversine distance between the
    two latitudes measured along longitude ``lon1``. The result is in the
    square of whatever distance unit ``haversine`` returns.
    """
    east_west = haversine(lat1, lon1, lat1, lon2)
    north_south = haversine(lat1, lon1, lat2, lon1)
    area = east_west * north_south
    return area

def get_target_list(target_list=None):
    """Return the target column names to predict.

    Args:
        target_list: Optional sequence of target names. When it is ``None`` or
            empty, the built-in default list of 11 targets is used instead.

    Returns:
        The caller's ``target_list`` unchanged when it is non-empty, otherwise
        a freshly built list of the default target names.
    """
    # `None` replaces the former mutable `[]` default; both are falsy, so
    # existing callers (including those passing `[]`) behave the same.
    if target_list:
        return target_list
    return [
        'homeOwnersInsurance', 'floodInsurance', 'destroyed', 'floodDamage', 'roofDamage',
        'tsaEligible', 'tsaCheckedIn', 'rentalAssistanceEligible', 'repairAssistanceEligible',
        'replacementAssistanceEligible', 'personalPropertyEligible'
    ]

# ------------------ DATASET CLASS ------------------

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of per-text label vectors; its length
            defines the dataset length.
        tokenizer: Object exposing an ``encode_plus`` method with the keyword
            arguments used in ``__getitem__`` (e.g. a Hugging Face tokenizer).
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for item ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also collapse the
        # sequence dimension when max_len == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

# ------------------ MODEL ------------------

class BERTDeepMultiHeadClassifier(nn.Module):
    """BERT encoder followed by one small MLP head per binary target.

    Args:
        num_targets: Number of independent binary outputs (one head each).
        hidden_dim: Width of the hidden layer inside every head.
        bert: Optional encoder module to use as-is. It must expose
            ``config.hidden_size`` and return an object with ``pooler_output``
            when called with ``input_ids`` and ``attention_mask``. When
            omitted, a private deep copy of the module-level ``model_bert``
            is used.
    """

    def __init__(self, num_targets=11, hidden_dim=256, bert=None):
        super().__init__()
        # Each instance gets its own encoder weights. Assigning the global
        # `model_bert` directly would make every instance share (and keep
        # fine-tuning) the same module, so models built one after another,
        # e.g. one per K-fold split, would not start from the pretrained
        # checkpoint.
        self.bert = copy.deepcopy(model_bert) if bert is None else bert
        self.drop = nn.Dropout(0.3)

        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(self.bert.config.hidden_size, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, 1)
            ) for _ in range(num_targets)
        ])

    def forward(self, input_ids, attention_mask):
        """Return per-target probabilities, one column per head."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = self.drop(outputs.pooler_output)
        # Each head emits one logit per row; sigmoid turns it into a probability
        # and the columns are concatenated in head order.
        return torch.cat([torch.sigmoid(head(x)) for head in self.heads], dim=1)

# ------------------ TRAINING LOOP ------------------

def train_negative_only_model_kfold(grouped_tweets, target_list=None, max_len=512, batch_size=24,
                                    num_epochs=10, num_folds=5,
                                    save_dir='Training_results/Laura_NegativeOnly_KFold/'):
    """Fit one multi-head BERT classifier per K-fold split using all-zero labels.

    Every group in ``grouped_tweets`` becomes one document (its ``'text'``
    values joined with newlines) that is labelled negative for every target.

    Args:
        grouped_tweets: Iterable of ``(name, group)`` pairs where ``int(name)``
            is valid and ``group['text']`` supports ``.to_list()`` (e.g. a
            pandas ``groupby`` object).
        target_list: Target names; ``None`` or empty selects the default list
            from ``get_target_list``.
        max_len: Token length handed to ``CustomDataset``.
        batch_size: Batch size for the training and validation loaders.
        num_epochs: Number of epochs trained per fold.
        num_folds: Number of ``KFold`` splits.
        save_dir: Directory the per-fold checkpoints are written to; created
            if it does not exist.

    Returns:
        list[str]: One checkpoint path per fold, in fold order.

    Note:
        Every true label is 0, so the per-target F1 (``zero_division=0``)
        evaluates to 0 each epoch. The "best F1" branch therefore does not
        fire, and each fold's final-epoch weights are saved by the fallback
        at the end of the fold.
    """
    target_list = get_target_list(target_list)
    num_targets = len(target_list)

    # One document per group: that group's tweets joined with newlines.
    tweet_dict = {int(name): group['text'] for name, group in grouped_tweets}
    texts = ['\n'.join(group.to_list()) for group in tweet_dict.values()]

    # All-zero labels for the negative-only setup. This list must keep its own
    # name: reusing it for the per-batch label tensor would make later folds
    # index into a batch tensor instead of the full label list.
    all_zero_label = np.zeros(num_targets, dtype=np.float32)
    doc_labels = [all_zero_label.copy() for _ in texts]

    os.makedirs(save_dir, exist_ok=True)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    bce = nn.BCELoss()
    best_model_paths = []

    def summed_target_loss(probs, targets):
        # Sum of the mean binary cross-entropy of each target column.
        return sum(bce(probs[:, i], targets[:, i]) for i in range(num_targets))

    for fold, (train_idx, val_idx) in enumerate(kf.split(texts)):
        print(f"\nFold {fold + 1}/{num_folds}")

        X_train = [texts[i] for i in train_idx]
        y_train = [doc_labels[i] for i in train_idx]
        X_val = [texts[i] for i in val_idx]
        y_val = [doc_labels[i] for i in val_idx]

        train_dataset = CustomDataset(X_train, y_train, tokenizer, max_len)
        val_dataset = CustomDataset(X_val, y_val, tokenizer, max_len)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = BERTDeepMultiHeadClassifier(num_targets=num_targets).to(device)
        optimizer = optim.Adam(model.parameters(), lr=1e-5)

        best_f1 = 0.0
        best_model_path = os.path.join(save_dir, f"bert_model_laura_neg_fold{fold + 1}.pth")

        for epoch in range(num_epochs):
            # ---- training pass ----
            model.train()
            train_loss = 0.0
            for batch in tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1} Training"):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                batch_labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = summed_target_loss(outputs, batch_labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            # ---- validation pass ----
            model.eval()
            val_loss = 0.0
            epoch_preds, epoch_targets = [], []
            with torch.no_grad():
                for batch in tqdm(val_loader, desc=f"Fold {fold+1} Epoch {epoch+1} Validation"):
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    batch_labels = batch['labels'].to(device)

                    outputs = model(input_ids, attention_mask)
                    val_loss += summed_target_loss(outputs, batch_labels).item()

                    epoch_preds.append(outputs.cpu().numpy())
                    epoch_targets.append(batch_labels.cpu().numpy())

            epoch_preds = np.vstack(epoch_preds)
            epoch_targets = np.vstack(epoch_targets)
            # Probabilities above 0.5 count as positive predictions.
            f1 = [
                f1_score(epoch_targets[:, i], epoch_preds[:, i] > 0.5, zero_division=0)
                for i in range(num_targets)
            ]
            mean_f1 = np.mean(f1)

            print(f"Epoch {epoch+1}: Train Loss={train_loss/len(train_loader):.4f}, Val Loss={val_loss/len(val_loader):.4f}, Mean F1={mean_f1:.4f}")

            if mean_f1 > best_f1:
                best_f1 = mean_f1
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved best model for Fold {fold+1} with F1={best_f1:.4f}")

        # Save the final model if F1 never improved, so the fold always has a checkpoint.
        if best_f1 == 0.0:
            torch.save(model.state_dict(), best_model_path)
            print(f"⚠️ F1 never improved for Fold {fold+1}. Final model saved anyway.")

        best_model_paths.append(best_model_path)

    return best_model_paths


In [None]:
# ------------------ EXECUTION ------------------

size_threshold = 80


def _reorder_bbox(raw_bbox):
    """Parse a comma-separated bbox string and return its values reordered as [1, 0, 3, 2].

    Parentheses around each token are stripped before conversion to float.
    Presumably this turns (lon, lat, lon, lat) into the (lat, lon, lat, lon)
    order that ``get_box_area`` takes -- confirm against the CSV.
    """
    values = [float(token.strip('(').strip(')')) for token in raw_bbox.split(', ')]
    return [values[pos] for pos in (1, 0, 3, 2)]


tweets2 = pd.read_csv('organized_with_zipcode.csv')
tweets_laura = tweets2[tweets2.storm_name == 'laura']
bboxes_useful = tweets_laura.place_bbox.apply(_reorder_bbox)
bbu_areas = bboxes_useful.apply(lambda corners: get_box_area(*corners))

# Keep tweets that carry a 'Point' geo or whose bounding-box area is below the threshold.
_has_point_geo = tweets_laura.geo.apply(lambda geo: 'Point' in str(geo))
tweets_laura = tweets_laura.loc[_has_point_geo | (bbu_areas < size_threshold), :]
tweet_grouped_laura = tweets_laura.groupby('zip_code')

# Train model on negative-only data
best_model_path = train_negative_only_model_kfold(
    tweet_grouped_laura,
    max_len=512,
    batch_size=24,
    num_epochs=100,
)