### Problem
Predict multiple disaster impact categories (e.g., damage levels, eligibility) based on tweet text and geospatial metadata using a feedforward neural network trained on grouped tweets and zip-code-level labels.

### Objective
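Train feedforward models on geospatially filtered tweets (a tweet is kept only when it carries a point geotag or its place bounding box is small enough). The multi-label problem is handled as independent binary classification tasks: one model is trained per target label, using tweets aggregated by zip code and the matching zip-code-level target data.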



### Imports and Setup

In [None]:
import os
import re
import math
import json
import torch
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import preprocessor
import trainer

# Show each matching warning only the first time it occurs.
warnings.filterwarnings(action='once')

# Use the MPS backend when PyTorch reports it as available; otherwise stay on the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Fixed seed for PyTorch's default random generator.
torch.manual_seed(64)
print(f"Using device: {device}")


### Haversine and Bounding Box Area

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The haversine distance on a sphere of radius 3959 miles.
    """
    R = 3959  # miles
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1)*math.cos(lat2)*math.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points; sqrt(1 - a) would then raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * (2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)))

def get_box_area(lat1, lon1, lat2, lon2):
    """Approximate the area of a latitude/longitude box as width times height.

    The width is the haversine distance along the `lat1` edge (from `lon1`
    to `lon2`); the height is the haversine distance along the `lon1` edge
    (from `lat1` to `lat2`). The result is in the square of `haversine`'s
    distance unit.
    """
    width = haversine(lat1, lon1, lat1, lon2)
    height = haversine(lat1, lon1, lat2, lon1)
    return width * height


### Model Architectures

In [None]:
class SimplerNet(nn.Module):
    """Feedforward net: Linear -> BatchNorm1d -> ReLU stages, then a linear head.

    Args:
        in_out_degrees: Sequence of layer widths. Each consecutive pair gives
            the input and output size of one hidden Linear layer.
        output_size: Number of units produced by the final Linear layer.
        sigmoid: When True, `forward` applies a sigmoid to the head output;
            when False the raw head output is returned.
    """

    def __init__(self, in_out_degrees, output_size, sigmoid=False):
        super().__init__()
        self.sigmoid = sigmoid
        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        # Walk consecutive width pairs: (w0, w1), (w1, w2), ...
        for fan_in, fan_out in zip(in_out_degrees[:-1], in_out_degrees[1:]):
            self.layers.append(nn.Linear(fan_in, fan_out))
            self.batch_norms.append(nn.BatchNorm1d(fan_out))

        self.output_layer = nn.Linear(in_out_degrees[-1], output_size)

    def forward(self, x):
        """Run `x` through every hidden stage and the output layer."""
        hidden = x
        for linear, norm in zip(self.layers, self.batch_norms):
            hidden = linear(hidden)
            hidden = norm(hidden)
            hidden = torch.relu(hidden)

        head_out = self.output_layer(hidden)
        if self.sigmoid:
            return torch.sigmoid(head_out)
        return head_out


### Data Loading and Preprocessing

In [None]:
def _parse_bbox(raw):
    """Parse a `place_bbox` string into four floats, reordered as indices (1, 0, 3, 2).

    NOTE(review): the reordering presumably turns (lon, lat, lon, lat) into the
    (lat, lon, lat, lon) order that `get_box_area` takes — confirm against the CSV.
    """
    values = [float(part.strip('()[]')) for part in raw.split(', ')]
    return [values[k] for k in (1, 0, 3, 2)]


def _filter_precise(frame, max_area):
    """Keep rows whose `geo` mentions 'Point' or whose bounding-box area is below `max_area`.

    Returns the filtered frame plus the parsed boxes and their areas, both
    computed on the unfiltered `frame`.
    """
    boxes = frame.place_bbox.apply(_parse_bbox)
    areas = boxes.apply(lambda box: get_box_area(*box))
    has_point = frame.geo.apply(lambda geo: 'Point' in str(geo))
    return frame.loc[has_point | (areas < max_area), :], boxes, areas


tweets2 = pd.read_csv('organized_with_zipcode.csv')
tweets_harvey2 = pd.read_csv('harvey_corrected.csv')
# Align the Harvey file's zip column name with the one used for grouping below.
tweets_harvey2.rename(columns={'zipcode': 'zip_code'}, inplace=True)

# Upper bound on bounding-box area for a tweet to count as precisely located.
size_threshold = 60

# Filter Harvey tweets
tweets_harvey, bboxes_useful, bbu_areas = _filter_precise(tweets_harvey2, size_threshold)
# Take an explicit copy so the column assignment below targets an independent
# frame instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
tweets_harvey = tweets_harvey.copy()
tweets_harvey['zip_code'] = tweets_harvey['zip_code'].astype(int)

# Filter other storms
tweets = tweets2[tweets2.storm_name.isin(['imelda', 'beryl'])]
tweets, bboxes_useful, bbu_areas = _filter_precise(tweets, size_threshold)

# Combined frame restricted to the columns both files share.
# NOTE(review): this is built from the unfiltered `tweets2` / `tweets_harvey2`,
# so it includes every storm in `tweets2` and does not get the integer zip-code
# cast applied to `tweets_harvey` above — confirm that is intended.
shared_columns = tweets2.columns.intersection(tweets_harvey2.columns)
tweet_grouped_everything = pd.concat([
    tweets2.loc[:, shared_columns],
    tweets_harvey2.loc[:, shared_columns],
])
tweet_grouped_everything, bboxes_useful, bbu_areas = _filter_precise(tweet_grouped_everything, size_threshold)

# Load targets
targets_beryl = pd.read_csv('targets/disaster_4798.csv')
targets_imelda = pd.read_csv('targets/disaster_4466.csv')
targets_harvey = pd.read_csv('targets/disaster_4332.csv')
targets_everything = pd.concat([targets_beryl, targets_imelda, targets_harvey])

# Group tweets and targets
tweet_grouped = tweet_grouped_everything.groupby('zip_code')
target_grouped = targets_everything.groupby('damagedZipCode')

# Preprocessing via external script
train_dl, val_dl, class_weights = preprocessor.clean(tweet_grouped, target_grouped, val_batch_size=100)


### Train All Target Models

In [None]:
histories = []
metadatas = []
in_out_degrees = [128, 32]
num_epochs = 300

# The run description depends only on settings fixed before the loop, so it is
# built once and recorded for every target.
meta = f"""Loss: BCEWithLogits; Layers={len(in_out_degrees)}, BBox < {size_threshold} mi², Optimizer=Adam"""

# One independent single-output model per target index.
for i in range(11):
    model = SimplerNet(in_out_degrees, output_size=1)
    model = model.to(device)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=1e-5,
        weight_decay=1e-5,
    )

    # NOTE(review): `weight=` rescales the loss of each batch element; if
    # `class_weights[i]` is meant as a positive-class weight, `pos_weight=`
    # may be the intended argument — confirm against preprocessor.clean.
    target_weight = class_weights[i].to(device)
    criterion = nn.BCEWithLogitsLoss(weight=target_weight)

    model, history = trainer.single_target_loop(
        model,
        optimizer,
        criterion,
        num_epochs,
        train_dl,
        val_dl,
        which_target=i,
        device=device,
        previous_loss_scale=1.007,
        epoch_percentage=0.1,
    )

    histories.append(history)
    metadatas.append(meta)


### Visualization

In [None]:
# Panel titles: every target name prefixed with the 'everything ' dataset tag.
storm_labels = ['everything ' + target_name for target_name in preprocessor.get_target_list()]

def make_canvas(num_plots):
    """Create a figure with a 4-column grid of axes large enough for `num_plots` panels.

    The figure size allots a 6 x 6 share to every grid cell.

    Returns:
        The `(fig, axes)` pair produced by `plt.subplots`.
    """
    columns = 4
    rows = math.ceil(num_plots / columns)
    return plt.subplots(nrows=rows, ncols=columns, figsize=(6 * columns, 6 * rows))

def train_val_loss_plot(axes, metrics, labels, meta, fontsize=8, xlab='', ylab='Loss'):
    """Draw train and validation loss curves, one panel per history.

    Args:
        axes: A single axes object, a sequence of axes, or an array of axes
            (anything with `flatten` is flattened first).
        metrics: Sequence of histories; each is indexed with 'train' and 'test'.
        labels: Per-panel text for the first title line.
        meta: Per-panel text for the second title line.
        fontsize: Title font size.
        xlab: X-axis label.
        ylab: Y-axis label.

    Axes beyond `len(metrics)` are left untouched.
    """
    if hasattr(axes, 'flatten'):
        flat_axes = axes.flatten()
    elif hasattr(axes, 'plot'):
        # A lone axes object cannot be sliced or iterated; wrap it.
        flat_axes = [axes]
    else:
        flat_axes = axes

    for i, (ax, history) in enumerate(zip(flat_axes, metrics)):
        ax.plot(history['train'], label='Train')
        # The 'test' series is shown under the 'Validation' legend entry.
        ax.plot(history['test'], label='Validation')
        ax.set_title(f"{labels[i]}\n{meta[i]}", fontsize=fontsize)
        ax.set_xlabel(xlab)
        ax.set_ylabel(ylab)
        ax.legend()

def train_val_f1_plot(axes, metrics, labels, meta, target_labs, fontsize=8, xlab='', ylab='F1'):
    """Draw the F1 curve of each history, one panel per history.

    Args:
        axes: A single axes object, a sequence of axes, or an array of axes
            (anything with `flatten` is flattened first).
        metrics: Sequence of histories; each is indexed with 'f1'.
        labels: Per-panel text for the first title line.
        meta: Per-panel text for the second title line.
        target_labs: Legend label passed to every `plot` call.
        fontsize: Title font size.
        xlab: X-axis label.
        ylab: Y-axis label.

    Axes beyond `len(metrics)` are left untouched.
    """
    if hasattr(axes, 'flatten'):
        flat_axes = axes.flatten()
    elif hasattr(axes, 'plot'):
        # A lone axes object cannot be sliced or iterated; wrap it.
        flat_axes = [axes]
    else:
        flat_axes = axes

    for i, (ax, history) in enumerate(zip(flat_axes, metrics)):
        ax.plot(history['f1'], label=target_labs)
        ax.set_title(f"{labels[i]}\n{meta[i]}", fontsize=fontsize)
        ax.set_xlabel(xlab)
        ax.set_ylabel(ylab)
        ax.legend()

# Loss curves: one panel per trained target model
fig, axes = make_canvas(num_plots=len(histories))
train_val_loss_plot(
    axes=axes,
    metrics=histories,
    labels=storm_labels,
    meta=metadatas,
)
plt.tight_layout()
plt.show()

# F1 curves, drawn on a fresh canvas with the same layout
fig, axes = make_canvas(num_plots=len(histories))
train_val_f1_plot(
    axes=axes,
    metrics=histories,
    labels=storm_labels,
    meta=metadatas,
    target_labs='F1',
)
plt.tight_layout()
plt.show()
