<a href="https://colab.research.google.com/github/TerrorismAnalyticsBureau/TAB-AI/blob/main/TABAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
import matplotlib.pyplot as plt
import math
import os
import chardet
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# ------------------------------ Data Preprocessing ------------------------------

# Set pyTorch local env to use segmented GPU memory
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Clear GPU cache & Set the device to use GPU
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
# Skip rows = 1 because those are the column names
X = np.array([])

# Now, read the file using the detected encoding
data = pd.read_csv('./globalterrorismdb_0718dist.csv', encoding="Windows-1252")

# Extract relevant columns (adjust indices or column names as needed)
input_columns = data.iloc[:, [1, 2, 3, 7, 11]]
input_columns = input_columns.fillna(0)

# Convert non-numeric to numeric and fill missing values
for col in input_columns.columns:
    input_columns[col] = pd.to_numeric(input_columns[col], errors='coerce')  # Convert non-numeric to NaN
input_columns = input_columns.fillna(0)  # Replace NaN with 0

# Normalize the first three columns
scaler = MinMaxScaler()
input_columns.iloc[:, :3] = scaler.fit_transform(input_columns.iloc[:, :3].astype(float))

attack_target = data.iloc[:, [28]]
group_target = data.iloc[:, [57]]
date_target = data.iloc[:, [1, 2, 3]]

# Extract unique group names
unique_groups = list(set(data['gname']))
unique_provstates = list(set(data['provstate']))
unique_cities = list(set(data['city']))

attack_encoder = LabelEncoder()
attack_encoder.fit(data['attacktype1_txt'].unique())  # Fit to unique attack types in the text column

# Encode the attack types numerically based on attacktype1 (which will be your target)
attack_target_tensor = torch.tensor(data['attacktype1'].values, dtype=torch.long)

# Set the output size based on the number of unique attack types
num_attack_types = len(attack_encoder.classes_)
num_groups = len(unique_groups)
num_provstates = len(unique_provstates)
num_cities = len(unique_cities)
num_dates = len(date_target)

# Initialize LabelEncoder and fit to the unique groups
group_encoder = LabelEncoder()
group_encoder.fit(unique_groups)

provstate_encoder = LabelEncoder()
provstate_encoder.fit(unique_provstates)

city_encoder = LabelEncoder()
city_encoder.fit(unique_cities)

# Create a dictionary to map names to their encoded IDs
group_dict = pd.Series(group_encoder.transform(unique_groups), index=unique_groups)
provstate_dict = pd.Series(provstate_encoder.transform(unique_provstates), index=unique_provstates)
city_dict = pd.Series(city_encoder.transform(unique_cities), index=unique_cities)

# Assign values to tensors for processing
input_tensor = torch.tensor(input_columns.to_numpy(), dtype=torch.float32)
attack_target_tensor = torch.tensor(attack_target.values, dtype=torch.float32)
group_target_tensor = torch.tensor(group_encoder.fit_transform(group_target.values), dtype=torch.float32)
date_target_tensor = torch.tensor(date_target.values, dtype=torch.float32)
provstate_target_tensor = torch.tensor(provstate_encoder.fit_transform(data['provstate'].values), dtype=torch.float32)
city_target_tensor = torch.tensor(city_encoder.fit_transform(data['city'].values), dtype=torch.float32)

# TESTING - PRINT DICTIONARY ITEMS
#for key, value in group_dict.items():
#  print("group: ", key, "| ID #:", value)

#for key, value in provstate_dict.items():
#  print("provstate: ", key, "| ID #:", value)

#for key, value in city_dict.items():
#  print("city: ", key, "| ID #:", value)

# Assign values to tensors for processing
X_tensor = input_tensor
Y_tensor_attack = attack_target_tensor
Y_tensor_group = group_target_tensor
Y_tensor_date = date_target_tensor
Y_tensor_provstate = provstate_target_tensor
Y_tensor_city = city_target_tensor

# Set tensors to use GPU
X_tensor = X_tensor.to(device)
Y_tensor_attack = Y_tensor_attack.to(device)
Y_tensor_group = Y_tensor_group.to(device)
Y_tensor_provstate = Y_tensor_provstate.to(device)
Y_tensor_date = Y_tensor_date.to(device)
Y_tensor_city = Y_tensor_city.to(device)

# ------------------------------ Model 1: Attack Type Prediction ------------------------------
class LSTMAttackPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMAttackPredictor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)  # Dropout after LSTM
        logits = self.fc(lstm_out[:, -1, :])  # Fully connected on the last time step
        return logits  # Return raw logits for training with CrossEntropyLoss

sequence_length = 10

# Create sequences from the input_tensor
def create_sequences(input_data, seq_length):
    sequences = []
    for i in range(len(input_data) - seq_length + 1):
        seq = input_data[i:i+seq_length]
        sequences.append(seq)
    return torch.stack(sequences)

# Assuming input_tensor is of shape (num_samples, num_features)
sequences = create_sequences(X_tensor, sequence_length)
print(sequences.shape)  # (num_sequences, sequence_length, num_features)

batch_size = 32

# A target tensor (e.g. attack_target_tensor)
dataset = TensorDataset(sequences, Y_tensor_attack[:len(sequences)])  # Adjust target size
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LSTMAttackPredictor(input_size=input_tensor.shape[1], hidden_size=128, output_size=num_attack_types)
model = model.to(device)

criterion = nn.CrossEntropyLoss()  # Assuming classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Training loop
for epoch in range(10):  # Number of epochs
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)  # Move to GPU if available

        # Forward pass
        outputs = model(batch_x)

        if batch_y.ndim > 1:  # Handle one-hot or extra dimensions
          batch_y = batch_y.argmax(dim=1)  # Convert one-hot to class indices

        # Ensure target tensor has correct data type
        batch_y = batch_y.long()

        # Compute loss
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/10], Loss: {loss.item():.4f}")

# ------------------------------ Model 2 test ------------------------------
def train_model(X_tensor, Y_tensor, num_classes, sequence_length=10, hidden_size=128, num_epochs=10, batch_size=32):
    class LSTMPredictor(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMPredictor, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            lstm_out = self.dropout(lstm_out)
            logits = self.fc(lstm_out[:, -1, :])
            return logits

    # Create sequences
    def create_sequences(input_data, seq_length):
        sequences = []
        for i in range(len(input_data) - seq_length + 1):
            seq = input_data[i:i + seq_length]
            sequences.append(seq)
        return torch.stack(sequences)

    sequences = create_sequences(X_tensor, sequence_length)

    # Create DataLoader
    dataset = TensorDataset(sequences, Y_tensor[:len(sequences)])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model, loss, and optimizer
    model = LSTMPredictor(input_size=X_tensor.shape[1], hidden_size=hidden_size, output_size=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(num_epochs):
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Forward pass
            outputs = model(batch_x)
            if batch_y.ndim > 1:
                batch_y = batch_y.argmax(dim=1)  # Convert one-hot to indices
            batch_y = batch_y.long()  # Ensure correct type

            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

    return model

# ------------------------------ Main Workflow ------------------------------

# Set the model to evaluation mode
model.eval()
with torch.no_grad():
    # Prepare the most recent sequence for prediction
    #recent_sequence = X_tensor[-1:].unsqueeze(0).to(device)  # Add batch dimension
    #prediction = model(recent_sequence)  # Get model's prediction (logits)

    # Get the predicted class (argmax of logits)
    #predicted_class = torch.argmax(prediction, dim=1).item()  # Convert logits to class index

    # Decode the predicted class back to attack type using the encoder
    #attack_type = attack_encoder.inverse_transform([predicted_class])

    #print("Predicted Attack Type:", attack_type[0])
    model_attack = train_model(X_tensor, Y_tensor_attack, num_classes=num_attack_types)
    model_groups = train_model(X_tensor, Y_tensor_group, num_classes=num_groups)
    model_provstate = train_model(X_tensor, Y_tensor_provstate, num_classes=num_provstates)
    model_city = train_model(X_tensor, Y_tensor_provstate, num_classes=num_cities)
    model_date = train_model(X_tensor, Y_tensor_date, num_classes=num_dates)







  data = pd.read_csv('./globalterrorismdb_0718dist.csv', encoding="Windows-1252")
  input_columns.iloc[:, :3] = scaler.fit_transform(input_columns.iloc[:, :3].astype(float))
  input_columns.iloc[:, :3] = scaler.fit_transform(input_columns.iloc[:, :3].astype(float))
  input_columns.iloc[:, :3] = scaler.fit_transform(input_columns.iloc[:, :3].astype(float))
  y = column_or_1d(y, warn=True)


torch.Size([181682, 10, 5])
Epoch [1/10], Loss: 0.0000
Epoch [2/10], Loss: 0.0000
Epoch [3/10], Loss: 0.0000
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
Predicted Attack Type: Armed Assault


name (csv column number)
```
•    iyear (1), imonth (2), iday (3)
•    country code (7) and country_txt (8)
•    region code (9) and region_txt (10)
•    provstate (11) and city (12)
•    latitude (13) and longitude (14)
•    attacktype1 (28) and attacktype1_txt (29)
•    targtype1 (34) and targtype1_txt (35)
•    targsubtype1 (36) and targsubtype1_txt (37)
•    target1 (39) (the specific target by name, building or person)

•    natity1 (40) and natity1_txt (41) (maybe later)
•    gname (group name) (57)
•    weaptype1 (81) and weaptype1_txt (82) (maybe)
•    nkill (98) and nwound (101)

```

