<a href="https://colab.research.google.com/github/TerrorismAnalyticsBureau/TAB-AI/blob/main/TABAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
import os
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from torch.utils.data import DataLoader, TensorDataset

# ------------------------------ Data Preprocessing ------------------------------

# Set pyTorch local env to use segmented GPU memory
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Clear GPU cache & Set the device to use GPU
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
# Skip rows = 1 because those are the column names
X = np.array([])

# Read the file using its encoding
data = pd.read_csv('./globalterrorismdb_0718dist.csv', encoding="Windows-1252")

# Extract relevant columns (adjust indices or column names as needed)
input_columns = data.iloc[:, [1, 2, 3, 7, 11]]
input_columns = input_columns.fillna(0)

# Convert non-numeric to numeric and fill missing values
for col in input_columns.columns:
    input_columns[col] = pd.to_numeric(input_columns[col], errors='coerce')  # Convert non-numeric to NaN
input_columns = input_columns.fillna(0)  # Replace NaN with 0

attack_target = data.iloc[:, [28]]
group_target = data.iloc[:, [58]]

# Set the base date (last day of 2017)
last_date = datetime(2017, 12, 31)

# Convert last date to numeric form
last_date_numeric = last_date.toordinal()

# Get date from dataset
data['imonth'].fillna(pd.to_datetime(1), inplace=True)
data['iday'].fillna(pd.to_datetime(1), inplace=True)

data['date_str'] = data['iyear'].astype(str) + '-' + data['imonth'].astype(str).str.zfill(2) + '-' + data['iday'].astype(str).str.zfill(2)
data['date'] = pd.to_datetime(data['date_str'], errors='coerce')


# Convert dates to numeric by subtracting the last date of 2017
# Get number of days since Dec 31, 2017
data['date_numeric'] = (data['date'] - last_date).dt.days

# Extract unique values
unique_attacks = list(set(data['attacktype1_txt']))
unique_groups = list(set(data['gname']))
unique_provstates = list(set(data['provstate']))
unique_cities = list(set(data['city']))

# Initialize LabelEncoder and fit to the unique groups
attack_encoder = LabelEncoder()
attack_encoder.fit(unique_attacks)

group_encoder = LabelEncoder()
group_encoder.fit(unique_groups)

provstate_encoder = LabelEncoder()
provstate_encoder.fit(unique_provstates)

city_encoder = LabelEncoder()
city_encoder.fit(unique_cities)

# Set the output size based on the number of unique attack types
num_attack_types = len(unique_attacks)
num_groups = len(unique_groups)
num_cities = len(unique_cities)
num_provstates = len(unique_provstates)

# Create a dictionary to map names to their encoded IDs
group_dict = pd.Series(group_encoder.transform(unique_groups), index=unique_groups)
provstate_dict = pd.Series(provstate_encoder.transform(unique_provstates), index=unique_provstates)
city_dict = pd.Series(city_encoder.transform(unique_cities), index=unique_cities)

# Assign values to tensors for processing
input_tensor = torch.tensor(input_columns.to_numpy(), dtype=torch.float32)
attack_target_tensor = torch.tensor(attack_target.values, dtype=torch.float32)
group_target_tensor = torch.tensor(group_encoder.fit_transform(group_target.values), dtype=torch.float32)
city_target_tensor = torch.tensor(city_encoder.fit_transform(data['city'].values), dtype=torch.float32)
provstate_target_tensor = torch.tensor(provstate_encoder.fit_transform(data['provstate'].values), dtype=torch.float32)

# TESTING - PRINT DICTIONARY ITEMS
#for key, value in group_dict.items():
#  print("group: ", key, "| ID #:", value)

#for key, value in provstate_dict.items():
#  print("provstate: ", key, "| ID #:", value)

#for key, value in city_dict.items():
#  print("city: ", key, "| ID #:", value)

# Assign values to tensors for processing
X_tensor = input_tensor

# Normalize: mean and std for each feature
mean = X_tensor.mean(dim=0, keepdim=True)
std = X_tensor.std(dim=0, keepdim=True)
X_tensor = (X_tensor - mean) / std
X_tensor_norm = X_tensor

Y_tensor_attack = attack_target_tensor
Y_tensor_group = group_target_tensor
Y_tensor_city = city_target_tensor
Y_tensor_provstate = provstate_target_tensor
Y_tensor_date = data['date_numeric'] - last_date_numeric

# Set tensors to use GPU
X_tensor = X_tensor.to(device)
Y_tensor_attack = Y_tensor_attack.to(device)
Y_tensor_group = Y_tensor_group.to(device)
Y_tensor_city = Y_tensor_city.to(device)
Y_tensor_provstate = Y_tensor_provstate.to(device)

# ------------------------------ LSTM Prediction Model ------------------------------
def train_model(X_tensor, Y_tensor, num_classes, sequence_length=30, hidden_size=128, num_epochs=10, batch_size=32):
    class LSTMPredictor(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMPredictor, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            lstm_out = self.dropout(lstm_out)
            logits = self.fc(lstm_out[:, -1, :])
            return logits

    # Create sequences
    def create_sequences(input_data, seq_length):
        sequences = []
        for i in range(len(input_data) - seq_length + 1):
            seq = input_data[i:i + seq_length]
            sequences.append(seq)
        return torch.stack(sequences)

    sequences = create_sequences(X_tensor, sequence_length)

    # Create DataLoader
    dataset = TensorDataset(sequences, Y_tensor[:len(sequences)])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model, loss, and optimizer
    model = LSTMPredictor(input_size=X_tensor.shape[1], hidden_size=hidden_size, output_size=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

    # Training loop
    for epoch in range(num_epochs):
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Forward pass
            outputs = model(batch_x)
            if batch_y.ndim > 1:
                batch_y = batch_y.argmax(dim=1)  # Convert one-hot to indices
            batch_y = batch_y.long()  # Ensure correct type

            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

    return model

# ------------------------------ Linear Regression Prediction Model ------------------------------

class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

# Initialize the model
model_date = LinearRegressionModel(X_tensor.shape[1])
criterion = nn.MSELoss()
optimizer = optim.SGD(model_date.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # Adjust epochs as needed
    optimizer.zero_grad()
    predictions = model_date(X_tensor)
    loss = criterion(predictions, Y_tensor_date)
    loss.backward()
    optimizer.step()

# ------------------------------ Train & Evaluate Models ------------------------------
model_attack = train_model(X_tensor, Y_tensor_attack, num_classes=num_attack_types)
model_attack = model_attack.to(device)

model_groups = train_model(X_tensor, Y_tensor_group, num_classes=num_groups)
model_groups = model_groups.to(device)

model_city = train_model(X_tensor, Y_tensor_city, num_classes=num_cities)
model_city = model_city.to(device)

model_provstate = train_model(X_tensor, Y_tensor_provstate, num_classes=num_provstates)
model_provstate = model_provstate.to(device)

model_date = LinearRegressionModel(X_tensor.shape[1])
model_date.fit(X_tensor, Y_tensor_date)

# Set the model to evaluation mode
model_attack.eval()
model_groups.eval()
model_city.eval()
model_provstate.eval()
model_date.eval()

# ------------------------------ Testing ------------------------------
with torch.no_grad():
    # Prepare the most recent sequence for prediction
    recent_sequence = X_tensor[-1:].unsqueeze(0).to(device)  # Add batch dimension

    # Model 1: Attack prediction
    prediction_attack = model_attack(recent_sequence)  # Get model's prediction (logits)
    # Get the predicted class (argmax of logits)
    predicted_class_attack = torch.argmax(prediction_attack, dim=1).item()  # Convert logits to class index
    # Decode the predicted class back to attack type using the encoder
    attack_type = attack_encoder.inverse_transform([predicted_class_attack])
    print("Predicted Attack Type:", attack_type[0])

    # Model 2: Group prediction
    prediction_group = model_groups(recent_sequence)  # Get model's prediction (logits)
    # Get the predicted class (argmax of logits)
    predicted_class_group = torch.argmax(prediction_group, dim=1).item()  # Convert logits to class index
    # Decode the predicted class back to attack type using the encoder
    group_name = group_encoder.inverse_transform([predicted_class_group])
    print("Predicted Group Name:", group_name[0])

    # Model 3: City prediction
    prediction_city = model_city(recent_sequence)  # Get model's prediction (logits)
    # Get the predicted class (argmax of logits)
    predicted_class_city = torch.argmax(prediction_city, dim=1).item()  # Convert logits to class index
    # Decode the predicted class back to attack type using the encoder
    city_name = city_encoder.inverse_transform([predicted_class_city])
    print("Predicted City Name:", city_name[0])

    # Model 4: provstate prediction
    prediction_provstate = model_provstate(recent_sequence)  # Get model's prediction (logits)
    # Get the predicted class (argmax of logits)
    predicted_class_provstate = torch.argmax(prediction_provstate, dim=1).item()  # Convert logits to class index
    # Decode the predicted class back to attack type using the encoder
    provstate_name = provstate_encoder.inverse_transform([predicted_class_provstate])
    print("Predicted State Name:", provstate_name[0])

    # Model 5: Date prediction
    # Predict the offset (number of days) for a new input
    future_input = torch.tensor([X_tensor]).unsqueeze(0)  # Replace with actual input features
    predicted_offset = model_date.predict(future_input)[0]

    # Calculate the predicted numeric date (days since Dec 31, 2017)
    predicted_numeric_date = last_date_numeric + predicted_offset

    # Convert the predicted numeric date back to a datetime object
    predicted_date = datetime.fromordinal(predicted_numeric_date)

    print("Predicted Date:", predicted_date)







  data = pd.read_csv('./globalterrorismdb_0718dist.csv', encoding="Windows-1252")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['imonth'].fillna(pd.to_datetime(1), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['iday'].fillna(pd.to_datetime(1), inplace=True)
  y = column_or_1d(y, warn=True)


Epoch [1/10], Loss: nan
Epoch [2/10], Loss: nan
Epoch [3/10], Loss: nan
Epoch [4/10], Loss: nan
Epoch [5/10], Loss: nan
Epoch [6/10], Loss: nan
Epoch [7/10], Loss: nan
Epoch [8/10], Loss: nan
Epoch [9/10], Loss: nan
Epoch [10/10], Loss: nan
Epoch [1/10], Loss: nan
Epoch [2/10], Loss: nan
Epoch [3/10], Loss: nan
Epoch [4/10], Loss: nan
Epoch [5/10], Loss: nan
Epoch [6/10], Loss: nan
Epoch [7/10], Loss: nan
Epoch [8/10], Loss: nan
Epoch [9/10], Loss: nan
Epoch [10/10], Loss: nan
Epoch [1/10], Loss: nan
Epoch [2/10], Loss: nan
Epoch [3/10], Loss: nan
Epoch [4/10], Loss: nan
Epoch [5/10], Loss: nan
Epoch [6/10], Loss: nan
Epoch [7/10], Loss: nan
Epoch [8/10], Loss: nan
Epoch [9/10], Loss: nan
Epoch [10/10], Loss: nan
Epoch [1/10], Loss: nan
Epoch [2/10], Loss: nan
Epoch [3/10], Loss: nan
Epoch [4/10], Loss: nan
Epoch [5/10], Loss: nan
Epoch [6/10], Loss: nan
Epoch [7/10], Loss: nan
Epoch [8/10], Loss: nan
Epoch [9/10], Loss: nan
Epoch [10/10], Loss: nan


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

name (csv column number)
```
•    iyear (1), imonth (2), iday (3)
•    country code (7) and country_txt (8)
•    region code (9) and region_txt (10)
•    provstate (11) and city (12)
•    latitude (13) and longitude (14)
•    attacktype1 (28) and attacktype1_txt (29)
•    targtype1 (34) and targtype1_txt (35)
•    targsubtype1 (36) and targsubtype1_txt (37)
•    target1 (39) (the specific target by name, building or person)

•    natity1 (40) and natity1_txt (41) (maybe later)
•    gname (group name) (58)
•    weaptype1 (81) and weaptype1_txt (82) (maybe)
•    nkill (98) and nwound (101)

```

