## Notebook showing how to make predictions with a trained model for sub-event detection

#### Import relevant libraries:

In [1]:
from datetime import datetime
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import os
import gc
import json
import sys

sys.path.append('../.')
from preprocessor import filter_tweet
import EnhancedPeriodClassifier.train as train

# Sentiment support -- only needed when the sentiment feature variant is used.
# `sia` is the analyzer handed to train.compute_sentiment_scores inside
# compute_and_normalize_features_sentiment.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')  # downloads the 'vader_lexicon' NLTK package (presumably required by the analyzer below)
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /users/eleves-a/2024/tim-
[nltk_data]     luka.horstmann.m2/nltk_data...


#### Constants:

In [None]:
DATA_DIR = '/Data/tlh45/'   # root directory holding the Hugging Face cache ("hf_cache") and the challenge data ("challenge_data")
MODEL_DIR = 'Model'     # directory with the trained classifier and normalization_params.json (both obtained from training)
MAX_TWEETS = 1500           # tweets kept per period: longer periods are truncated, shorter ones are padded up to this size
BATCH_SIZE = 32             # number of periods scored per predict_batch call; lower it if GPU memory runs out

EMBEDDING_DIM = 768         # dimension of the embeddings (BERT-base); not referenced by the code visible in this notebook
MAX_LENGTH = 128            # maximum number of tokens in a tweet (passed as max_length to the embedding helper)

# CHOOSE WHICH MODEL TO USE ("normal" version or "modified" version)
# Keep exactly one of the two imports below active.
# NOTE(review): presumably it has to match the architecture of the checkpoint in MODEL_DIR -- confirm.
from enhancedperiodclassifier_modified import EnhancedPeriodClassifier
# from enhancedperiodclassifier import EnhancedPeriodClassifier

# Define output directory for predictions: one folder per calendar day, created if missing
today = datetime.today().strftime('%Y-%m-%d')
output_dir = os.path.join("..", "..", "..", "predictions", f"preds-{today}")
os.makedirs(output_dir, exist_ok=True)


# JSON file with the feature means/stds used for normalization; loaded before inference starts
normalization_params_path = os.path.join(MODEL_DIR, "normalization_params.json")

#### Function Definitions:

In [7]:
def compute_and_normalize_features_sentiment(tweets, period_id, normalization_params):
    """Compute the z-scored scalar features of one period, including sentiment.

    Args:
        tweets: The tweets belonging to the period.
        period_id: The period's ID.
        normalization_params: Mapping providing the '*_mean' / '*_std' entries
            for 'num_tweets', 'avg_tweet_length', 'periodID' and 'sentiment'.

    Returns:
        Tuple ``(num_tweets_norm, avg_tweet_length_norm, normalized_period,
        normalized_sentiment)``.
    """
    def standardize(value, mean_key, std_key):
        # Plain z-score using the stored statistics.
        return (value - normalization_params[mean_key]) / normalization_params[std_key]

    tweet_count = len(tweets)
    if tweet_count == 0:
        # An empty period has no length or sentiment to average.
        mean_length, mean_sentiment = 0, 0
    else:
        mean_length = np.mean([len(str(text)) for text in tweets])
        mean_sentiment = train.compute_sentiment_scores(tweets, sia)

    return (
        standardize(tweet_count, 'num_tweets_mean', 'num_tweets_std'),
        standardize(mean_length, 'avg_tweet_length_mean', 'avg_tweet_length_std'),
        standardize(period_id, 'periodID_mean', 'periodID_std'),
        standardize(mean_sentiment, 'sentiment_mean', 'sentiment_std'),
    )

def compute_and_normalize_features(tweets, period_id, normalization_params):
    """Compute the z-scored scalar features of one period (no sentiment).

    Args:
        tweets: The tweets belonging to the period.
        period_id: The period's ID.
        normalization_params: Mapping providing the '*_mean' / '*_std' entries
            for 'num_tweets', 'avg_tweet_length' and 'periodID'.

    Returns:
        Tuple ``(num_tweets_norm, avg_tweet_length_norm, normalized_period)``.
    """
    def standardize(value, mean_key, std_key):
        # Plain z-score using the stored statistics.
        return (value - normalization_params[mean_key]) / normalization_params[std_key]

    tweet_count = len(tweets)
    # Average character length of the tweets; 0 for an empty period.
    mean_length = np.mean([len(str(text)) for text in tweets]) if tweet_count > 0 else 0

    return (
        standardize(tweet_count, 'num_tweets_mean', 'num_tweets_std'),
        standardize(mean_length, 'avg_tweet_length_mean', 'avg_tweet_length_std'),
        standardize(period_id, 'periodID_mean', 'periodID_std'),
    )

def prepare_embeddings_and_masks(tweets, tokenizer, bertweet_model, device):
    """Truncate/pad one period's tweets to MAX_TWEETS and embed them.

    The caller's ``tweets`` list is never modified: padding is applied to a
    new list. (Extending the argument in place would append pad tokens to the
    lists stored in the grouped DataFrame, which are later written to CSV.)

    Args:
        tweets: The tweets of a single period.
        tokenizer: Tokenizer whose ``pad_token`` is used as the filler tweet.
        bertweet_model: Encoder passed through to the embedding helper.
        device: Device passed through to the embedding helper.

    Returns:
        Tuple ``(embeddings, mask)``. ``embeddings`` is whatever
        ``train.compute_cls_embeddings_batch`` returns for the MAX_TWEETS
        padded tweets; ``mask`` is a list of MAX_TWEETS ints, 1 for real
        tweets and 0 for padding.
    """
    num_real = min(len(tweets), MAX_TWEETS)
    num_padding = MAX_TWEETS - num_real

    # Build a fresh list so the caller's data stays untouched.
    padded_tweets = list(tweets[:MAX_TWEETS]) + [tokenizer.pad_token] * num_padding
    mask = [1] * num_real + [0] * num_padding

    # embeddings = train.compute_mean_embeddings_batch(padded_tweets, bertweet_model=bertweet_model, tokenizer=tokenizer, device=device, max_length=MAX_LENGTH)
    # OR (experimental)
    embeddings = train.compute_cls_embeddings_batch(padded_tweets, bertweet_model=bertweet_model, tokenizer=tokenizer, device=device, max_length=MAX_LENGTH)

    return embeddings, mask

def predict_batch(tweets_batch, period_ids, normalization_params, tokenizer, bertweet_model, model, device):
    """Run the classifier on a batch of periods.

    Args:
        tweets_batch: One list of tweets per period.
        period_ids: The period ID of each entry in ``tweets_batch``.
        normalization_params: Statistics used to normalize the scalar features.
        tokenizer: Tokenizer forwarded to ``prepare_embeddings_and_masks``.
        bertweet_model: Encoder forwarded to ``prepare_embeddings_and_masks``.
        model: Classifier called with ``embeddings``, ``masks`` and
            ``additional_features``; its output must provide ``'logits'``.
        device: Torch device the input tensors are moved to.

    Returns:
        Tuple ``(predictions, probabilities)`` of NumPy arrays: the argmax
        class per period and the softmax probability of class index 1.
    """
    # Pass 1: one row of normalized scalar features per period.
    # To use sentiment, activate the two "with sentiment" lines instead of
    # their counterparts directly above them.
    normalized_sentiment = []
    feature_rows = []
    for period_tweets, pid in zip(tweets_batch, period_ids):
        count_z, length_z, period_z = compute_and_normalize_features(period_tweets, pid, normalization_params)  # without sentiment
        # count_z, length_z, period_z, sentiment_z = compute_and_normalize_features_sentiment(period_tweets, pid, normalization_params)  # with sentiment
        feature_rows.append([count_z, length_z, period_z])  # without sentiment
        # feature_rows.append([count_z, length_z, period_z, sentiment_z])  # with sentiment

    # Pass 2: padded embeddings and the matching masks for every period.
    prepared = [
        prepare_embeddings_and_masks(period_tweets, tokenizer, bertweet_model, device)
        for period_tweets in tweets_batch
    ]
    embeddings_batch = [embedding for embedding, _ in prepared]
    masks_batch = [mask for _, mask in prepared]

    embeddings_tensor = torch.tensor(np.stack(embeddings_batch), dtype=torch.float32).to(device)
    masks_tensor = torch.tensor(np.stack(masks_batch), dtype=torch.float32).to(device)
    additional_features = torch.tensor(feature_rows, dtype=torch.float32).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(
            embeddings=embeddings_tensor,
            masks=masks_tensor,
            additional_features=additional_features,
        )
        probabilities = torch.softmax(outputs['logits'], dim=-1)
        predictions = probabilities.argmax(dim=-1)

    # Release cached GPU memory and garbage between batches.
    torch.cuda.empty_cache()
    gc.collect()

    positive_probability = probabilities.cpu().numpy()[:, 1]
    return predictions.cpu().numpy(), positive_probability

def group_evaluation_data(df):
    """Collapse tweet-level rows into one row per ``ID``.

    For each ID the first ``MatchID`` and ``PeriodID`` are kept and all
    ``Tweet`` values are gathered into a list.

    Args:
        df: DataFrame with the columns 'ID', 'MatchID', 'PeriodID', 'Tweet'.

    Returns:
        DataFrame with the columns 'ID', 'MatchID', 'PeriodID', 'Tweet'
        (one row per ID, 'Tweet' holding a list).
    """
    aggregations = dict(MatchID='first', PeriodID='first', Tweet=list)
    per_id = df.groupby('ID').agg(aggregations)
    # Turn the 'ID' index back into a regular column.
    return per_id.reset_index()

def load_and_preprocess_eval_data(eval_path):
    """Read one evaluation CSV and clean its 'Tweet' column.

    Steps: drop rows whose 'Tweet' duplicates an earlier row, run
    ``filter_tweet`` on every remaining tweet, then drop rows whose tweet
    is missing afterwards and renumber the index.

    Args:
        eval_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(eval_path)
    unique_tweets = raw.drop_duplicates(subset=['Tweet'])
    filtered = unique_tweets.assign(Tweet=unique_tweets['Tweet'].apply(filter_tweet))
    return filtered.dropna(subset=['Tweet']).reset_index(drop=True)

def load_predictions(directory):
    """Concatenate every per-file prediction CSV found in ``directory``.

    Only files named ``predictions_*.csv`` are read. They are processed in
    sorted filename order because ``os.listdir`` returns entries in arbitrary
    order, which would make the row order of the result non-reproducible.

    Args:
        directory: Folder containing the ``predictions_*.csv`` files.

    Returns:
        A single DataFrame with the rows of all files and a fresh index.

    Raises:
        ValueError: If ``directory`` contains no matching file.
    """
    prediction_files = sorted(
        name for name in os.listdir(directory)
        if name.startswith("predictions_") and name.endswith(".csv")
    )
    if not prediction_files:
        # pd.concat would also raise ValueError here, but without saying why.
        raise ValueError(f"No 'predictions_*.csv' files found in {directory}")

    frames = [pd.read_csv(os.path.join(directory, name)) for name in prediction_files]
    return pd.concat(frames, ignore_index=True)

#### Start inference:

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained classifier. A failure is re-raised (with the original
# exception chained) instead of being printed and followed by exit(): in a
# notebook exit() shuts the kernel down and the traceback is lost.
try:
    model = EnhancedPeriodClassifier.from_pretrained(MODEL_DIR)
except Exception as e:
    raise RuntimeError(f"Error loading model: {e}") from e
model.to(device).eval()
print("Model loaded successfully!")

# BERTweet tokenizer and encoder used to embed the tweets.
tokenizer = AutoTokenizer.from_pretrained(
    'vinai/bertweet-base',
    cache_dir=os.path.join(DATA_DIR, "hf_cache"),
    normalization=True,
    use_fast=False
)
bertweet_model = AutoModel.from_pretrained(
    'vinai/bertweet-base',
    cache_dir=os.path.join(DATA_DIR, "hf_cache")
)
bertweet_model.to(device).eval()
torch.backends.cudnn.benchmark = True

# Feature statistics saved at training time.
if not os.path.exists(normalization_params_path):
    raise FileNotFoundError(f"Normalization parameters not found at {normalization_params_path}")

with open(normalization_params_path, 'r') as f:
    normalization_params = json.load(f)

# Score every evaluation CSV and write one predictions file per input file.
# The file list is filtered and sorted up front: os.listdir order is
# arbitrary, and the progress bar should only count the CSV files.
eval_dir = os.path.join(DATA_DIR, "challenge_data", "eval_tweets")
eval_files = sorted(name for name in os.listdir(eval_dir) if name.endswith(".csv"))
for eval_file in tqdm(eval_files, desc="Processing evaluation files"):
    eval_path = os.path.join(eval_dir, eval_file)
    df = load_and_preprocess_eval_data(eval_path)
    grouped = group_evaluation_data(df)

    tweets_list = grouped['Tweet'].tolist()
    period_ids = grouped['PeriodID'].tolist()

    predictions = []
    probabilities = []

    # Periods are scored BATCH_SIZE at a time.
    for i in tqdm(range(0, len(tweets_list), BATCH_SIZE), desc=f"Evaluating {eval_file}"):
        batch_tweets = tweets_list[i:i + BATCH_SIZE]
        batch_period_ids = period_ids[i:i + BATCH_SIZE]

        preds, probs = predict_batch(
            batch_tweets, batch_period_ids, normalization_params, tokenizer, bertweet_model, model, device
        )
        predictions.extend(preds)
        probabilities.extend(probs)

    grouped['EventType'] = predictions
    grouped['Probability'] = probabilities

    output_path = os.path.join(output_dir, f"predictions_{eval_file}")
    grouped.to_csv(output_path, index=False)

print("Inference completed. Predictions saved to:", output_dir)

# Merge the per-file predictions into a single CSV.
df_predictions = load_predictions(output_dir)
combined_output_path = os.path.join(output_dir, "predictions.csv")
df_predictions.to_csv(combined_output_path, index=False)

print(f"Combined predictions saved to {combined_output_path}")
print(df_predictions['EventType'].value_counts())

Embedding dimension: 768
Number of classes: 2
Number of attention heads: 8
Dropout probability: 0.5
Additional features dimension: 3
FC1 dimension: 4096
Model loaded successfully!


Processing evaluation files:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating GermanyGhana32.csv:   0%|          | 0/5 [00:00<?, ?it/s]

  with torch.no_grad(), autocast(enabled=True):


Evaluating GreeceIvoryCoast44.csv:   0%|          | 0/5 [00:00<?, ?it/s]

  with torch.no_grad(), autocast(enabled=True):


Evaluating NetherlandsMexico64.csv:   0%|          | 0/4 [00:00<?, ?it/s]

  with torch.no_grad(), autocast(enabled=True):


Evaluating GermanySerbia2010.csv:   0%|          | 0/5 [00:00<?, ?it/s]

  with torch.no_grad(), autocast(enabled=True):


Inference completed. Predictions saved to: ../../predictions/preds-2024-12-11
Combined predictions saved to ../../predictions/preds-2024-12-11/predictions.csv
EventType
1    282
0    234
Name: count, dtype: int64


#### Prepare predictions for submission

In [6]:
df_predictions

Unnamed: 0,ID,MatchID,PeriodID,Tweet,EventType,Probability
0,6_0,6,0,['I Finally get to see Germany play\n#GER 🇩🇪...,0,0.187946
1,6_1,6,1,"['""In a few minutes #BigGame of #GER x #GHA......",0,0.348002
2,6_10,6,10,['Ghana invented the gravity bong #WorldCup #f...,1,0.944888
3,6_100,6,100,"['THIS GAME. #GhanavsGermany #WorldCup', ""Let'...",1,0.959938
4,6_101,6,101,"['Klose! You come on the pitch, the ball goes ...",1,0.952688
...,...,...,...,...,...,...
511,15_95,15,95,['Want Mexico to win just for their goal keepe...,0,0.262070
512,15_96,15,96,"[""I'm gonna go laugh to my netherlands uncle's...",0,0.124372
513,15_97,15,97,"['Put chicharito in pleaseee ! #CH14 #mex', 'D...",0,0.112108
514,15_98,15,98,['Dirk Kuyt must be one of the fittest players...,0,0.119479


In [7]:
# Sort by MatchID, then by PeriodID within each match, and keep only the
# two columns needed for the submission.
df = (
    df_predictions
    .sort_values(by=['MatchID', 'PeriodID'])
    .reset_index(drop=True)
    .loc[:, ['ID', 'EventType']]
)
df

Unnamed: 0,ID,EventType
0,6_0,0
1,6_1,0
2,6_2,0
3,6_3,0
4,6_4,0
...,...,...
511,16_125,1
512,16_126,1
513,16_127,1
514,16_128,1


In [8]:
# Write the sorted ID/EventType table next to the other prediction files of this run.
submission_path = os.path.join(output_dir, "filtered_predictions.csv")
df.to_csv(submission_path, index=False)  # index=False: the row index is not written to the file
print(f"Submission file saved to {submission_path}")

Submission file saved to ../predictions/preds-2024-12-08/filtered_predictions.csv
