# Graph approach to sub-event detection in Twitter streams 

## Imports

In [1]:
import networkx as nx
from itertools import combinations
from sklearn.metrics import precision_score, recall_score, f1_score
import re
import numpy as np
from scipy.optimize import minimize
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
import bisect
from scipy.sparse import find
import networkx as nx
import math
import os 
import csv 
from os.path import basename, isdir, isfile, splitext, join
from nltk.stem import *
from sklearn.metrics import accuracy_score
import time

###################################### Event detection parameters ######################################

PREVIOUS_PERIODS = 5
EVENT_DETECTION_THRESHOLD = 0.8


## Preprocessing

In [None]:

def preprocess_tweet(tweet):
    """Preprocess a single tweet (example function)."""
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r"http[s]?://\S+", "", tweet)
    # Remove mentions
    tweet = re.sub(r"@\w+", "", tweet)
    # Remove hashtags
    tweet = re.sub(r"#\w+", "", tweet)
    # Remove special characters
    tweet = re.sub(r"\W+", " ", tweet)
    # Ignore retweets:
    tweet = re.sub('rt @?[a-zA-Z0-9_]+:?', '', tweet)
    # ignore usernames
    tweet = re.sub('@[a-zA-Z0-9_]+:?', '', tweet)
    # Remove special characters
    tweet = re.sub(r'\W+', ' ', tweet) 
    # Remove new lines
    tweet = re.sub('[\s+]', ' ', tweet)
    #Unique Words
    unique_words_index = set()
    new_tweet = ''
    # Suppress duplicate tweets
    words = tweet.split()
    for word in words:
        if word not in unique_words_index:
            unique_words_index.add(word)
            new_tweet = new_tweet + ' ' + word
    tweet = new_tweet
    # Remove leading and trailing whitespaces
    return tweet.strip()

def process_file(input_file, output_file, chunk_size=1000):
    with open(input_file, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        fieldnames = ['ID', 'MatchID', 'PeriodID', 'EventType', 'Timestamp', 'Preprocessed_Tweet']
        
        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            
            chunk = []
            for i, row in enumerate(reader):
                tweet = row['Tweet']
                preprocessed_tweet = preprocess_tweet(tweet)
                chunk.append({
                    'ID': row['ID'],
                    'MatchID': row['MatchID'],
                    'PeriodID': row['PeriodID'],
                    'EventType': row['EventType'],
                    'Timestamp': row['Timestamp'],
                    'Preprocessed_Tweet': preprocessed_tweet
                })
                
                # Write the chunk to file after processing `chunk_size` rows
                if (i + 1) % chunk_size == 0:
                    writer.writerows(chunk)
                    chunk = []
            
            # Write any remaining rows
            if chunk:
                writer.writerows(chunk)          

def process_test_file(input_file, output_file, chunk_size=1000):
    with open(input_file, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        fieldnames = ['ID', 'MatchID', 'PeriodID', 'Timestamp', 'Preprocessed_Tweet']
        
        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            
            chunk = []
            for i, row in enumerate(reader):
                tweet = row['Tweet']
                preprocessed_tweet = preprocess_tweet(tweet)
                chunk.append({
                    'ID': row['ID'],
                    'MatchID': row['MatchID'],
                    'PeriodID': row['PeriodID'],
                    'Timestamp': row['Timestamp'],
                    'Preprocessed_Tweet': preprocessed_tweet
                })
                
                # Write the chunk to file after processing `chunk_size` rows
                if (i + 1) % chunk_size == 0:
                    writer.writerows(chunk)
                    chunk = []
            
            # Write any remaining rows
            if chunk:
                writer.writerows(chunk)

#Importing the data and define output directories-----------------------------------------------

# Uncomment to test on train data
# train_data = "./../../challenge_data/train_tweets"  
test_data = "./../../challenge_data"
# output_dir = "./../../predictions/challenge_data/processed_train_data"
output_dir2 = "./../../predictions/challenge_data/processed_test_data"

# Create directories if they don't exist
# os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_dir2, exist_ok=True)

# train_files = [join(train_data, f) for f in os.listdir(train_data) if isfile(join(train_data, f))]
test_files = [join(test_data, f) for f in os.listdir(test_data) if isfile(join(test_data, f))]

# Processing loop (Uncomment to run)------------------------------------------------------------

# # Process training files
# for file in train_files:
#     print(f"Processing file: {file}")
#     output_file = join(output_dir, os.path.basename(file))
#     os.makedirs(os.path.dirname(output_file), exist_ok=True)
#     process_file(file, output_file)
#     print(f"Processed file saved to: {output_file}")
#     print("---------------------------------------------------")

# # Process testing files
# for file in test_files:
#     print(f"Processing file: {file}")
#     output_file = join(output_dir2, os.path.basename(file))
#     os.makedirs(os.path.dirname(output_file), exist_ok=True)
#     process_test_file(file, output_file)
#     print(f"Processed file saved to: {output_file}")
#     print("---------------------------------------------------")


## Event detection

In [6]:

"""
Deviation Method:
- Sensitive to abrupt changes in graph structure over time.
- Uses context (previous periods) for comparison.

Evolution Method:
- Sensitive to differences between consecutive periods.
- Detects significant structural changes using the Frobenius norm.
"""

def detect_event_deviation(tweets_number, adjacency_matrix, previous_periods, vocabulary, full_tweets, period_id):
    """Function that is used to decide if the current period is an event by taking advantage of the Least Squares Optimization"""
    if tweets_number == 0:
        return [[], False, "No tweets found in the current period."]
    
    period_score = -1

    vector, vector_nodes, vector_edges, weighted_edges = generate_vector(adjacency_matrix, vocabulary)
    # consider at most the last P periods
    P = PREVIOUS_PERIODS
    if len(previous_periods) > P:
        previous_periods_to_consider = previous_periods[-P:]
    elif len(previous_periods) > 0:
        previous_periods_to_consider = previous_periods
    else:
        return [period_id, period_score >= EVENT_DETECTION_THRESHOLD]

    #initialize the weights matrix
    weights = np.zeros((len(vector_edges), len(previous_periods_to_consider)))

    #fill the weights matrix
    for i in range(len(previous_periods_to_consider)):
        weights[:, i] = np.asarray(get_edges_weight(previous_periods_to_consider[i]['adjacency_matrix'], 
                                                    previous_periods_to_consider[i]['vocabulary'], 
                                                    vector_edges, vector_nodes))
        
    #deduct score
    period_score = least_square(weights, vector)

    is_event = period_score >= EVENT_DETECTION_THRESHOLD

    return [period_id, is_event]

def detect_event_evolution(tweets_number, adjacency_matrix, previous_periods,vocabulary, full_tweets, period_id):
    """Function that is used to decide if the current period is an event by taking advantage of the Frobenius Norm"""
    if tweets_number == 0:
        return [[], False, "No tweets found in the current period."]
    
    if len(previous_periods) > 0:
        previous_event = previous_periods[-1]['is_event']
    else:
        previous_event = False

    if len(previous_periods) > 0:
        diff = frob(adjacency_matrix, previous_periods[-1]['adjacency_matrix'], EVENT_DETECTION_THRESHOLD)
    else:
        diff = False

    if diff:
        is_event = not(previous_event)
    else:
        is_event = previous_event 
        

    return [period_id, is_event]

def generate_vector(adjacency_matrix, vocabulary):
    """Function that is used to generate a vector for the current period"""
    non_zero_edges = get_nonzero_edges(adjacency_matrix)
    vector = np.zeros((len(non_zero_edges), 1))
    vector_edges = []
    vector_nodes = set()
    weighted_edges = {}
    counter = 0
    for row, column, value in non_zero_edges:
        vector[counter] = value
        nodes = [vocabulary[row], vocabulary[column]]
        vector_edges.append(nodes)
        vector_nodes.update(nodes)
        weighted_edges[tuple(sorted(nodes))] = value
        counter += 1
    return vector, vector_nodes, vector_edges, weighted_edges

def get_nonzero_edges(matrix):
    """Function that is used to extract from the adjacency matrix the edges with no-negative weights"""
    rows, columns, values = find(matrix)
    return [[rows[i], columns[i], float(values[i])] for i in range(len(rows))]

def generate_adjacency_matrix_dense(tweets, vocabulary):
    """Function that is used to generate the adjacency matrix of the given tweets"""
    wordsNumber = len(vocabulary)
    adjacency_matrix = np.zeros((wordsNumber, wordsNumber))
    tweets_edges = []
    tweet_counter = -1
    for tweet in tweets:
        tweet = set(tweet)  
        indexes = []
        for word in tweet:
            if word in vocabulary:
                indexes.append(vocabulary.index(word))
        counter = 0
        tweet_counter += 1
        tweets_edges.append([])
        for i in indexes:
            for j in indexes[counter:]:
                if i == j:
                    continue
                adjacency_matrix[i, j] += 1.0 / len(tweet)
                adjacency_matrix[j, i] += 1.0 / len(tweet)
                tweets_edges[tweet_counter].append(sorted([vocabulary[i], vocabulary[j]]))
            counter += 1
    return adjacency_matrix, tweets_edges

def get_edges_weight(adjacency_matrix, vocabulary, edges_list, nodes_list):
    """Function that is used to extract the weight for each edge in the given list. The nodes_list parameter is a
    list that contains the nodes that are included in the given edges """
    nodes = {}
    for node in nodes_list:
        index = bisect.bisect(vocabulary, node) - 1
        if (0 <= index <= len(vocabulary)) and vocabulary[index] == node:
            nodes[node] = index

    weight_list = []
    for edge in edges_list:
        first_word, second_word = edge[0], edge[1]
        if all(word in nodes for word in (first_word, second_word)):
            indexes = [nodes[first_word], nodes[second_word]]
            indexes.sort()
            weight_list.append(adjacency_matrix[indexes[0], indexes[1]])
        else:
            weight_list.append(0)
    return weight_list

def frob(A_current, A_previous, threshold=0.1):
    """
    Compares adjacency matrices of consecutive graphs.
    Operates on sparse matrices to save memory.
    """

    A_current = csr_matrix(A_current)
    A_previous = csr_matrix(A_previous)
    
    # Compute the difference between the two matrices
    diff = A_current - A_previous
    
    # Compute frobenius norm
    change_norm = norm(diff, ord='fro')  
    
    return change_norm > threshold

def least_square(A, b):
    """Method that solves the Least Squares problem"""
    return np.linalg.lstsq(A, b, rcond=None)[0].sum()

def main(file_path, output_file, method = 'deviation'):

    # Load dataset
    data = pd.read_csv(file_path)
    data['Preprocessed_Tweet'] = data['Preprocessed_Tweet'].astype(str).fillna('')  # Convert to strings and handle NaN values
    data = data.drop_duplicates(subset='Preprocessed_Tweet')
    # Create a fixed vocabulary based on the entire dataset
    all_words = set()
    for tweet in data['Preprocessed_Tweet']:
        all_words.update(tweet.split())
    vocabulary = sorted(list(all_words))
    
    # Group tweets into time periods
    grouped = data.groupby(pd.Grouper(key='PeriodID'))
    
    previous_periods = []
    results = []
    for period, group in grouped:
        tweets = group['Preprocessed_Tweet'].tolist()
        full_tweets = group['Preprocessed_Tweet'].tolist()
        timestamps = [str(period), str(period)]
        
        # Generate the adjacency matrix and tweet edges
        adjacency_matrix, tweets_edges = generate_adjacency_matrix_dense(tweets, vocabulary)
        
        # Detect event using the specified method
        if method == 'deviation':
            result = detect_event_deviation(len(tweets), adjacency_matrix, previous_periods, vocabulary, full_tweets, timestamps)
        elif method == 'evolution':
            result = detect_event_evolution(len(tweets), adjacency_matrix, previous_periods, vocabulary, full_tweets, timestamps)
        else:
            raise ValueError(f"Invalid method: {method}. Method must be either 'deviation' or 'evolution'.")
        
        # result = detect_event(len(tweets), adjacency_matrix, previous_periods, vocabulary, full_tweets, timestamps)
        # Append period ID and event detection result (1 for True, 0 for False)
        results.append({'PeriodID': period, 'Event': int(result[1])})

        # Update previous periods
        previous_periods.append({
            'adjacency_matrix': adjacency_matrix,
            'vocabulary': vocabulary,
            'is_event': result[1]
        })

    output_df = pd.DataFrame(results)
    output_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
        

## Main (train set)

(only run to test on training set)

In [None]:
def run(file_path, output_file = "./../../predictions/test_output.csv"):
    start_time = time.time()
    print("-------------------------------- Running the model --------------------------------")
    print(f"      File: {file_path}")
    print(f"      Output: {output_file}")
    print(f"      Threshold: {EVENT_DETECTION_THRESHOLD}")
    # load test data in chunks
    test_file_path = "./../../challenge_data/test.csv"
    data = pd.read_csv(file_path)
    data_test = data.drop("EventType", axis=1)

    # Smaller datasets for testing
    # data_test = data_test.drop(data_test.index[70000:])
    # data = data.drop(data.index[70000:])

    data_test.to_csv("./../../predictions/test.csv", index=False)

    # Create results files for comparaison
    data = data.drop_duplicates(subset=['PeriodID'], keep='first')
    exp_results = data[['PeriodID', 'EventType']]
    # exp_results.to_csv("outputs/expected_results.csv", index=False)
                       
    #Execute main function
    print("      Executing main function...")
    main(test_file_path, output_file)

    # Load the results
    output = pd.read_csv(output_file)

    # Calculate accuracy
    print(f"     Calculating accuracy...")
    accuracy = accuracy_score(exp_results['EventType'], output['Event'])
    print(f"     Accuracy before postprocessing: {accuracy:.2f}")
    print(f"     Postprocessing...")
    # #PostProcessing
    output['Event'][:5] = 0
    # # output['Event'][5:10] = 1
    output['Event'][122:127] = 1
    output['Event'][127:] = 0

    accuracy2 = accuracy_score(exp_results['EventType'], output['Event'])
    print(f"     Accuracy after postprocessing: {accuracy2:.2f}")
    if accuracy2 > accuracy:
        print("     Postprocessing improved the accuracy.")
        output.to_csv(output_file, index=False)
    else:
        print("     Postprocessing did not improve the accuracy.")
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"""      Execution time: {execution_time:.2f} seconds ({execution_time/60:.2f} minutes)""")
    print("-----------------------------------------------------------------------------------")
    return output, accuracy

def get_all_expected_results(dir_path, output_dir):
    for files in os.listdir(dir_path):
        file_path = f"{dir_path}/{files}"
        game = files.split(".")[0]
        data = pd.read_csv(file_path)
        data = data.drop_duplicates(subset=['PeriodID'], keep='first')
        expected_results = data[["ID", "EventType"]]
        os.makedirs(f"{output_dir}/{game}", exist_ok=True)
        expected_results.to_csv("{output_dir}/" + game + "/expected_results.csv", index=False,) 

### Deviation method

In [None]:
PREVIOUS_PERIODS = 10
print(PREVIOUS_PERIODS)
# Best: P = 12, threshold = 1.06, 
periods = [12, 13, 14]
files = os.listdir("./../predictions/challenge_data/processed_train_data")
files = files[:1]
thresholds = [1.06]
# for P in periods:
#     PREVIOUS_PERIODS = P
#     print("P: ", P)
#     for file in files:
#         for threshold in thresholds:
#             EVENT_DETECTION_THRESHOLD = threshold
#             os.makedirs("./../../predictions/test", exist_ok=True)
#             output, acc = run("./../predictions/challenge_data/processed_train_data/" + file, output_file="./../../predictions/test/" + file)
        # print("Accuracy: ", acc)

10



| **Threshold** | **Description** |
|---------------|-----------------|
| 0.5           | Returns 1's only |
| 0.8           | Acc: 0.6         |
| 0.9           | Acc: 0.61        |
| 1             | Acc: 0.63        |
| 1.06          | Acc: 0.69        |
| 1.1           | Acc: 0.63        |
| 1.2           | Returns 0's only |



### Evolution method:


In [None]:
# (Modify main function before running the following: method = evolution)
# thresholds = [95]
# # for files in os.listdir("./../predictions/challenge_data/processed_train_data"):
# for threshold in thresholds:
#     EVENT_DETECTION_THRESHOLD = threshold
#     th = f"{threshold}"
#     output, acc = run("challenge_data/processed_train_data/ArgentinaBelgium72.csv", output_file="./../../predictions/test/arg_" + th + ".csv")
#     # print("Accuracy: ", acc)

| **Threshold** | **Accuracy** |
|---------------|--------------|
| 75            | 0.64         |
| 80            | 0.72         |
| 85            | 0.72         |
| 90            | 0.72         |
| 95            | 0.72         |
| 100           | 0.72         |
| 105           | 0.72         |
| 110           | 0.72         |
| 115           | 0.72         |
| 120           | 0.68         |
| 125           | 0.68         |
| 130           | 0.64         |
| 135           | 0.64         |
| 140           | 0.64         |
| 145           | 0.64         |
| 150           | 0.64         |
| 200           | 0.59         |
| 250           | 0.59         |
| 300           | 0.53         |
| 350           | 0.53         |
| 400           | 0.59         |
| 450           | 0.59         |
| 500           | 0.59         |
| 550           | 0.59         |
| 600           | 0.59         |


## Main (test set)

In [None]:
def run_test(file_path, output_file = "outputs/output.csv"):
    print("-------------------------------- Running the model --------------------------------")
    print(f"      File: {file_path}")
    print(f"      Output: {output_file}")
    print(f"      Threshold: {EVENT_DETECTION_THRESHOLD}")
    
    #Execute main function
    print("      Executing main function...")
    main(file_path, output_file)

    # Load the results
    output = pd.read_csv(output_file)

    print(f"     Postprocessing...")
    #PostProcessing
    output['Event'][:5] = 0
    # output['Event'][5:10] = 1
    output['Event'][122:127] = 1
    output['Event'][127:] = 0

    output.to_csv(output_file, index=False)
    print("-----------------------------------------------------------------------------------")
    return output

In [21]:
thresholds = [95]
EVENT_DETECTION_THRESHOLD = 1.06
# periods = [12, 130]
# for P in periods:
PREVIOUS_PERIODS = 130
os.makedirs(f"""./../../predictions/single_game_pred/""", exist_ok=True)
for file in os.listdir("./../../predictions/challenge_data/processed_test_data"):
        output, acc = run_test("./../../predictions/challenge_data/processed_test_data/" + file, output_file=f"""./../../predictions/single_game_pred/""" + files)
        
    # print("Accuracy: ", acc)

-------------------------------- Running the model --------------------------------
      File: ./../../predictions/challenge_data/processed_test_data/GermanyGhana32.csv
      Output: ./../../predictions/single_game_pred/GermanyGhana32.csv
      Threshold: 1.06
      Executing main function...
Results saved to ./../../predictions/single_game_pred/GermanyGhana32.csv
     Postprocessing...
-----------------------------------------------------------------------------------


In [22]:
# Submission format:
# ID, Event
def submission_format(output_file):
    submission = pd.DataFrame(columns=["ID", "EventType"])
    ids = ["6_", "16_", "9_", "15_"]
    id = 0
    for files in os.listdir("./../../predictions/single_game_pred"):
         output = pd.read_csv("./../../predictions/single_game_pred" + files)
         # Change the ID
         output['ID'] = output['PeriodID'].apply(lambda x: ids[id] + str(x))
         id += 1
         output = output[["ID", "Event"]]
         output["EventType"] = output['Event']
         output = output.drop("Event", axis=1)
         submission = pd.concat([submission, output], ignore_index=False)
    print(f"Results saved to {output_file}")
    submimsion = submission[["ID", "EventType"]]
    submission.to_csv(output_file, index=False)

submission_format("./../../predictions/submission_graph.csv")


Results saved to ./../../predictions/submission_graph.csv
