In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers
from Dataset import get_data

In [4]:
# Translation table that deletes every ASCII punctuation character; built once
# because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK objects used by tokenize_tweet.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TOKENIZER_RESOURCES:
        # Loading is deferred to the first call so that defining the function
        # does not require the NLTK corpora to be available yet.
        _TOKENIZER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TOKENIZER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TOKENIZER_RESOURCES['stop_words'], _TOKENIZER_RESOURCES['lemmatizer']


def tokenize_tweet(tweet):
    """Turn a raw tweet into a list of lemmatized content tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. delete every character listed in ``string.punctuation``;
      3. split into tokens with NLTK's ``word_tokenize``;
      4. drop English stop words and tokens of three characters or fewer;
      5. lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Args:
        tweet: The tweet text. Any non-string value (for example ``None`` or
            the ``NaN`` pandas uses for a missing cell) produces an empty list
            instead of raising.

    Returns:
        list[str]: The lemmatized tokens, in their original order.
    """
    # Missing text cannot be tokenized; treat it as a tweet with no tokens.
    if not isinstance(tweet, str):
        return []

    # The stop-word set and lemmatizer are shared across calls: this function
    # runs once per tweet, so rebuilding them every time is wasted work.
    stop_words, lemmatizer = _get_tokenizer_resources()

    cleaned = tweet.lower().translate(_PUNCTUATION_TABLE)
    tokens = [
        token for token in word_tokenize(cleaned)
        if token not in stop_words and len(token) > 3
    ]
    return [lemmatizer.lemmatize(token) for token in tokens]

In [5]:
def get_top_n_tokens_per_period(all_matchs, n):
    """Count tokens per ID in every match and keep only the n most frequent.

    Args:
        all_matchs: Mapping of match name -> DataFrame. Each DataFrame is
            grouped by its 'ID' column and the texts are read from 'Tweet'.
        n: Number of most frequent tokens to keep for each ID.

    Returns:
        dict: ``{match name: {ID: {token: count}}}`` where each innermost
        dict holds at most ``n`` entries.
    """
    top_tokens_by_match = {}
    for match_name, match_df in all_matchs.items():
        top_tokens_by_id = {}
        for period_id, period_tweets in match_df.groupby('ID')['Tweet']:
            # Accumulate the counts tweet by tweet for this ID.
            frequencies = Counter()
            for text in period_tweets:
                frequencies.update(tokenize_tweet(text))
            top_tokens_by_id[period_id] = dict(frequencies.most_common(n))
        top_tokens_by_match[match_name] = top_tokens_by_id
    return top_tokens_by_match

In [6]:
def create_token_matrix(period_token_counts):
    """Build a tokens-by-IDs count matrix from nested token counts.

    Args:
        period_token_counts: ``{match name: {ID: {token: count}}}``. Counts
            are expected to be integers.

    Returns:
        pandas.DataFrame: Integer matrix whose rows are all tokens (sorted)
        and whose columns are all IDs (sorted). A cell holds the count of
        that token for that ID, or 0 when the token is absent. If the same ID
        occurs in several matches, the count from the match visited last
        wins for any token they share.
    """
    all_tokens = set()
    all_ids = set()
    for match_dict in period_token_counts.values():
        for id_, token_count_dict in match_dict.items():
            all_tokens.update(token_count_dict)
            all_ids.add(id_)

    all_tokens = sorted(all_tokens)
    all_ids = sorted(all_ids)

    # Positional lookups so the counts can be written straight into an array;
    # setting DataFrame cells one at a time with .at is far slower.
    token_row = {token: row for row, token in enumerate(all_tokens)}
    id_col = {id_: col for col, id_ in enumerate(all_ids)}

    counts = np.zeros((len(all_tokens), len(all_ids)), dtype=int)
    for match_dict in period_token_counts.values():
        for id_, token_count_dict in match_dict.items():
            col = id_col[id_]
            for token, count in token_count_dict.items():
                counts[token_row[token], col] = count

    return pd.DataFrame(counts, index=all_tokens, columns=all_ids)

In [7]:
def create_evaluation_matrix(all_matchs_eval, train_vocab, n):
    """Build the evaluation count matrix over the training vocabulary.

    The top-``n`` tokens of every evaluation ID are computed with
    ``get_top_n_tokens_per_period`` and projected onto ``train_vocab``:
    tokens that are not part of the training vocabulary are ignored.

    Args:
        all_matchs_eval: Evaluation data, passed unchanged to
            ``get_top_n_tokens_per_period``.
        train_vocab: Sequence (or pandas Index) of tokens used as the row
            index, in the order they should appear.
        n: Number of most frequent tokens kept per ID.

    Returns:
        pandas.DataFrame: Integer matrix with ``train_vocab`` as rows and the
        sorted evaluation IDs as columns; absent tokens are 0.
    """
    period_token_counts_eval = get_top_n_tokens_per_period(all_matchs_eval, n)

    all_ids_eval = sorted({
        id_
        for match_dict in period_token_counts_eval.values()
        for id_ in match_dict
    })

    # Positional lookups: one dict probe per token instead of an Index
    # membership test followed by a per-cell .at write.
    vocab_row = {token: row for row, token in enumerate(train_vocab)}
    id_col = {id_: col for col, id_ in enumerate(all_ids_eval)}

    counts = np.zeros((len(train_vocab), len(all_ids_eval)), dtype=int)
    for match_dict in period_token_counts_eval.values():
        for id_, token_count_dict in match_dict.items():
            col = id_col[id_]
            for token, count in token_count_dict.items():
                row = vocab_row.get(token)
                if row is not None:  # skip tokens unseen during training
                    counts[row, col] = count

    return pd.DataFrame(counts, index=train_vocab, columns=all_ids_eval)

In [8]:
def build_mlp_model(input_dim: int) -> Sequential:
    """Create and compile the binary-classification MLP.

    Architecture: Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid). Both hidden Dense layers use an L2
    kernel regularizer of 0.001.

    Args:
        input_dim: Number of input features.

    Returns:
        Sequential: The model compiled with binary cross-entropy loss, the
        Adam optimizer and accuracy as metric.
    """
    l2_strength = 0.001
    drop_rate = 0.5

    stack = [
        Dense(128, activation='relu', input_dim=input_dim,
              kernel_regularizer=regularizers.l2(l2_strength)),
        Dropout(drop_rate),
        Dense(64, activation='relu',
              kernel_regularizer=regularizers.l2(l2_strength)),
        Dropout(drop_rate),
        Dense(1, activation='sigmoid'),  # single unit: binary output
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [9]:
def run_mlp_classification(token_matrix_all, id_labels, test_size=0.2, val_size=0.25,
                           epochs=50, batch_size=32, patience=5, random_state=42):
    """Split the data, train the MLP with early stopping and report metrics.

    Args:
        token_matrix_all: DataFrame with tokens as rows and IDs as columns;
            it is transposed so that each ID becomes one sample.
        id_labels: Mapping of ID -> label, with an entry for every column of
            ``token_matrix_all``.
        test_size: Fraction of all samples held out as the test set.
        val_size: Fraction of the remaining (non-test) samples used for
            validation. With the defaults (0.2 and 0.25) the overall split is
            60% train / 20% validation / 20% test.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        patience: Epochs without ``val_loss`` improvement before stopping.
        random_state: Seed used for both stratified splits.

    Returns:
        tuple: ``(model, (X_test, y_test, ids_test))``.

    Raises:
        KeyError: If one or more IDs of the matrix have no label.
    """
    ids = token_matrix_all.columns.tolist()

    # Fail early with the offending IDs rather than a bare KeyError from the
    # first missing lookup.
    missing = [id_ for id_ in ids if id_ not in id_labels]
    if missing:
        raise KeyError(
            f"No label found for {len(missing)} ID(s), e.g. {missing[:5]}"
        )
    y = np.array([id_labels[id_] for id_ in ids])

    # Samples are IDs, features are token counts.
    X = token_matrix_all.T.values

    # First carve out the test set, then split the rest into train/validation.
    X_temp, X_test, y_temp, y_test, ids_temp, ids_test = train_test_split(
        X, y, ids, test_size=test_size, stratify=y, random_state=random_state
    )
    X_train, X_val, y_train, y_val, ids_train, ids_val = train_test_split(
        X_temp, y_temp, ids_temp, test_size=val_size, stratify=y_temp,
        random_state=random_state
    )

    print(f"Train set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    # NOTE(review): only the splits are seeded here; the model's weight
    # initialisation is presumably not, so metrics may vary between runs.
    model = build_mlp_model(input_dim=X_train.shape[1])
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True
    )

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose=1
    )

    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

    return model, (X_test, y_test, ids_test)

In [10]:
def predict_on_evaluation_set(model, evaluation_matrix, threshold=0.5):
    """Predict a binary label for every ID of the evaluation matrix.

    Args:
        model: Fitted model exposing ``predict(X)`` that returns one
            probability per sample.
        evaluation_matrix: DataFrame with tokens as rows and IDs as columns;
            it is transposed so that each ID becomes one sample.
        threshold: Probabilities strictly greater than this value are mapped
            to 1, all others to 0. Defaults to 0.5.

    Returns:
        pandas.DataFrame: Columns 'ID' and 'EventType' (integer 0/1), one row
        per column of ``evaluation_matrix``. Empty when there are no IDs.
    """
    ids_eval = evaluation_matrix.columns.tolist()
    X_eval = evaluation_matrix.T.values

    # Nothing to score: return an empty, correctly shaped frame instead of
    # handing the model an empty array.
    if X_eval.shape[0] == 0:
        return pd.DataFrame({
            'ID': ids_eval,
            'EventType': np.array([], dtype=int)
        })

    y_pred_proba = np.asarray(model.predict(X_eval))
    y_pred = (y_pred_proba > threshold).astype(int).ravel()

    return pd.DataFrame({
        'ID': ids_eval,
        'EventType': y_pred
    })

In [None]:
# Load the datasets. get_data() returns four values; only the second
# (training matches) and the fourth (evaluation matches) are kept.
# NOTE(review): presumably each is a dict of match name -> DataFrame with
# 'ID' and 'Tweet' columns (plus 'EventType' for training), since that is how
# the later cells consume them — confirm against Dataset.get_data.
_, all_matchs_cleaned, _, all_matchs_cleaned_eval = get_data()

In [None]:
# Number of most frequent tokens kept per ID. The same value is reused when
# the evaluation matrix is built, so both matrices share one cutoff.
n = 100
# {match name: {ID: {token: count}}}, limited to the top-n tokens of each ID.
period_token_counts = get_top_n_tokens_per_period(all_matchs_cleaned, n)
# Tokens-by-IDs count matrix (rows = tokens, columns = IDs).
token_matrix_all = create_token_matrix(period_token_counts)

In [None]:
# Map every ID to its EventType label across all training matches.
# NOTE(review): assumes all rows sharing an ID carry the same EventType; if
# they differ, the last row seen for that ID wins — confirm against the data.
id_labels = {}
for df_match in all_matchs_cleaned.values():
    # Zipping the two columns gives the same mapping as visiting the rows one
    # by one, without building a Series per row as iterrows() does.
    id_labels.update(zip(df_match['ID'], df_match['EventType']))

In [None]:
# Train the MLP on the token matrix. Besides the fitted model, the call
# returns the held-out test split as (X_test, y_test, ids_test).
model, test_data = run_mlp_classification(token_matrix_all, id_labels)

In [None]:
# Reuse the training vocabulary (the rows of the training matrix) so the
# evaluation features have the same size and order as the model's input.
train_vocab = token_matrix_all.index
evaluation_matrix = create_evaluation_matrix(all_matchs_cleaned_eval, train_vocab, n)

In [None]:
# Predict a binary EventType for every evaluation ID and write the result to
# 'predictions.csv' (columns ID and EventType, without the index column).
predictions_df = predict_on_evaluation_set(model, evaluation_matrix)
predictions_df.to_csv('predictions.csv', index=False)