### This notebook contains experiments with embeddings and basic features

In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from Dataset import get_data
import gensim.downloader as api
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import math
import matplotlib.dates as mdates


In [5]:
# Load the four tweet collections through the project's Dataset.get_data helper.
# all_matchs and all_matchs_cleaned are iterated with .items() further down (name -> DataFrame).
# NOTE(review): the *_eval collections presumably hold the evaluation matches — confirm in Dataset.py.
all_matchs, all_matchs_cleaned, all_matchs_eval, all_matchs_cleaned_eval = get_data()

In [3]:
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

### Comparison of the number of tweets over time between the raw and cleaned datasets

In [None]:
# Tweets-per-minute curves, raw vs cleaned, drawn on a grid with one panel per match.


def _tweets_per_minute(matches, count_column):
    """Return {match name: per-minute tweet counts} for every frame in `matches`.

    Each frame's 'Timestamp' column is parsed as epoch milliseconds and the
    parsed values are written back onto that frame before resampling.
    """
    per_minute = {}
    for name, frame in matches.items():
        frame['Timestamp'] = pd.to_datetime(frame['Timestamp'], unit='ms')
        counts = frame.set_index('Timestamp').resample('min').size()
        per_minute[name] = counts.reset_index(name=count_column)
    return per_minute


tweets_per_minute_original = _tweets_per_minute(all_matchs, 'tweet_count_original')
tweets_per_minute_cleaned = _tweets_per_minute(all_matchs_cleaned, 'tweet_count_cleaned')

if not (tweets_per_minute_original and tweets_per_minute_cleaned):
    raise ValueError("Aucune donnée agrégée trouvée pour les fichiers CSV.")

num_cols = 4
num_matches = len(tweets_per_minute_original)
num_rows = math.ceil(num_matches / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), sharex=False, sharey=False)
axes = axes.flatten()

for panel, (match_name, raw_counts) in enumerate(tweets_per_minute_original.items()):
    ax = axes[panel]

    # The cleaned counterpart is looked up under the raw name plus a "_clean" suffix.
    cleaned_counts = tweets_per_minute_cleaned.get(f"{match_name}_clean")

    ax.plot(
        raw_counts['Timestamp'],
        raw_counts['tweet_count_original'],
        color='blue',
        label='Original'
    )
    if cleaned_counts is None:
        # No cleaned table for this match: only the raw curve is shown.
        title = f"Match: {match_name} (No Cleaned Data)"
    else:
        ax.plot(
            cleaned_counts['Timestamp'],
            cleaned_counts['tweet_count_cleaned'],
            color='red',
            label='Cleaned'
        )
        title = f"Match: {match_name}"

    ax.set_title(title, fontsize=10)
    ax.set_xlabel("Time", fontsize=8)
    ax.set_ylabel("Number of Tweets", fontsize=8)
    ax.grid(True)

    # Hour:minute ticks, tilted so neighbouring labels do not overlap.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=6)
    plt.setp(ax.get_yticklabels(), fontsize=6)

    ax.legend(fontsize=6)

# Remove the panels left over when the grid has more cells than matches.
for spare in range(panel + 1, num_rows * num_cols):
    fig.delaxes(axes[spare])

plt.tight_layout()
plt.show()

### Number of tweets containing an important word per PeriodID, split by EventType

In [None]:
def has_important_words(tweet):
    """Tell whether `tweet` contains one of the event keywords.

    The search is case-insensitive and unanchored, so a keyword is also found
    inside a longer word; repeated letters in "goal" (e.g. "goooal") match too.

    Returns:
        bool: True when at least one keyword pattern occurs in `tweet`.
    """
    keyword_patterns = (
        r'g+o+a+l+',
        r'full\s+time',
        r'half\s+time',
        r'kick\s+off',
        r'owngoal',
        r'penalty',
        r'red\s+card',
        r'yellow\s+card',
        r'other',
    )
    pattern = '|'.join(keyword_patterns)
    return re.search(pattern, tweet, re.IGNORECASE) is not None

# Per match: number of tweets containing an important word, by PeriodID and EventType.
important_word_per_period = {}

for match_name, match_df in all_matchs_cleaned.items():
    # Work on a copy so the flag column is not added to the shared frame.
    flagged = match_df.copy()
    flagged['has_important_word'] = flagged['Tweet'].apply(has_important_words)
    counts = flagged[flagged['has_important_word']].groupby(['PeriodID', 'EventType']).size()
    important_word_per_period[match_name] = counts.unstack(level=1, fill_value=0).reset_index()

num_cols = 4
num_matches = len(important_word_per_period)
num_rows = math.ceil(num_matches / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), sharex=False, sharey=False)
axes = axes.flatten()

for panel, (match_name, period_counts) in enumerate(important_word_per_period.items()):
    ax = axes[panel]

    # Every column after PeriodID is a per-EventType count; their sum is the period total.
    period_counts['TotalTweets'] = period_counts.iloc[:, 1:].sum(axis=1)
    average = period_counts['TotalTweets'].mean()

    for _, row in period_counts.iterrows():
        period_id = row['PeriodID']
        event_count = row.get(1, 0)
        no_event_count = row.get(0, 0)
        # Only the bars of the first period carry a label.
        is_first = period_id == period_counts['PeriodID'].iloc[0]

        ax.bar(period_id, event_count, color='red', label='EventType 1' if is_first else "")
        ax.bar(period_id, no_event_count, bottom=event_count, color='blue', label='EventType 0' if is_first else "")

    ax.axhline(y=average, color='yellow', linestyle='--', linewidth=2, label='Mean')

    ax.set_title(f"Match: {match_name}", fontsize=10)
    ax.set_xlabel("PeriodID", fontsize=8)
    ax.set_ylabel("Number of tweet with Important Word", fontsize=8)
    ax.grid(True)

    # Collapse repeated labels so each one appears once in the legend.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), fontsize=6)

# Remove the unused panels of the grid.
for spare in range(panel + 1, num_rows * num_cols):
    fig.delaxes(axes[spare])

plt.tight_layout()
plt.show()


### Average tweet length per PeriodID, based on the EventType. (not relevant)

In [None]:
def tweet_length(tweet):
    """Return the number of whitespace-separated words in `tweet`."""
    words = tweet.split()
    return len(words)

# Mean tweet length (in words) per PeriodID and EventType, one panel per match.
tweet_length_per_period = {}

for match_name, match_df in all_matchs_cleaned.items():
    # The word count is stored as a new column on the shared frame itself.
    match_df['tweet_length'] = match_df['Tweet'].apply(tweet_length)
    mean_lengths = match_df.groupby(['PeriodID', 'EventType'])['tweet_length'].mean()
    tweet_length_per_period[match_name] = mean_lengths.unstack(level=1, fill_value=0).reset_index()

num_cols = 4
num_matches = len(tweet_length_per_period)
num_rows = math.ceil(num_matches / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), sharex=False, sharey=False)
axes = axes.flatten()

for panel, (match_name, period_means) in enumerate(tweet_length_per_period.items()):
    ax = axes[panel]

    # An EventType column is absent when a match has no tweet of that type.
    has_event_1 = 1 in period_means.columns
    has_event_0 = 0 in period_means.columns

    for period_id in period_means['PeriodID']:
        row = period_means[period_means['PeriodID'] == period_id]
        avg_length_event_1 = row[1].values[0] if has_event_1 else 0
        avg_length_event_0 = row[0].values[0] if has_event_0 else 0
        # Only the bars of the first period carry a label.
        is_first = period_id == period_means['PeriodID'].iloc[0]

        ax.bar(period_id, avg_length_event_1, color='red', label='EventType 1' if is_first else "")
        ax.bar(period_id, avg_length_event_0, bottom=avg_length_event_1, color='blue', label='EventType 0' if is_first else "")

    ax.set_title(f"Match: {match_name}", fontsize=10)
    ax.set_xlabel("PeriodID", fontsize=8)
    ax.set_ylabel("Average Tweet Length", fontsize=8)
    ax.grid(True)
    ax.legend(fontsize=6)

# Remove the unused panels of the grid.
for spare in range(panel + 1, num_rows * num_cols):
    fig.delaxes(axes[spare])

plt.tight_layout()
plt.show()


### Number of retweets per PeriodID, split by EventType (not relevant)

In [None]:
def is_retweet(tweet):
    """Return True when `tweet` begins with the retweet marker 'RT @'."""
    retweet_prefix = 'RT @'
    return tweet.startswith(retweet_prefix)

# Number of retweets per PeriodID and EventType, computed on the raw tweets, one panel per match.
retweet_per_period = {}

for match_name, match_df in all_matchs.items():
    # The retweet flag is stored as a new column on the shared frame itself.
    match_df['is_retweet'] = match_df['Tweet'].apply(is_retweet)
    retweet_counts = match_df[match_df['is_retweet']].groupby(['PeriodID', 'EventType']).size()
    retweet_per_period[match_name] = retweet_counts.unstack(level=1, fill_value=0).reset_index()

num_cols = 4
num_matches = len(retweet_per_period)
num_rows = math.ceil(num_matches / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), sharex=False, sharey=False)
axes = axes.flatten()

for panel, (match_name, period_counts) in enumerate(retweet_per_period.items()):
    ax = axes[panel]

    # An EventType column is absent when a match has no retweet of that type.
    has_event_1 = 1 in period_counts.columns
    has_event_0 = 0 in period_counts.columns

    for period_id in period_counts['PeriodID']:
        row = period_counts[period_counts['PeriodID'] == period_id]
        retweets_event_1 = row[1].values[0] if has_event_1 else 0
        retweets_event_0 = row[0].values[0] if has_event_0 else 0
        # Only the bars of the first period carry a label.
        is_first = period_id == period_counts['PeriodID'].iloc[0]

        ax.bar(period_id, retweets_event_1, color='red', label='EventType 1' if is_first else "")
        ax.bar(period_id, retweets_event_0, bottom=retweets_event_1, color='blue', label='EventType 0' if is_first else "")

    ax.set_title(f"Match: {match_name}", fontsize=10)
    ax.set_xlabel("PeriodID", fontsize=8)
    ax.set_ylabel("Number of Retweets", fontsize=8)
    ax.grid(True)
    ax.legend(fontsize=6)

# Remove the unused panels of the grid.
for spare in range(panel + 1, num_rows * num_cols):
    fig.delaxes(axes[spare])

plt.tight_layout()
plt.show()


In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer, stop-word list, WordNet).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Name of the pretrained vectors requested from gensim's downloader.
embeddings_model_name = "glove-twitter-200"
# Size of the zero vector used when no token is known; it has to equal the model's vector length
# (presumably the "200" in the model name — TODO confirm).
embedding_dim = 200
embeddings_model = api.load(embeddings_model_name)

In [None]:
# English stop words kept in a set for fast membership tests during tokenisation.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance used by tokenize_and_lemmatize.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_tweet(tweet):
    """Normalise a raw tweet before tokenisation.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop the
    standalone retweet marker "rt", strip ASCII punctuation.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned, lowercased text. Whitespace left behind by the removed
        pieces is not collapsed.
    """
    tweet = tweet.lower()
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    # Word boundaries keep the marker removal from deleting the letters "rt"
    # inside ordinary words ("start", "party", "sport", ...).
    tweet = re.sub(r'\brt\b', '', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    return tweet

def tokenize_and_lemmatize(tweet):
    """Tokenize `tweet`, skip stop words and lemmatize the remaining tokens.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    lemmas = []
    for token in word_tokenize(tweet):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess_tweets(tweets):
    """Clean, then tokenize and lemmatize, each tweet of `tweets`.

    Returns:
        list: One token list per input tweet, in input order.
    """
    return [tokenize_and_lemmatize(clean_tweet(tweet)) for tweet in tweets]

def get_embedding(tokens, embeddings_model, embedding_dim):
    """Average the vectors of the tokens known to `embeddings_model`.

    Args:
        tokens: Iterable of tokens.
        embeddings_model: Mapping-like object supporting `token in model`
            and `model[token]`.
        embedding_dim: Length of the zero vector returned as fallback.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length `embedding_dim` when no token is known.
    """
    vectors = []
    for token in tokens:
        if token in embeddings_model:
            vectors.append(embeddings_model[token])
    if not vectors:
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)

def create_embeddings(data, embeddings_model, embedding_dim):
    """Build one averaged embedding per row of `data`.

    Every entry of `data['Tweets']` is a collection of tweets; all their
    tokens are pooled and averaged into a single vector.

    Returns:
        np.ndarray: The per-row embeddings stacked in row order.
    """
    rows = []
    for tweets in data['Tweets']:
        pooled_tokens = []
        for tokens in preprocess_tweets(tweets):
            pooled_tokens.extend(tokens)
        rows.append(get_embedding(pooled_tokens, embeddings_model, embedding_dim))
    return np.array(rows)

In [None]:
def add_id_column(df):
    """Add an 'ID' column of the form '<MatchID>_<PeriodID>' to `df`.

    The column is written onto `df` itself, which is also returned.
    """
    match_part = df['MatchID'].astype(str)
    period_part = df['PeriodID'].astype(str)
    df['ID'] = match_part + '_' + period_part
    return df

def process_match_cleaned(df, is_test=False):
    """Compute per-period keyword counts and tweet counts for one match.

    Each tweet is tested against one regex per event family; the hits are
    then summed per '<MatchID>_<PeriodID>' ID.

    Args:
        df: Tweets of one match with 'MatchID', 'PeriodID' and 'Tweet'
            columns. The frame is copied, not modified.
        is_test: Accepted for call compatibility; not used by this function.

    Returns:
        DataFrame with one row per ID: the 'ID' column, one 'has_<event>'
        count column per event family, and 'nb_tweets'.
    """
    df = add_id_column(df.copy())

    nb_tweets = df.groupby('ID').size().reset_index(name='nb_tweets')

    # Vocabulary per event family. Repeated entries are harmless: they are
    # dropped when the regex is assembled below.
    advanced_patterns = {
        "goal": [
            "goal", "scored", "scoring", "nets", "shot", "strikes", "finishes", "hits", "equalizer",
            "header", "top", "corner", "bottom", "worldie", "screamer", "wonder", "amazing", "unbelievable",
            "finish", "golazo", "back", "net", "brace", "hat", "trick", "free", "kick", "open", "play",
            "tapped", "in", "clinical", "last", "minute", "top", "class", "strike", "blasted", "goalie",
            "keeper", "goalkeeper", "shoot", "target", "unstoppable", "rebound", "assist", "attack",
            "goalbound", "counter", "offensive", "ball"
        ],
        "half_time": [
            "halftime", "half-time", "first", "half", "break", "intermission", "end", "ht", "whistle",
            "time", "rest", "half", "stats", "rest", "period", "interval", "cooling", "pause", "review",
            "recap", "highlights", "analysis", "midway", "stop", "halfway", "breakdown", "mid", "review",
        ],
        "kick_off": [
            "kickoff", "kick-off", "start", "game", "on", "match", "started", "blown", "underway",
            "begins", "starting", "early", "minutes", "live", "first", "opening", "whistle", "action",
            "beginning", "intro", "debut", "matchday", "clash", "duel", "showdown", "kick",
        ],
        "full_time": [
            "fulltime", "full-time", "end", "final", "finished", "over", "match", "ends", "score",
            "game", "result", "wraps", "ending", "finish", "summary", "conclusion", "closure",
            "last", "blown", "finalized", "full", "stoppage", "post", "reaction", "outcome", "finalized",
        ],
        "penalty": [
            "penalty", "spot", "kick", "pk", "pen", "awarded", "given", "shoots", "saved",
            "converts", "miss", "scored", "shootout", "decider", "call", "shot", "var", "decision",
            "penalized", "keeper", "goalkeeper", "foul", "penalties", "shoot", "retake", "blocked",
            "controversy", "chance", "conversion"
        ],
        "red_card": [
            "red", "card", "sent", "off", "ejected", "foul", "dangerous", "tackle", "reckless",
            "violent", "conduct", "second", "straight", "dismissed", "player", "bad",
            "issued", "poor", "team", "ten", "man", "short", "disciplinary", "harsh", "controversial",
            "deserved", "unfair", "redcard", "offense", "foulplay", "referee", "decision", "call",
        ],
        "yellow_card": [
            "yellow", "card", "booked", "caution", "warning", "reckless", "tackle", "late",
            "challenge", "rough", "play", "soft", "minor", "team", "warned", "fouled", "cheap",
            "booking", "sloppy", "physical", "yellowcard", "persistence", "persistent", "referee",
        ],
        "own_goal": [
            "own", "goal", "og", "unlucky", "deflected", "mistake", "blunder", "misjudged",
            "wrong", "net", "disaster", "horror", "show", "scored", "self", "tragic", "moment",
            "oops", "misplay", "accidental", "misstep", "owngoal", "oopsie", "calamity",
        ],
        "win": [
            "win", "victory", "triumph", "beat", "beats", "defeats", "wins", "victorious",
            "champions", "conquered", "game", "match", "big", "close", "amazing", "sealed",
            "final", "team", "domination", "celebration", "winning", "champion", "success",
            "through", "qualified", "advanced"
        ],
        "loss": [
            "loss", "defeated", "lose", "beaten", "eliminated", "lost", "downfall", "knocked",
            "out", "upset", "heartbreak", "poor", "final", "last", "chance", "missed", "opportunity",
            "team", "falls", "failure", "fallen", "blown", "defeat", "crushed", "disappointed",
            "despair", "heartbreaking", "disappointment", "failure", "elimination", "out", "crash",
        ],
        "other_event": [
            "match", "game", "whistle", "score", "var", "referee", "foul", "play", "team",
            "offside", "substitution", "injury", "celebration", "comeback", "drama", "crowd",
            "roar", "pitch", "stadium", "matchday", "football", "fans", "spirit", "final",
            "minutes", "second", "extra", "time", "stoppage", "preview", "post", "pre", "live",
            "performance", "form", "support", "home", "away", "underdog", "giant", "clash",
            "fixture", "tournament", "season", "points", "table", "record", "momentum", "suspense"
        ],
    }

    def construct_pattern(word):
        # A space or hyphen may appear as any whitespace or be omitted; every
        # other character may be repeated, so "goooal" still matches "goal".
        return ''.join(
            r'\s*' if char in ' -' else f'{re.escape(char)}+'
            for char in word
        )

    # One alternation per event family. dict.fromkeys drops repeated words
    # while keeping their order; duplicates cannot change a match result.
    # NOTE(review): the patterns are unanchored, so short entries such as
    # "in", "on" or "og" also match inside longer words — confirm intended.
    compiled_patterns = {
        event: '|'.join(construct_pattern(word) for word in dict.fromkeys(words))
        for event, words in advanced_patterns.items()
    }

    for event, pattern in compiled_patterns.items():
        # na=False: a missing tweet counts as "no mention" instead of
        # putting NaN into the flag column that is summed below.
        df[f'has_{event}'] = df['Tweet'].str.contains(
            pattern, flags=re.IGNORECASE, regex=True, na=False
        )

    event_columns = [f'has_{event}' for event in compiled_patterns]
    event_mentions = df.groupby('ID')[event_columns].sum().reset_index()

    result = pd.merge(event_mentions, nb_tweets, on='ID')
    return result

In [None]:
def create_new_features(all_matchs_cleaned, is_test=False):
    """Build the per-period feature table of every match and sort it.

    Args:
        all_matchs_cleaned: Mapping of match name to that match's tweets.
        is_test: Forwarded unchanged to process_match_cleaned.

    Returns:
        DataFrame of all matches' features, ordered numerically by match
        then period (both read from the '<MatchID>_<PeriodID>' ID), with a
        fresh 0..n-1 index.
    """
    per_match = [
        process_match_cleaned(match_df, is_test)
        for match_df in all_matchs_cleaned.values()
    ]
    features = pd.concat(per_match, ignore_index=True)

    # Temporary integer keys: sorting the ID strings directly would put
    # '10_2' before '2_10'.
    id_parts = features['ID'].str.split('_')
    features['MatchID_sort'] = id_parts.str[0].astype(int)
    features['PeriodID_sort'] = id_parts.str[1].astype(int)

    sort_keys = ['MatchID_sort', 'PeriodID_sort']
    ordered = features.sort_values(by=sort_keys).drop(columns=sort_keys)
    return ordered.reset_index(drop=True)


In [None]:
# Collapse every period of every match into one row: its ID, its label and
# the list of its tweets.
# NOTE(review): relies on an 'ID' column already being present on the frames
# returned by get_data — confirm.
data_list = []
for match, df in all_matchs_cleaned.items():
    # Named `period_key` rather than `id` so the builtin id() is not
    # overwritten for the rest of the notebook.
    for period_key, group in df.groupby('ID'):
        data_list.append({
            'ID': period_key,
            # The first row's EventType is taken as the label of the whole period.
            'EventType': group['EventType'].iloc[0],
            'Tweets': group['Tweet'].tolist(),
        })

data = pd.DataFrame(data_list)

# Sort numerically by match then period; the helper columns are dropped afterwards.
data[['MatchID', 'PeriodID']] = data['ID'].str.split('_', expand=True)
data['MatchID'] = data['MatchID'].astype(int)
data['PeriodID'] = data['PeriodID'].astype(int)
data = data.sort_values(by=['MatchID', 'PeriodID']).reset_index(drop=True)
data = data.drop(columns=['MatchID', 'PeriodID'])

In [None]:
# Keyword-count and tweet-count features for every period of the cleaned training matches.
new_features_df = create_new_features(all_matchs_cleaned)

In [None]:
# Standardise every feature column; 'ID' is the only column left untouched.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook — confirm this is intended.
columns_to_scale = new_features_df.columns.difference(['ID'])
scaler = StandardScaler()
scaled_values = scaler.fit_transform(new_features_df[columns_to_scale])

final_df_scaled = new_features_df.copy()
final_df_scaled[columns_to_scale] = scaled_values

In [None]:
# Quick look at the first rows of the scaled feature table.
print(final_df_scaled.head())

In [None]:
# Attach the scaled features to the per-period tweets and labels, matching rows on 'ID'.
final_data = data.merge(final_df_scaled, on='ID')

In [None]:
# Quick look at the first rows of the merged table.
print(final_data.head())

In [None]:
# Embed the tweets of `final_data` rather than `data`: the extra features and the
# labels are read from `final_data`, so the embedding rows must come from the same
# frame to stay aligned with them row by row.
X_embeddings = create_embeddings(final_data, embeddings_model, embedding_dim)

In [None]:
# y = data['EventType'].values

In [None]:
# np.save('Embeddings.npy', X)
# np.save('Labels.npy', y)

In [None]:
# X = np.load('Embeddings.npy')
# y = np.load('Labels.npy')

In [None]:
# Hand-made features: every keyword-count column plus the tweet count.
keyword_columns = [col for col in final_data.columns if col.startswith('has_')]
additional_features = keyword_columns + ['nb_tweets']
X_additional = final_data[additional_features].values

In [None]:
# Labels, and the full design matrix: embedding columns first, hand-made features after.
y = final_data['EventType'].values
X = np.hstack([X_embeddings, X_additional])

In [None]:
# Project the feature matrix to 2-D with t-SNE and colour the points by class.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(10, 6))
for class_value, colour in ((0, 'blue'), (1, 'red')):
    points = X_tsne[y == class_value]
    plt.scatter(points[:, 0], points[:, 1], c=colour, label=f'Class {class_value}', alpha=0.5)
plt.title('t-SNE 2D Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
# 70/30 split, stratified on the label and seeded for reproducibility.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Logistic regression with fixed hyper-parameters, evaluated on the held-out split.
print("\nEntraînement du modèle Logistic Regression...")
lr_settings = {
    'max_iter': 439,
    'C': 2.1944043721683357,
    'penalty': 'l2',
    'solver': 'liblinear',
    'random_state': 42,
}
clf = LogisticRegression(**lr_settings)
clf.fit(X_train, y_train)

y_pred_lr = clf.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy Logistic Regression: {accuracy_lr:.4f}')
print('Classification Report Logistic Regression:')
print(classification_report(y_test, y_pred_lr))

In [None]:
# Randomised hyper-parameter search for logistic regression.
#
# The search space is a list of dicts so that only solver/penalty pairs the
# estimator accepts are ever drawn. Crossing every penalty with every solver
# produces many combinations that fail at fit time (e.g. 'l1' with 'lbfgs',
# or 'elasticnet' without an l1_ratio) and waste search iterations.
# The unpenalised variant is left out on purpose: its spelling differs between
# scikit-learn releases ('none' in older ones, None in newer ones).
_lr_common = {
    'C': uniform(0.01, 10),
    'max_iter': randint(100, 1000),
}
param_dist_lr = [
    # liblinear handles l1 and l2
    {**_lr_common, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    # newton-cg, lbfgs and sag only handle l2
    {**_lr_common, 'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2']},
    # saga handles l1 and l2 ...
    {**_lr_common, 'solver': ['saga'], 'penalty': ['l1', 'l2']},
    # ... and elasticnet, which needs a mixing ratio
    {**_lr_common, 'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': uniform(0, 1)},
]

random_search_lr = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_distributions=param_dist_lr,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search_lr.fit(X_train, y_train)
print("Meilleurs paramètres :", random_search_lr.best_params_)
print("Meilleure précision (Cross-validation):", random_search_lr.best_score_)

# Score the refitted best model on the held-out split.
best_lr = random_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy Logistic Regression: {accuracy_lr:.4f}')
print('Classification Report Logistic Regression:')
print(classification_report(y_test, y_pred_lr))

In [None]:
# Baseline random forest (100 trees), evaluated on the held-out split.
print("\nEntraînement du modèle Random Forest...")
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy Random Forest: {accuracy_rf:.4f}')
print('Classification Report Random Forest:')
print(classification_report(y_test, y_pred_rf))

# Meilleurs paramètres : {'bootstrap': True, 'max_depth': 95, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1247}
# Meilleure précision (Cross-validation): 0.7765886287625419
# Accuracy Random Forest: 0.7336
# Classification Report Random Forest:
#               precision    recall  f1-score   support

#            0       0.73      0.67      0.70       296
#            1       0.74      0.79      0.76       346

#     accuracy                           0.73       642
#    macro avg       0.73      0.73      0.73       642
# weighted avg       0.73      0.73      0.73       642

In [None]:
# Randomised hyper-parameter search for the random forest.
param_dist_rf = {
    'n_estimators': randint(100, 2000),
    # 'auto' is no longer listed: for classifiers it was just another name
    # for 'sqrt', and recent scikit-learn releases reject it.
    'max_features': ['sqrt', 'log2'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'bootstrap': [True]
}

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search_rf.fit(X_train, y_train)
print("Meilleurs paramètres :", random_search_rf.best_params_)
print("Meilleure précision (Cross-validation):", random_search_rf.best_score_)

# Score the refitted best model on the held-out split.
best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy Random Forest: {accuracy_rf:.4f}')
print('Classification Report Random Forest:')
print(classification_report(y_test, y_pred_rf))

In [None]:
# Linear SVM with fixed hyper-parameters, evaluated on the held-out split.
print("\nEntraînement du modèle Linéaire SVM...")
linear_svc_settings = {
    'C': 100,
    'loss': 'hinge',
    'max_iter': 3000,
    'tol': 0.0001,
    'random_state': 42,
}
svm_clf = LinearSVC(**linear_svc_settings)
svm_clf.fit(X_train, y_train)

y_pred_svm = svm_clf.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy SVM: {accuracy_svm:.4f}')
print('Classification Report Linéaire SVM:')
print(classification_report(y_test, y_pred_svm))

In [None]:
# Randomised hyper-parameter search for the linear SVM.
param_dist_svc = dict(
    C=uniform(0.01, 10),
    loss=['hinge', 'squared_hinge'],
    max_iter=randint(1000, 5000),
    tol=uniform(1e-5, 1e-1),
)

# 100 sampled candidates, each scored by 5-fold cross-validated accuracy.
search_options = dict(n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_svc = RandomizedSearchCV(
    estimator=LinearSVC(random_state=42),
    param_distributions=param_dist_svc,
    **search_options
)
random_search_svc.fit(X_train, y_train)

print("Meilleurs paramètres :", random_search_svc.best_params_)
print("Meilleure précision (Cross-validation):", random_search_svc.best_score_)

# Score the refitted best model on the held-out split.
best_svc = random_search_svc.best_estimator_
y_pred_svc = best_svc.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f'Accuracy Linear SVC: {accuracy_svc:.4f}')
print('Classification Report Linear SVC:')
print(classification_report(y_test, y_pred_svc))

In [None]:
# RBF-kernel SVM with default-like hyper-parameters, evaluated on the held-out split.
print("\nEntraînement du modèle non linéaire SVM...")
svm_rbf = SVC(kernel='rbf', gamma='scale', C=1.0, random_state=42)
svm_rbf.fit(X_train, y_train)
y_pred_svm_rbf = svm_rbf.predict(X_test)
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
print(f'Accuracy SVM: {accuracy_svm_rbf:.4f}')
# The header previously read "Linéaire SVM", copied from the linear model's cell.
print('Classification Report non linéaire SVM:')
print(classification_report(y_test, y_pred_svm_rbf))

In [None]:
# Randomised hyper-parameter search for the kernel SVM.
param_dist_svc = {
    'C': uniform(0.01, 10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': randint(2, 5),
    'gamma': ['scale', 'auto'],
    'coef0': uniform(0, 1),
    'tol': uniform(1e-5, 1e-1),
    'max_iter': randint(1000, 5000)
}

# `G(` was an undefined name here; the keyword arguments (param_distributions,
# n_iter, random_state) are those of RandomizedSearchCV.
random_search_svc = RandomizedSearchCV(
    estimator=SVC(random_state=42),
    param_distributions=param_dist_svc,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search_svc.fit(X_train, y_train)
print("Meilleurs paramètres :", random_search_svc.best_params_)
print("Meilleure précision (Cross-validation):", random_search_svc.best_score_)

# Score the refitted best model on the held-out split.
best_svc = random_search_svc.best_estimator_
y_pred_svc = best_svc.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f'Accuracy SVC: {accuracy_svc:.4f}')
print('Classification Report SVC:')
print(classification_report(y_test, y_pred_svc))

In [None]:
# Exhaustive grid search for the MLP (every combination below is tried).
param_dist_mlp = dict(
    hidden_layer_sizes=[(10,), (10,10), (50,10), (100,10), (50, 50), (100, 50)],
    activation=['tanh', 'relu'],
    solver=['adam'],
    alpha=[0.001, 0.004, 0.002, 0.003],
    learning_rate=['constant'],
    max_iter=[500, 750, 1000],
    batch_size=[32, 64, 128],
)

# Each combination is scored by 5-fold cross-validated accuracy.
grid_options = dict(scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_mlp = GridSearchCV(
    estimator=MLPClassifier(random_state=42),
    param_grid=param_dist_mlp,
    **grid_options
)
grid_search_mlp.fit(X_train, y_train)

print("Meilleurs paramètres :", grid_search_mlp.best_params_)
print("Meilleure précision (Cross-validation):", grid_search_mlp.best_score_)

# Score the refitted best model on the held-out split.
best_mlp = grid_search_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print(f'Accuracy MLP: {accuracy_mlp:.4f}')
print('Classification Report MLP:')
print(classification_report(y_test, y_pred_mlp))

# random_search_mlp = RandomizedSearchCV(
#     estimator=MLPClassifier(random_state=42),
#     param_distributions=param_dist_mlp,
#     n_iter=100,
#     scoring='accuracy',
#     verbose=2,
#     cv=5,
#     n_jobs=-1,
#     random_state=42
# )

# random_search_mlp.fit(X_train, y_train)
# print("Meilleurs paramètres :", random_search_mlp.best_params_)
# print("Meilleure précision (Cross-validation):", random_search_mlp.best_score_)

# best_mlp = random_search_mlp.best_estimator_
# y_pred_mlp = best_mlp.predict(X_test)
# accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
# print(f'Accuracy MLP: {accuracy_mlp:.4f}')
# print('Classification Report MLP:')
# print(classification_report(y_test, y_pred_mlp))

In [None]:
# MLP with hand-picked hyper-parameters, evaluated on the held-out split.
mlp_settings = dict(
    hidden_layer_sizes=(20, 10),
    activation='relu',
    solver='adam',
    alpha=0.003,
    learning_rate='constant',
    batch_size=64,
    max_iter=980,
    random_state=42,
)
mlp = MLPClassifier(**mlp_settings)
mlp.fit(X_train, y_train)

y_pred_mlp = mlp.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print(f'Accuracy MLP: {accuracy_mlp:.4f}')
print('Classification Report MLP:')
print(classification_report(y_test, y_pred_mlp))

# Meilleurs paramètres : {'activation': 'logistic', 'alpha': 0.003132655146732228, 'batch_size': 122, 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'max_iter': 980, 'solver': 'adam'}
# Meilleure précision (Cross-validation): 0.7498327759197324
# Accuracy MLP: 0.7445
# Classification Report MLP:
#               precision    recall  f1-score   support

#            0       0.74      0.69      0.71       296
#            1       0.75      0.79      0.77       346

#     accuracy                           0.74       642
#    macro avg       0.74      0.74      0.74       642
# weighted avg       0.74      0.74      0.74       642


In [None]:
# Randomised hyper-parameter search for XGBoost (10 sampled candidates).
param_dist = dict(
    learning_rate=uniform(0.01, 0.3),
    subsample=uniform(0.5, 0.5),
    colsample_bytree=uniform(0.5, 0.5),
    min_child_weight=randint(1, 10),
    gamma=uniform(0, 1),
    max_depth=randint(3, 30),
    n_estimators=randint(1000, 10000),
)

xgb_base = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

# Each candidate is scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)

print("Meilleurs paramètres :", random_search.best_params_)
print("Meilleure précision (Cross-validation):", random_search.best_score_)

# Score the refitted best model on the held-out split.
best_xgb_clf = random_search.best_estimator_
y_pred_xgb = best_xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Accuracy XGBoost: {accuracy_xgb:.4f}')
print('Classification Report XGBoost:')
print(classification_report(y_test, y_pred_xgb))


In [None]:
#mac
# Meilleurs paramètres : {'colsample_bytree': 0.5110923710151508, 'gamma': 0.49816518664589293, 'learning_rate': 0.15286320903670425, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 5363, 'subsample': 0.9081928676065312}
# Meilleure précision (Cross-validation): 0.7872909698996656
# Accuracy XGBoost: 0.7523
# Classification Report XGBoost:
#               precision    recall  f1-score   support

#            0       0.72      0.76      0.74       296
#            1       0.79      0.74      0.76       346

#     accuracy                           0.75       642
#    macro avg       0.75      0.75      0.75       642
# weighted avg       0.75      0.75      0.75       642

#poly :
# Meilleurs paramètres : {'colsample_bytree': 0.6289514135224699, 'gamma': 0.17088758739006582, 'learning_rate': 0.2105929659773293, 'max_depth': 27, 'min_child_weight': 2, 'n_estimators': 1846, 'subsample': 0.7675183660224553}
# Meilleure précision (Cross-validation): 0.7832775919732442
# Accuracy XGBoost: 0.7477
# Classification Report XGBoost:
#               precision    recall  f1-score   support

#            0       0.72      0.74      0.73       296
#            1       0.77      0.76      0.76       346

#     accuracy                           0.75       642
#    macro avg       0.75      0.75      0.75       642
# weighted avg       0.75      0.75      0.75       642

# Old parameters (probably ~0.7 accuracy)
# xgb_clf = XGBClassifier(
#     n_estimators=1000,          # Nombre d'arbres
#     max_depth=10,               # Profondeur maximale des arbres
#     learning_rate=0.1,         # Taux d'apprentissage
#     subsample=0.8,             # Fraction des échantillons à utiliser pour chaque arbre
#     colsample_bytree=0.8,      # Fraction des caractéristiques à utiliser pour chaque arbre
#     objective='binary:logistic',  # Objectif pour classification binaire
#     use_label_encoder=False,   # Désactiver l'encodeur de labels (pour éviter les avertissements)
#     eval_metric='logloss',     # Métrique d'évaluation
#     random_state=42,
# )

# Meilleurs paramètres : {'colsample_bytree': 0.9090073829612466, 'gamma': 0.8607305832563434, 'learning_rate': 0.01208563915935721, 'max_depth': 10, 'min_child_weight': 9, 'n_estimators': 5413, 'subsample': 0.6110539052353652}
# Meilleure précision (Cross-validation): 0.765886287625418
# Accuracy XGBoost: 0.7430
# Classification Report XGBoost:
#               precision    recall  f1-score   support

#            0       0.72      0.73      0.72       296
#            1       0.76      0.76      0.76       346

#     accuracy                           0.74       642
#    macro avg       0.74      0.74      0.74       642
# weighted avg       0.74      0.74      0.74       642

In [None]:
# Train a single XGBoost classifier with hand-picked hyper-parameters and
# report its accuracy and classification report on the held-out test split.
# NOTE(review): `eta` is XGBoost's alias for `learning_rate`, so the two values
# passed below (0.1 and 0.01) conflict. Which one actually takes effect cannot
# be determined from this file — confirm and keep only one of them.
xgb_clf = XGBClassifier(
    n_estimators=5000,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    eta=0.01,
)

xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Accuracy XGBoost: {accuracy_xgb:.4f}')
print('Classification Report XGBoost:')
print(classification_report(y_test, y_pred_xgb))

# xgb_clf = XGBClassifier(
#     n_estimators=1000,
#     max_depth=6,
#     learning_rate=0.01,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     objective='binary:logistic',
#     eval_metric='logloss',
#     random_state=42,
#     reg_lambda=10,
#     reg_alpha=10,
#     min_child_weight=30,
# )

In [None]:
# Model used for the predictions made in the cells below.
chosen_model = xgb_clf

In [None]:
# Build one row per evaluation ID: the ID itself and the list of its tweets.
data_eval_list = []
for match, df in all_matchs_cleaned_eval.items():
    # The group key is named `period_id` rather than `id`: this loop runs at
    # notebook top level, so an `id` loop variable would shadow the builtin
    # `id()` for every cell executed afterwards.
    for period_id, group in df.groupby('ID'):
        tweets = group['Tweet'].tolist()
        data_eval_list.append({
            'ID': period_id,
            'Tweets': tweets
        })

data_eval = pd.DataFrame(data_eval_list)

In [None]:
# Build the additional feature table for the evaluation matches
# (create_new_features is called with is_test=True here).
new_features_eval_df = create_new_features(all_matchs_cleaned_eval, is_test=True)

In [None]:
# Scale every evaluation feature column except 'ID', working on a copy.
# transform() is used instead of fit_transform(): the evaluation data must be
# scaled with the statistics the scaler already holds, and refitting here
# would also overwrite the state of the shared `scaler` object.
# NOTE(review): assumes `scaler` was fitted earlier on the training features
# with these same columns — confirm against the training-side scaling cell.
final_eval_df_scaled = new_features_eval_df.copy()
columns_to_scale_eval = new_features_eval_df.columns.difference(['ID'])
final_eval_df_scaled[columns_to_scale_eval] = scaler.transform(new_features_eval_df[columns_to_scale_eval])

In [None]:
# Attach the scaled features to each evaluation ID; the left join keeps every
# row of data_eval, in data_eval's order.
final_data_eval = data_eval.merge(final_eval_df_scaled, how='left', on=['ID'])

In [None]:
# print(final_data_eval)

In [None]:
# Compute the embeddings for the evaluation data with the same
# create_embeddings helper, embeddings model and dimension used in this
# notebook; the prints only mark the start and end of the computation.
print("Création des embeddings pour les données d'évaluation...")
X_eval_embeddings = create_embeddings(data_eval, embeddings_model, embedding_dim)
print("Création des embeddings terminée.")

In [None]:
# Persist the evaluation embeddings to 'Embeddings_eval.npy'.
np.save('Embeddings_eval.npy', X_eval_embeddings)

In [None]:
# X_eval_embeddings = np.load('Embeddings_eval.npy')

In [None]:
# Pick the extra feature columns by name: every 'has_*' column listed in
# final_data, plus the tweet count. Then append them, column-wise, to the
# evaluation embeddings to form the final evaluation matrix.
additional_features_eval = [
    *(col for col in final_data.columns if col.startswith('has_')),
    'nb_tweets',
]
X_eval_additional = final_data_eval[additional_features_eval].to_numpy()
X_eval = np.hstack([X_eval_embeddings, X_eval_additional])

In [None]:
# Display the shape of the evaluation feature matrix as a sanity check.
print(X_eval.shape)

In [None]:
# Predict labels for the evaluation matrix with the selected model.
y_pred_eval = chosen_model.predict(X_eval)

In [None]:
# Assemble the submission: one predicted EventType per evaluation ID.
submission = pd.DataFrame({
    'ID': data_eval['ID'],
    'EventType': y_pred_eval
})

# Sort numerically on the two '_'-separated parts of the ID (a plain string
# sort would place '10' before '2'), then drop the temporary sort keys.
submission[['num1', 'num2']] = submission['ID'].str.split('_', expand=True).astype(int)
submission = submission.sort_values(by=['num1', 'num2']).drop(columns=['num1', 'num2'])

# A single filename feeds both the write and the confirmation message, so the
# message always names the file that was actually written (it previously
# announced 'submission.csv').
submission_filename = 'submission_XGB_scaled_ha.csv'
submission.to_csv(submission_filename, index=False)
print(f"Les prédictions ont été sauvegardées dans '{submission_filename}'")

In [None]:
# Export the class probabilities predicted by the selected model for X
# (paired with data['ID']) and for the evaluation set (paired with
# data_eval['ID']) into a single CSV.
y_pred_proba = chosen_model.predict_proba(X)
# Use the full evaluation matrix X_eval (embeddings + additional features),
# i.e. the same input given to chosen_model.predict for the label submission.
# X_eval_embeddings alone lacks the additional feature columns, so its column
# count does not match what the model receives everywhere else.
y_eval_proba = chosen_model.predict_proba(X_eval)

submission_proba_y_pred = pd.DataFrame({
    'ID': data['ID'],
    'xgb_proba_0': y_pred_proba[:, 0],
    'xgb_proba_1': y_pred_proba[:, 1]
})

submission_proba_y_eval = pd.DataFrame({
    'ID': data_eval['ID'],
    'xgb_proba_0': y_eval_proba[:, 0],
    'xgb_proba_1': y_eval_proba[:, 1]
})

# Stack both tables, then sort numerically on the two '_'-separated parts of
# the ID before dropping the temporary sort keys.
submission_proba = pd.concat([submission_proba_y_pred, submission_proba_y_eval], ignore_index=True)
submission_proba[['num1', 'num2']] = submission_proba['ID'].str.split('_', expand=True).astype(int)
submission_proba = submission_proba.sort_values(by=['num1', 'num2']).drop(columns=['num1', 'num2'])

submission_proba.to_csv('submission_proba_XGB_2.csv', index=False)