## Basic Approaches
Basic Scickit learn methods to solve the task.

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import os
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

Load and preprocess the data

In [3]:
# Define the directory containing the training files
training_dir = "challenge_data/train_tweets"

# Function to load all CSV files into a single DataFrame
def load_training_data(directory):
    all_data = []
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)
            all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

# Load the data
df_original = load_training_data(training_dir)

  0%|          | 0/16 [00:00<?, ?it/s]

In [4]:
def preprocess_tweets(tweets):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processed_tweets = []
    for tweet in tqdm(tweets):
        tweet = tweet.lower()
        tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
        tweet = re.sub(r"@\S+", "", tweet)
        tweet = re.sub(r"[^\w\s#]", "", tweet)
        words = tweet.split()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        processed_tweets.append(" ".join(words))
    return processed_tweets
    

Choose which data to consider for the training:
- Fraction of the data to compute and debug the model.
- Whole dataset to actually train the model.

In [None]:
# Sample a fraction of the original dataset (e.g., 1%) to see if the models actually work
fraction = 0.001
df = df_original.sample(frac=fraction, random_state=42)

df['CleanedTweet'] = preprocess_tweets(df['Tweet'])

  0%|          | 0/5056 [00:00<?, ?it/s]

: 

In [None]:
# Actually use the whole DataSet
df = df_original.copy()
df['CleanedTweet'] = preprocess_tweets(df['Tweet'])

  0%|          | 0/5056050 [00:00<?, ?it/s]

KeyboardInterrupt: 

Tokenize

In [None]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
# Vectorize the tweets
X = vectorizer.fit_transform(df["CleanedTweet"])
tfidf_df = pd.DataFrame(X.toarray(), columns=[f"tfidf_{i}" for i in range(1, 5001)])
df = pd.concat([df_original.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
print(df.head())
# Print the chosen words
# vocabulary = vectorizer.get_feature_names_out()
# for word in vocabulary:
#     print(word)

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, df["EventType"], test_size=0.2, random_state=42)

## Building the matrix for training

In [None]:
def matrixBuild(df, features):
    result = []
    for period_id in sorted(df['PeriodID'].unique()):
        period_tweets = df[df['PeriodID'] == period_id]
        embeddings = np.array(period_tweets['embedding'].tolist())
        if "length" in features:

    return result

## Train the basic models

Train a panoply of Models to have some benchmark models

In [26]:
model = LogisticRegression(penalty = 'l2', max_iter=1000)
model.fit(X_train, y_train)
y_pred_LogReg = model.predict(X_test)

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_LogReg))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_LogReg))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.47      0.52     21968
           1       0.64      0.74      0.69     28593

    accuracy                           0.62     50561
   macro avg       0.61      0.60      0.60     50561
weighted avg       0.61      0.62      0.61     50561

Logistic Regression Accuracy: 0.6196079982595281


In [27]:
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


nystroem = Nystroem(kernel='rbf', gamma=0.1, n_components=100)
svm_model = LinearSVC()

to_dense = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)
pipeline = Pipeline([
    ("vectorizer", to_dense),
    ("nystroem", nystroem),
    ("svc", svm_model)
])

pipeline.fit(X_train, y_train)
y_pred_svm = pipeline.predict(X_test)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.27      0.37     21968
           1       0.60      0.83      0.70     28593

    accuracy                           0.59     50561
   macro avg       0.58      0.55      0.53     50561
weighted avg       0.58      0.59      0.55     50561

SVM Accuracy: 0.5912264393504876


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.46      0.51      8818
           1       0.64      0.75      0.69     11407

    accuracy                           0.62     20225
   macro avg       0.61      0.60      0.60     20225
weighted avg       0.62      0.62      0.61     20225

Random Forest Accuracy: 0.62274412855377


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

print("K nearest neighbours Report:")
print(classification_report(y_test, y_pred_nb))
print("K nearest neighbours Accuracy:", accuracy_score(y_test, y_pred_nb))

K nearest neighbours Report:
              precision    recall  f1-score   support

           0       0.56      0.45      0.50      8818
           1       0.63      0.73      0.67     11407

    accuracy                           0.61     20225
   macro avg       0.59      0.59      0.59     20225
weighted avg       0.60      0.61      0.60     20225

K nearest neighbours Accuracy: 0.6050927070457355


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))

In [36]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

xgbr = XGBClassifier(n_estimators=1000, max_depth=1, learning_rate=0.15, random_state=0)
xgbr.fit(X_train, y_train)
y_pred_xgbr = xgbr.predict(X_test)

lgbmr = LGBMClassifier(n_estimators=1000, max_depth=1, learning_rate=0.15, random_state=0)
lgbmr.fit(X_train, y_train)
y_pred_lgbmr = lgbmr.predict(X_test)

print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_xgbr))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_xgbr))

print("Light Gradient BoostingClassification Report:")
print(classification_report(y_test, y_pred_lgbmr))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_lgbmr))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.34      0.41       429
           1       0.61      0.75      0.67       583

    accuracy                           0.58      1012
   macro avg       0.55      0.55      0.54      1012
weighted avg       0.56      0.58      0.56      1012

Gradient Boosting Accuracy: 0.5770750988142292
Light Gradient BoostingClassification Report:
              precision    recall  f1-score   support

           0       0.54      0.34      0.42       429
           1       0.62      0.78      0.69       583

    accuracy                           0.60      1012
   macro avg       0.58      0.56      0.56      1012
weighted avg       0.58      0.60      0.58      1012

Gradient Boosting Accuracy: 0.5968379446640316


## Implementing a CNN

In [44]:
import spacy

# Sample a fraction of the original dataset (e.g., 1%) to see if the models actually work
fraction = 0.001
dfs = df_original.sample(frac=fraction, random_state=42)

dfs['CleanedTweet'] = preprocess_tweets(df['Tweet'])

# Actually use the whole DataSet
df = df_original.copy()
df['CleanedTweet'] = preprocess_tweets(df['Tweet'])

nlp = spacy.load("en_core_web_md")

def tweet_to_embedding_spacy(tweet, model, maxlen=50):
    """
    Converts a tweet into a padded embedding matrix using spaCy.
    Each token is represented by its corresponding word vector.
    """
    embedding_dim = model.vocab.vectors_length
    embedding_matrix = np.zeros((maxlen, embedding_dim))
    
    # Process the tweet with spaCy
    doc = model(tweet)
    for i, token in enumerate(doc[:maxlen]):
        if token.has_vector:  # Check if the token has a vector representation
            embedding_matrix[i] = token.vector
    return embedding_matrix

max_length = 50  # Maximum number of tokens per tweet
X = np.array(df['CleanedTweet'].apply(lambda x: tweet_to_embedding_spacy(x, nlp, maxlen=max_length)).tolist())

y = dfs['EventType'].values

  0%|          | 0/5056 [00:00<?, ?it/s]

  0%|          | 0/5056050 [00:00<?, ?it/s]

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Parameters
vocab_size = 5000
embedding_dim = 50

# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [11]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test)
)

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Classification report: ", classification_report(y_test, y_pred))



TypeError: fit() got an unexpected keyword argument 'epochs'

New Idea : Tweet counting

In [20]:
import pandas as pd

# Function to count the number of tweets per timestamp and list associated event types
def count_tweets_per_timestamp(df, timestamp_column='Timestamp', event_column='EventType'):
    """
    Count the number of tweets per timestamp and associate event types with each timestamp.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing tweet data.
        timestamp_column (str): The column name representing timestamps.
        event_column (str): The column name representing event types.

    Returns:
        pd.DataFrame: A DataFrame with TweetCount and associated EventType(s) for each timestamp.
    """
    # Count the number of tweets per timestamp
    tweet_counts = df.groupby(timestamp_column).size().reset_index(name='TweetCount')
    
    # Gather unique event types for each timestamp
    event_types = df.groupby(timestamp_column)[event_column].apply(lambda x: ', '.join(map(str, x.unique()))).reset_index(name='EventTypes')
    
    # Merge tweet counts with event types
    result = pd.merge(tweet_counts, event_types, on=timestamp_column)
    result = result.sort_values(by=timestamp_column).reset_index(drop=True)
    
    return result



In [22]:
import pandas as pd

def load_csv(filepath):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The loaded DataFrame.
    """
    # Load the CSV file
    df = pd.read_csv(filepath)
    print(f"CSV file loaded successfully! Shape: {df.shape}")
    
    return df

df = load_csv("challenge_data/train_tweets/ArgentinaBelgium72.csv")
tc = count_tweets_per_timestamp(df)
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
    print(tc)


CSV file loaded successfully! Shape: (313803, 6)
          Timestamp  TweetCount EventTypes
0     1404575400000          34          0
1     1404575401000          28          0
2     1404575402000          37          0
3     1404575403000          34          0
4     1404575404000          47          0
5     1404575405000          40          0
6     1404575406000          38          0
7     1404575407000          56          0
8     1404575408000          46          0
9     1404575409000          45          0
10    1404575410000          44          0
11    1404575411000          45          0
12    1404575412000          33          0
13    1404575413000          35          0
14    1404575414000          34          0
15    1404575415000          40          0
16    1404575416000          37          0
17    1404575417000          52          0
18    1404575418000          42          0
19    1404575419000          43          0
20    1404575420000          42          0
21   

In [None]:
# Function to load all CSV files into different DataFrames
def load_training_data(directory):
    all_data = []
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)
            all_data.append(df)
            print(count_tweets_per_timestamp(df))
    return pd.concat(all_data, ignore_index=True)

# Load the data
df_original = load_training_data(training_dir)

  0%|          | 0/16 [00:00<?, ?it/s]

          Timestamp  TweetCount
0     1403538600000           7
1     1403538601000           5
2     1403538602000           1
3     1403538603000           2
4     1403538604000           8
...             ...         ...
7767  1403546396000          22
7768  1403546397000          16
7769  1403546398000          10
7770  1403546399000          18
7771  1403546400000          12

[7772 rows x 2 columns]
          Timestamp  TweetCount
0     1403797800000          19
1     1403797801000          11
2     1403797802000          10
3     1403797803000          20
4     1403797804000          16
...             ...         ...
7796  1403805596000          56
7797  1403805597000          50
7798  1403805598000          47
7799  1403805599000          44
7800  1403805600000          50

[7801 rows x 2 columns]
          Timestamp  TweetCount
0     1403553000000           9
1     1403553001000          12
2     1403553002000           8
3     1403553003000          11
4     1403553004000   

## Use the trained models on the eval_tweets

Train a panoply of Models to have some benchmark models

Load and preprocess the tweets to evaluate

In [None]:
# Define the directory containing the training files
training_dir = "challenge_data/eval_tweets"

# Load the data
df_eval = load_training_data(training_dir)

# Actually use the whole DataSet
df = df_eval.copy()
df['CleanedTweet'] = preprocess_tweets(df['Tweet'])

Evaluate the tweets

In [None]:
# Vectorize the tweets
X_eval = vectorizer.fit_transform(df["CleanedTweet"])

y_eval_LogReg = model.predict(X_eval)

y_eval_svm = pipeline.predict(X_eval)

y_eval_rf = rf_model.predict(X_eval)

y_eval_knn = knn_model.predict(X_eval)

y_eval_xgbr = xgbr.predict(X_eval)

y_eval_lgbmr = lgbmr.predict(X_eval)

Make the csv files to submit

In [None]:
def Result(df, y, name):
    ID = df["ID"]
    EventType = pd.Series(y, name="EventType")
    result_df = pd.DataFrame({
        "ID": ID,
        "EventType": EventType
    })
    result_df.to_csv("predictions/prediction_Baptiste/" + name + ".csv", index=False)

Result(df, y_eval_LogReg, "LogReg")
Result(df, y_eval_svm, "SVM")
Result(df, y_eval_rf, "RF")
Result(df, y_eval_knn, "knn")
Result(df, y_eval_xgbr, "xgbr")
Result(df, y_eval_lgbmr, "lgbmr")

