In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from Dataset import get_data
import gensim.downloader as api
from xgboost import XGBClassifier

In [None]:
# Fetch the NLTK resources the preprocessing functions rely on: the Punkt
# tokenizer models (word_tokenize), the English stop-word list and WordNet
# (WordNetLemmatizer).
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' package; without it word_tokenize raises LookupError on
# a fresh environment. On older releases this download simply reports that
# the package is unknown and carries on.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [3]:
# Pretrained word vectors fetched through gensim's downloader.
embeddings_model_name = "glove-twitter-200"
embeddings_model = api.load(embeddings_model_name)

# Length of the zero vector used when none of a period's tokens are in the
# model; presumably matches the "200" in the model name -- TODO confirm.
embedding_dim = 200

In [4]:
# Shared preprocessing resources, built once and reused by
# tokenize_and_lemmatize for every tweet.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [5]:
def clean_tweet(tweet):
    """Normalize a raw tweet string before tokenization.

    Steps, in order: lowercase the text, remove URLs (``http(s)://...`` and
    ``www....``), remove ``@mentions``, remove the standalone retweet marker
    ``rt``, and strip every character in ``string.punctuation``.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Whitespace is left as-is, so removed pieces may
        leave runs of spaces behind.
    """
    tweet = tweet.lower()
    # URLs go first, while the punctuation that delimits them is still there.
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    # Only drop "rt" as a whole word. An unanchored r'rt' also cuts the
    # letters out of ordinary words ("party" -> "pay", "start" -> "sta").
    tweet = re.sub(r'\brt\b', '', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    return tweet

def tokenize_and_lemmatize(tweet):
    """Tokenize a cleaned tweet, drop stop words and lemmatize what remains.

    Args:
        tweet: Tweet text, normally the output of ``clean_tweet``.

    Returns:
        List of lemmatized tokens in their original order. Tokens found in
        the module-level ``stop_words`` set are skipped.
    """
    lemmas = []
    for word in word_tokenize(tweet):
        # The stop-word test runs on the raw token, before lemmatization.
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def preprocess_tweets(tweets):
    """Run every tweet through ``clean_tweet`` then ``tokenize_and_lemmatize``.

    Args:
        tweets: Iterable of raw tweet strings.

    Returns:
        List with one token list per input tweet, in input order.
    """
    return [tokenize_and_lemmatize(clean_tweet(raw_tweet)) for raw_tweet in tweets]

def get_embedding(tokens, embeddings_model, embedding_dim):
    """Average the vectors of all tokens known to the embedding model.

    Args:
        tokens: Iterable of token strings.
        embeddings_model: Object supporting ``token in model`` and
            ``model[token]`` lookups that return a vector.
        embedding_dim: Length of the zero vector returned when no token is
            found in the model.

    Returns:
        The element-wise mean of the matched vectors, or
        ``np.zeros(embedding_dim)`` if nothing matched.
    """
    known_vectors = []
    for word in tokens:
        if word in embeddings_model:
            known_vectors.append(embeddings_model[word])

    # Out-of-vocabulary-only (or empty) input falls back to a zero vector.
    if not known_vectors:
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)

def create_embeddings(data, embeddings_model, embedding_dim):
    """Build one averaged embedding per row of ``data``.

    For each entry of ``data['Tweets']`` (a collection of tweets), all tweets
    are preprocessed, their tokens pooled into a single list, and that list
    is reduced to one vector with ``get_embedding``.

    Args:
        data: Object indexable by ``'Tweets'`` whose entries are iterables of
            raw tweet strings.
        embeddings_model: Embedding lookup passed through to ``get_embedding``.
        embedding_dim: Fallback vector length passed through to
            ``get_embedding``.

    Returns:
        ``np.array`` built from the per-row vectors, in row order.
    """
    row_vectors = []
    for period_tweets in data['Tweets']:
        # Pool the tokens of every tweet in this row into one flat list.
        pooled_tokens = []
        for tweet_tokens in preprocess_tweets(period_tweets):
            pooled_tokens.extend(tweet_tokens)
        row_vectors.append(get_embedding(pooled_tokens, embeddings_model, embedding_dim))
    return np.array(row_vectors)

In [6]:
# get_data() returns four objects; only the two "cleaned" ones are iterated
# (via .items()) in the cells below.
(
    all_matchs,
    all_matchs_cleaned,
    all_matchs_eval,
    all_matchs_cleaned_eval,
) = get_data()

In [7]:
# Collapse the per-match frames into one record per period ID: its label and
# the list of all tweets posted in that period.
data_list = []
for df in all_matchs_cleaned.values():
    # `period_id` rather than `id`: at notebook top level the loop variable
    # lives on in the kernel and would shadow the builtin id().
    for period_id, group in df.groupby('ID'):
        tweets = group['Tweet'].tolist()
        # Take the label from the first row of the group.
        event_type = group['EventType'].iloc[0]
        data_list.append({
            'ID': period_id,
            'EventType': event_type,
            'Tweets': tweets
        })

data = pd.DataFrame(data_list)
# IDs are "<match>_<period>"; sort on the two integer parts so the order is
# numeric rather than lexicographic, then drop the helper columns again.
data[['MatchID', 'PeriodID']] = data['ID'].str.split('_', expand=True)
data['MatchID'] = data['MatchID'].astype(int)
data['PeriodID'] = data['PeriodID'].astype(int)
data = data.sort_values(by=['MatchID', 'PeriodID']).reset_index(drop=True)
data = data.drop(columns=['MatchID', 'PeriodID'])

In [8]:
# Labels and features for the classifier: one label and one averaged
# embedding per row of `data`.
y = data['EventType'].values
X = create_embeddings(
    data=data,
    embeddings_model=embeddings_model,
    embedding_dim=embedding_dim,
)

In [9]:
# Persist features and labels so the embedding step need not be repeated.
np.save(file='Embeddings.npy', arr=X)
np.save(file='Labels.npy', arr=y)

In [10]:
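# Optional shortcut: uncomment to reload the arrays cached by the np.save
# cell above instead of recomputing the embeddings.
# X = np.load('Embeddings.npy')
# y = np.load('Labels.npy')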

In [11]:
# Hold out 30% of the rows for testing; the split is stratified on the labels
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

In [None]:
# Gradient-boosted trees for the binary event / no-event target.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
    # Row and column subsampling per tree.
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Accuracy XGBoost: {accuracy_xgb:.4f}')
print('Classification Report XGBoost:', classification_report(y_test, y_pred_xgb), sep='\n')

In [13]:
# Collapse the evaluation frames into one record per period ID holding the
# list of all tweets posted in that period (no labels here).
data_eval_list = []
for df in all_matchs_cleaned_eval.values():
    # `period_id` rather than `id`: at notebook top level the loop variable
    # lives on in the kernel and would shadow the builtin id().
    for period_id, group in df.groupby('ID'):
        tweets = group['Tweet'].tolist()
        data_eval_list.append({
            'ID': period_id,
            'Tweets': tweets
        })

data_eval = pd.DataFrame(data_eval_list)

In [14]:
# Embed the evaluation periods with the same pipeline used for training.
X_eval_embeddings = create_embeddings(
    data=data_eval,
    embeddings_model=embeddings_model,
    embedding_dim=embedding_dim,
)

In [15]:
# Persist the evaluation features so they can be reloaded without re-embedding.
np.save(file='Embeddings_eval.npy', arr=X_eval_embeddings)

In [16]:
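# Optional shortcut: uncomment to reload the cached evaluation embeddings
# instead of recomputing them.
# X_eval_embeddings = np.load('Embeddings_eval.npy')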

In [17]:
# Predict a label for every evaluation row with the fitted classifier; the
# result is paired with data_eval['ID'] when the submission is built.
y_pred_eval = xgb_clf.predict(X_eval_embeddings)

In [None]:
# Pair each evaluation period ID with its predicted label.
submission = pd.DataFrame({
    'ID': data_eval['ID'],
    'EventType': y_pred_eval
})

# IDs are two integers joined by '_'; sort on the numeric parts so the rows
# are in numeric rather than lexicographic order, then drop the helper columns.
submission[['num1', 'num2']] = submission['ID'].str.split('_', expand=True).astype(int)
submission = submission.sort_values(by=['num1', 'num2']).drop(columns=['num1', 'num2'])

# Write to the file name the message reports. The original wrote to a file
# called 'Result' (no extension) while announcing 'Result.csv'.
submission.to_csv('Result.csv', index=False)
print("Prediction has been saved in 'Result.csv'.")