Import Libraries

In [None]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import numpy as np
import gensim.downloader as d
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

Preprocessing

In [2]:
def preprocess(text):
    
    # simple tokenization and lowercasing
    tokens = text.lower().split()
    
    stopwords = ENGLISH_STOP_WORDS
    tokens = [token for token in tokens if token not in stopwords]

    return " ".join(tokens)

def tokenize(text):
    return [sentence.split() for sentence in text]

Feature Engineering

In [3]:
class GloveTransformer():
    def __init__(self, embedding_dim=100):
        self.embedding_dim = embedding_dim

        self.glove = d.load(f"glove-wiki-gigaword-{embedding_dim}")
    
    def transform(self, X):
        features = []
        for text in X:

            tokens = text.split()

            token_vectors = [self.glove[word] for word in tokens if word in self.glove]

            if token_vectors:
                avg_vector = np.mean(token_vectors, axis=0)
            else:
                avg_vector = np.zeros(self.embedding_dim)
            features.append(avg_vector)
        
        # convert dense to sparse vector
        return np.array(features)

In [4]:
def w2v(tokens, model, embedding_dim=50):
    token_vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(token_vectors, axis=0) if token_vectors else np.zeros(embedding_dim)


Train Model

In [5]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)

Predict Test Set

In [6]:
def predict(model, X_test):
    return model.predict(X_test)

Export Predictions

In [7]:
def generate_result(test, y_pred, filename):
    test['Verdict'] = pd.Series(y_pred)
    test.drop(columns=['Text'], inplace=True)
    test.to_csv(filename, index=False)

Main

In [8]:
_NAME = "ChowJieSeth"
_STUDENT_NUM = 'E0725441'

In [None]:
# load dataset
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']
test = pd.read_csv('test.csv')
X_test = test['Text']

# preprocessing
X_train_tokens = tokenize(X_train)
X_test_tokens = tokenize(X_test)

# feature engineering / vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

glove_transformer = GloveTransformer(embedding_dim=100)
X_train_glove = glove_transformer.transform(X_train)
X_test_glove = glove_transformer.transform(X_test)

w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, sg=1, epochs=10)
X_train_sg = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_train_tokens])
X_test_sg = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_test_tokens])

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_glove, X_train_sg])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_glove, X_test_sg])

In [None]:
# MLP hyperparameters using gridsearchcv
param_grid = {
    'hidden_layer_sizes': [(64,), (128,),],
    'activation': ['relu'],
    'solver': ['sgd'],
    'alpha': [0.01, 0.1],
    'batch_size': [64],
    'learning_rate': ['adaptive'],
    'learning_rate_init': [0.01, 0.02],
    'max_iter': [125],
    'tol' : [0.01],
    'early_stopping': [True],
    'n_iter_no_change' : [5]
}

# define model
mlp = MLPClassifier(verbose=True)

grid_search = GridSearchCV(estimator=mlp,
                           param_grid=param_grid,
                           scoring='f1_macro',
                           cv=3,
                           n_jobs=1,
                           verbose=3)

grid_search.fit(X_train_combined, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best F1-Macro Score:", grid_search.best_score_)

# test model using f1-score
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train_combined)
print("Validation F1-Macro Score:", f1_score(y_train, y_pred, average='macro'))

# generate prediction on test data
y_pred = predict(best_model, X_test_combined)

output_filename = f"A2_{_NAME}_{_STUDENT_NUM}.csv"
generate_result(test, y_pred, output_filename)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Iteration 1, loss = 0.73714820
Validation score: 0.702000
Iteration 2, loss = 0.64491479
Validation score: 0.729333
Iteration 3, loss = 0.62073721
Validation score: 0.735333
Iteration 4, loss = 0.60534010
Validation score: 0.738000
Iteration 5, loss = 0.59709215
Validation score: 0.748000
Iteration 6, loss = 0.58505783
Validation score: 0.746000
Iteration 7, loss = 0.57842226
Validation score: 0.746667
Iteration 8, loss = 0.57190801
Validation score: 0.747333
Iteration 9, loss = 0.56454304
Validation score: 0.756000
Iteration 10, loss = 0.55781603
Validation score: 0.749333
Iteration 11, loss = 0.55292653
Validation score: 0.756000
Validation score did not improve more than tol=0.010000 for 5 consecutive epochs. Setting learning rate to 0.002000
Iteration 12, loss = 0.53837962
Validation score: 0.758000
Iteration 13, loss = 0.53579214
Validation score: 0.758000
Iteration 14, loss = 0.53368052
Validation score: 0.760667
Iterati