Import Libraries

In [10]:
# Import libraries.  Add any additional ones here.
# Generally, system libraries precede others.
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import numpy as np
import gensim.downloader as d
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
_NAME = "ChowJieSeth"
_STUDENT_NUM = 'E0725441'

Preprocessing

In [12]:
def preprocess(text):
    
    # simple tokenization and lowercasing
    tokens = text.lower().split()
    
    stopwords = ENGLISH_STOP_WORDS
    tokens = [token for token in tokens if token not in stopwords]

    return " ".join(tokens)

def tokenize(text):
    return [sentence.split() for sentence in text]

Feature Engineering

In [13]:
class GloveTransformer():
    def __init__(self, embedding_dim=100):
        self.embedding_dim = embedding_dim

        self.glove = d.load(f"glove-wiki-gigaword-{embedding_dim}")
    
    def transform(self, X):
        features = []
        for text in X:

            tokens = text.split()

            token_vectors = [self.glove[word] for word in tokens if word in self.glove]

            if token_vectors:
                avg_vector = np.mean(token_vectors, axis=0)
            else:
                avg_vector = np.zeros(self.embedding_dim)
            features.append(avg_vector)
        
        # convert dense to sparse vector
        return np.array(features)

In [14]:
def w2v(tokens, model, embedding_dim=50):
    token_vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(token_vectors, axis=0) if token_vectors else np.zeros(embedding_dim)


Train Model

In [15]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)

Predict Test Set

In [16]:
def predict(model, X_test):
    return model.predict(X_test)

Export Predictions

In [17]:
def generate_result(test, y_pred, filename):
    test['Verdict'] = pd.Series(y_pred)
    test.drop(columns=['Text'], inplace=True)
    test.to_csv(filename, index=False)

Main

In [18]:
# load dataset
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']
test = pd.read_csv('test.csv')
X_test = test['Text']

# preprocessing
X_train_tokens = tokenize(X_train)
X_test_tokens = tokenize(X_test)

# feature engineering / vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# glove_transformer = GloveTransformer(embedding_dim=100)
# X_train_glove = glove_transformer.transform(X_train)
# X_test_glove = glove_transformer.transform(X_test)

w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, sg=0, epochs=10)
X_train_cbow = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_train_tokens])
X_test_cbow = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_test_tokens])

# w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, sg=1, epochs=10)
# X_train_sg = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_train_tokens])
# X_test_sg = np.array([w2v(tokens, w2v_model, embedding_dim=100) for tokens in X_test_tokens])


# define model
model = LogisticRegression()

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_cbow])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_cbow])

train_model(model, X_train_combined, y_train)
# test your model
y_pred = predict(model, X_train_combined)

# Use f1-macro as the metric
score = f1_score(y_train, y_pred, average='macro')
print('score on validation = {}'.format(score))

# generate prediction on test data
y_pred = predict(model, X_test_combined)

output_filename = f"A2_{_NAME}_{_STUDENT_NUM}.csv"
generate_result(test, y_pred, output_filename)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on validation = 0.6993687108773874
