In [1]:
import csv
import re
import string

import numpy as np
import pandas as pd
import scipy.sparse

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
train_path = "train.txt"
test_path = "ttds_2025_cw2_test.txt"


def _load_labelled_tweets(path):
    """Read a tab-separated (id, sentiment, tweet) file with no header row.

    Lines pandas cannot parse are skipped, quote characters are treated as
    ordinary text, and only rows whose sentiment is one of the three known
    labels are kept.
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["id", "sentiment", "tweet"],
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip"
    )
    has_known_label = frame["sentiment"].isin(["positive", "negative", "neutral"])
    return frame[has_known_label]


# train/dev data
train_df = _load_labelled_tweets(train_path)

print("Loaded train/dev data:", train_df.shape)
print(train_df["sentiment"].value_counts(), "\n")

# labelled test data
test_df = _load_labelled_tweets(test_path)

print("Loaded test data:", test_df.shape)
print(test_df["sentiment"].value_counts(), "\n")

Loaded train/dev data: (18646, 3)
sentiment
neutral     8789
positive    5979
negative    3878
Name: count, dtype: int64 

Loaded test data: (4662, 3)
sentiment
neutral     2197
positive    1495
negative     970
Name: count, dtype: int64 



In [3]:
# Hold out 10% of the labelled data as a dev set. The split is stratified on
# the sentiment label and uses a fixed seed so it is repeatable.
train_split_df, dev_split_df = train_test_split(
    train_df,
    stratify=train_df["sentiment"],
    test_size=0.1,
    random_state=42,
)

print("Train split size:", train_split_df.shape[0])
print("Dev split size:  ", dev_split_df.shape[0], "\n")

Train split size: 16781
Dev split size:   1865 



In [4]:
# Matches any single ASCII punctuation character from string.punctuation.
punct_re = re.compile(f"[{re.escape(string.punctuation)}]")

def tokenize(text: str):
    """Split a tweet into lowercase, punctuation-free, whitespace-separated tokens.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) produce no tokens.

    Returns
    -------
    list of str
        The tokens in their original order; an empty list for empty or
        missing input.
    """
    # A missing tweet would otherwise be stringified into the literal token
    # "none" / "nan" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text)
    # Replace punctuation with a space (rather than deleting it) so that
    # e.g. "good,bad" becomes two tokens.
    text = punct_re.sub(" ", text)
    text = text.lower()
    return text.split()

# Tokenise every tweet of each split; each entry is the token list of one tweet.
train_tokens = list(map(tokenize, train_split_df["tweet"]))
dev_tokens = list(map(tokenize, dev_split_df["tweet"]))
test_tokens = list(map(tokenize, test_df["tweet"]))


In [5]:
def build_vocab(docs_tokens):
    """Map every distinct token in ``docs_tokens`` to an integer id.

    ``docs_tokens`` is an iterable of token sequences. Ids run from 0 upwards
    and follow the sorted order of the distinct tokens.
    """
    distinct_tokens = {token for tokens in docs_tokens for token in tokens}
    word_to_index = {}
    for position, token in enumerate(sorted(distinct_tokens)):
        word_to_index[token] = position
    return word_to_index

# Build the vocabulary from the training split only; tokens that occur only in
# the dev or test tweets are therefore absent from word2id.
word2id = build_vocab(train_tokens)
vocab_size = len(word2id)
print("Vocabulary size:", vocab_size)

def convert_to_bow_matrix(preprocessed_docs, word2id):
    """Build a sparse bag-of-words count matrix for tokenised documents.

    Parameters
    ----------
    preprocessed_docs : sequence of sequences of str
        One token sequence per document.
    word2id : dict
        Maps a token to its column index. The extra out-of-vocabulary column
        is placed at index ``len(word2id)``.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``int32`` matrix of shape ``(len(preprocessed_docs), len(word2id) + 1)``
        where entry ``[d, j]`` is the number of times token ``j`` occurs in
        document ``d``. All tokens missing from ``word2id`` are counted in the
        last column.
    """
    n_docs = len(preprocessed_docs)
    oov_index = len(word2id)

    # Record one (row, column) pair per token occurrence and let scipy sum the
    # duplicates when converting to CSR. This avoids one sparse-matrix
    # read-modify-write per token, which is slow at the Python level.
    row_ids = []
    col_ids = []
    for doc_id, doc in enumerate(preprocessed_docs):
        for word in doc:
            row_ids.append(doc_id)
            col_ids.append(word2id.get(word, oov_index))

    occurrences = np.ones(len(row_ids), dtype=np.int32)
    X_coo = scipy.sparse.coo_matrix(
        (
            occurrences,
            (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64)),
        ),
        shape=(n_docs, oov_index + 1),
        dtype=np.int32,
    )

    # Duplicate (row, column) entries are summed during this conversion.
    return X_coo.tocsr()

# Vectorise all three splits against the same training vocabulary so that the
# feature columns line up across train, dev and test.
X_train, X_dev, X_test = (
    convert_to_bow_matrix(split_tokens, word2id)
    for split_tokens in (train_tokens, dev_tokens, test_tokens)
)

print("Feature matrix shapes:")
print("  X_train:", X_train.shape)
print("  X_dev:  ", X_dev.shape)
print("  X_test: ", X_test.shape, "\n")

Vocabulary size: 37127
Feature matrix shapes:
  X_train: (16781, 37128)
  X_dev:   (1865, 37128)
  X_test:  (4662, 37128) 



In [6]:
# Integer class ids used as classifier targets, plus the reverse lookup.
sentiment2id = {"positive": 0, "negative": 1, "neutral": 2}
id2sentiment = {class_id: label for label, class_id in sentiment2id.items()}

# Encode the sentiment column of each split as an array of class ids.
y_train, y_dev, y_test = (
    split_df["sentiment"].map(sentiment2id).to_numpy()
    for split_df in (train_split_df, dev_split_df, test_df)
)


In [7]:
def compute_scores(y_true, y_pred):
    """Return per-class and macro-averaged precision, recall and F-score.

    The classes are reported in the order positive, negative, neutral, using
    the ids from ``sentiment2id``. Undefined ratios are reported as 0
    (``zero_division=0``). The macro values are the unweighted means of the
    three per-class values.

    Returns a dict keyed ``p-``/``r-``/``f-`` followed by ``pos``, ``neg``,
    ``neu`` or ``macro``.
    """
    class_ids = [sentiment2id[label] for label in ("positive", "negative", "neutral")]
    precision, recall, fscore, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, zero_division=0
    )

    # The arrays follow the order of ``class_ids``.
    p_pos, p_neg, p_neu = precision
    r_pos, r_neg, r_neu = recall
    f_pos, f_neg, f_neu = fscore

    return {
        "p-pos": p_pos, "r-pos": r_pos, "f-pos": f_pos,
        "p-neg": p_neg, "r-neg": r_neg, "f-neg": f_neg,
        "p-neu": p_neu, "r-neu": r_neu, "f-neu": f_neu,
        "p-macro": precision.mean(),
        "r-macro": recall.mean(),
        "f-macro": fscore.mean(),
    }

In [8]:
print("================================")
print(" BASELINE MODEL: SVC (BOW)     ")
print("================================")

# Baseline: linear-kernel SVC with C=1000 on the raw bag-of-words counts.
baseline_clf = SVC(kernel="linear", C=1000)
baseline_clf.fit(X_train, y_train)

# Predictions on all three splits are kept for the results table later on.
y_train_pred_baseline, y_dev_pred_baseline, y_test_pred_baseline = (
    baseline_clf.predict(features) for features in (X_train, X_dev, X_test)
)

baseline_dev_scores = compute_scores(y_dev, y_dev_pred_baseline)
baseline_test_scores = compute_scores(y_test, y_test_pred_baseline)

print("Baseline dev macro F1:", f"{baseline_dev_scores['f-macro']:.4f}")
print("Baseline test macro F1:", f"{baseline_test_scores['f-macro']:.4f}", "\n")

 BASELINE MODEL: SVC (BOW)     
Baseline dev macro F1: 0.5603
Baseline test macro F1: 0.5583 



In [9]:
def train_and_eval_model(name, clf, X_train, y_train, X_dev, y_dev, X_test, y_test,
                         use_dense=False):
    """Fit ``clf`` on the training split and score it on dev and test.

    When ``use_dense`` is true the three feature matrices are converted with
    ``.toarray()`` before fitting and predicting; otherwise they are passed
    through unchanged. Prints a header with ``name`` and the dev/test macro F1.

    Returns a dict with the model name, the fitted classifier, its predictions
    on train/dev/test, and the dev/test score dicts from ``compute_scores``.
    """
    feature_sets = (X_train, X_dev, X_test)
    if use_dense:
        feature_sets = tuple(matrix.toarray() for matrix in feature_sets)
    train_feats, dev_feats, test_feats = feature_sets

    print(f"----- {name} -----")
    clf.fit(train_feats, y_train)

    train_pred = clf.predict(train_feats)
    dev_pred = clf.predict(dev_feats)
    test_pred = clf.predict(test_feats)

    dev_scores = compute_scores(y_dev, dev_pred)
    test_scores = compute_scores(y_test, test_pred)

    print("Dev macro F1: ", f"{dev_scores['f-macro']:.4f}")
    print("Test macro F1:", f"{test_scores['f-macro']:.4f}", "\n")

    return {
        "name": name,
        "clf": clf,
        "y_train_pred": train_pred,
        "y_dev_pred": dev_pred,
        "y_test_pred": test_pred,
        "dev_scores": dev_scores,
        "test_scores": test_scores,
    }

improved_candidates = []

# 1) Logistic Regression
logreg = LogisticRegression(max_iter=2000, n_jobs=-1)

# 2) Random Forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

# 3) Multinomial Naive Bayes
mnb = MultinomialNB()

# 4) LinearSVC
# Seeded like the train/dev split and the random forest: LinearSVC's dual
# solver shuffles the data using random_state, so without a seed its scores
# can differ from run to run.
linsvm = LinearSVC(C=1.0, random_state=42)

# (display name, estimator, use_dense) for each candidate, in evaluation order.
# NOTE(review): only the random forest is fed dense arrays, as in the original
# cell; this looks memory-hungry for a matrix of this size - confirm whether
# sparse input would be acceptable here.
candidate_specs = [
    ("LogisticRegression (BOW)", logreg, False),
    ("RandomForestClassifier (BOW)", rf, True),
    ("MultinomialNB (BOW)", mnb, False),
    ("LinearSVC (BOW)", linsvm, False),
]

for cand_name, cand_clf, cand_use_dense in candidate_specs:
    improved_candidates.append(
        train_and_eval_model(
            cand_name,
            cand_clf,
            X_train, y_train,
            X_dev,   y_dev,
            X_test,  y_test,
            use_dense=cand_use_dense
        )
    )

----- LogisticRegression (BOW) -----
Dev macro F1:  0.6147
Test macro F1: 0.6094 

----- RandomForestClassifier (BOW) -----
Dev macro F1:  0.4843
Test macro F1: 0.4783 

----- MultinomialNB (BOW) -----
Dev macro F1:  0.5926
Test macro F1: 0.5986 

----- LinearSVC (BOW) -----
Dev macro F1:  0.5957
Test macro F1: 0.5810 



In [10]:

# Model selection uses the dev macro F1 only; the test score is merely reported.
# max() returns the first candidate with the highest score, so ties go to the
# candidate evaluated earliest.
best_candidate = max(
    improved_candidates,
    key=lambda candidate: candidate["dev_scores"]["f-macro"],
)
best_f_macro = best_candidate["dev_scores"]["f-macro"]

print("=============================================")
print(" BEST IMPROVED MODEL (among 4 candidates)")
print(" Model:       ", best_candidate["name"])
print(" Dev macro F1:", f"{best_candidate['dev_scores']['f-macro']:.4f}")
print(" Test macro F1:",
      f"{best_candidate['test_scores']['f-macro']:.4f}")
print("=============================================\n")

# Expose the winner under the "improved" names used by the results table.
improved_name = best_candidate["name"]
improved_clf = best_candidate["clf"]
y_train_pred_improved, y_dev_pred_improved, y_test_pred_improved = (
    best_candidate[key] for key in ("y_train_pred", "y_dev_pred", "y_test_pred")
)

 BEST IMPROVED MODEL (among 4 candidates)
 Model:        LogisticRegression (BOW)
 Dev macro F1: 0.6147
 Test macro F1: 0.6094



In [11]:
def add_result_row(result_list, system_name, split_name, y_true, y_pred):
    """Append one results row for a (system, split) pair to ``result_list``.

    The row holds the system and split names followed by every metric that
    ``compute_scores(y_true, y_pred)`` returns. The list is modified in place
    and nothing is returned.
    """
    metrics = compute_scores(y_true, y_pred)
    result_list.append({"system": system_name, "split": split_name, **metrics})

results = []

# One row per (system, split): the baseline rows first, then the improved
# system (the best candidate chosen on dev).
evaluation_runs = [
    ("baseline", "train", y_train, y_train_pred_baseline),
    ("baseline", "dev",   y_dev,   y_dev_pred_baseline),
    ("baseline", "test",  y_test,  y_test_pred_baseline),
    ("improved", "train", y_train, y_train_pred_improved),
    ("improved", "dev",   y_dev,   y_dev_pred_improved),
    ("improved", "test",  y_test,  y_test_pred_improved),
]
for system_label, split_label, gold_labels, predicted_labels in evaluation_runs:
    add_result_row(results, system_label, split_label, gold_labels, predicted_labels)

# Column order of the output file.
header = [
    "system", "split",
    "p-pos", "r-pos", "f-pos",
    "p-neg", "r-neg", "f-neg",
    "p-neu", "r-neu", "f-neu",
    "p-macro", "r-macro", "f-macro"
]

with open("classification.csv", "w", newline="", encoding="utf-8") as out_file:
    writer = csv.DictWriter(out_file, fieldnames=header)
    writer.writeheader()
    writer.writerows(results)

print("Saved classification.csv with rows:")
for row in results:
    print(f"{row['system']:8s} {row['split']:5s}  f-macro={row['f-macro']:.4f}")

Saved classification.csv with rows:
baseline train  f-macro=0.9993
baseline dev    f-macro=0.5603
baseline test   f-macro=0.5583
improved train  f-macro=0.9721
improved dev    f-macro=0.6147
improved test   f-macro=0.6094
