In [42]:
import csv
import re
import string

import numpy as np
import pandas as pd
import scipy.sparse

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer


In [43]:
# ---------- LOAD LABELED DATA ----------
# Tab-separated training file; the printed head shows id / sentiment / tweet columns.
train_path = "train.txt"
df = pd.read_csv(train_path, sep="\t")

# Discard every row whose label is not one of the three target classes.
df = df.loc[df["sentiment"].isin(["positive", "negative", "neutral"])]

print(df.head())
print(df["sentiment"].value_counts())

# ---------- SHUFFLE + SPLIT INTO TRAIN / DEV ----------
# 90/10 split, seeded for repeatability and stratified so that both parts
# keep the label proportions of the full set.
train_df, dev_df = train_test_split(
    df, test_size=0.1, random_state=42, stratify=df["sentiment"]
)

print("Train size:", train_df.shape[0])
print("Dev size:", dev_df.shape[0])
# Label distribution of each split: train first, then dev.
for split_df in (train_df, dev_df):
    print(split_df["sentiment"].value_counts())


                   id sentiment  \
0  638165350966669312  negative   
1  640169120600862720   neutral   
2  635946254254624769   neutral   
3  667121920333258752  negative   
4  628637519643586560   neutral   

                                               tweet  
0  Nicki's butt is just too big like c'mon that's...  
1  Haruna Lukmon may av just played himself out o...  
2  Zach Putnam will be unavailable for the White ...  
3  """@daithimckay what about the victims of IRA ...  
4  """LHP Matt Boyd, traded to @tigers in David P...  
sentiment
neutral     8789
positive    5979
negative    3878
Name: count, dtype: int64
Train size: 16781
Dev size: 1865
sentiment
neutral     7910
positive    5381
negative    3490
Name: count, dtype: int64
sentiment
neutral     879
positive    598
negative    388
Name: count, dtype: int64


In [44]:
# Basic tokeniser: replace ASCII punctuation with spaces, lowercase, split on whitespace.
punct_re = re.compile("[" + re.escape(string.punctuation) + "]")

def tokenize(text):
    """Return the list of lowercase whitespace-separated tokens in `text`.

    `text` is coerced with str() first, so non-string inputs are tokenised
    from their string representation. Every character in string.punctuation
    is replaced by a space before splitting.
    """
    return punct_re.sub(" ", str(text)).lower().split()

# Run the tokeniser over every tweet of the train and dev splits.
train_tokens = list(map(tokenize, train_df["tweet"]))
dev_tokens = list(map(tokenize, dev_df["tweet"]))

# Vocabulary builder; it is called on the training tokens only.
def build_vocab(docs_tokens):
    """Map every distinct token in `docs_tokens` to a unique integer ID.

    docs_tokens: iterable of token lists.
    Returns: dict word -> ID, with IDs 0..n-1 assigned in sorted word order.
    """
    ordered_words = sorted({token for doc in docs_tokens for token in doc})
    return dict(zip(ordered_words, range(len(ordered_words))))

word2id = build_vocab(train_tokens)
vocab_size = len(word2id)
print("Vocab size:", vocab_size)

# Label <-> ID maps; IDs follow the order positive, negative, neutral.
sentiment2id = {"positive": 0, "negative": 1, "neutral": 2}
id2sentiment = {label_id: name for name, label_id in sentiment2id.items()}

# Gold labels as integer arrays for each split.
y_train, y_dev = (
    split["sentiment"].map(sentiment2id).to_numpy()
    for split in (train_df, dev_df)
)


Vocab size: 37127


In [45]:
def convert_to_bow_matrix(preprocessed_docs, word2id):
    """
    Build a bag-of-words count matrix from tokenised documents.

    preprocessed_docs: list of token lists
    word2id: dict mapping word -> column index

    Returns: csr_matrix [n_docs x (vocab_size + 1)], dtype int32
             last column is OOV bucket (counts of tokens missing from word2id)
    """
    n_docs = len(preprocessed_docs)
    oov_index = len(word2id)
    mat_size = (n_docs, oov_index + 1)

    # Collect one (row, col) coordinate per token occurrence. Repeated
    # coordinates are summed when the COO matrix is converted to CSR, which
    # gives the per-document counts without per-element sparse-matrix writes.
    row_ids = []
    col_ids = []
    for doc_id, doc in enumerate(preprocessed_docs):
        for word in doc:
            row_ids.append(doc_id)
            col_ids.append(word2id.get(word, oov_index))  # OOV -> last column

    # Explicit integer dtype keeps the index arrays valid even when empty.
    rows = np.asarray(row_ids, dtype=np.int64)
    cols = np.asarray(col_ids, dtype=np.int64)
    ones = np.ones(rows.shape[0], dtype=np.int32)

    X_coo = scipy.sparse.coo_matrix((ones, (rows, cols)), shape=mat_size, dtype=np.int32)

    # convert to CSR for faster training
    return X_coo.tocsr()

# Count features for both splits, using the train-only vocabulary.
X_train_baseline, X_dev_baseline = (
    convert_to_bow_matrix(tokens, word2id)
    for tokens in (train_tokens, dev_tokens)
)

print("Baseline feature shapes:", X_train_baseline.shape, X_dev_baseline.shape)


Baseline feature shapes: (16781, 37128) (1865, 37128)


In [46]:
# ---------- BASELINE MODEL ----------
# Linear-kernel SVM with C=1000, fitted on the bag-of-words counts.
baseline_clf = SVC(kernel="linear", C=1000).fit(X_train_baseline, y_train)

# Predictions on the data it was fitted on and on the held-out dev split.
y_dev_pred_baseline = baseline_clf.predict(X_dev_baseline)
y_train_pred_baseline = baseline_clf.predict(X_train_baseline)


In [47]:
# Positions (within the dev split) where the baseline disagrees with gold.
mis_idx = np.flatnonzero(y_dev_pred_baseline != y_dev)

print("First 3 misclassified dev examples (baseline):\n")
for dev_pos in mis_idx[:3]:
    example = dev_df.iloc[dev_pos]  # positional lookup, matches y_dev ordering
    print("Tweet ID:", example["id"])
    print("Tweet:", example["tweet"])
    print("Gold:", id2sentiment[y_dev[dev_pos]])
    print("Pred:", id2sentiment[y_dev_pred_baseline[dev_pos]])
    print("-" * 80)


First 3 misclassified dev examples (baseline):

Tweet ID: 624660020832002048
Tweet: Can't wait for @Korn @suicidesilence in October. Hopefully I run into @DanKenny so I can bust his balls about Conor McGregor.
Gold: neutral
Pred: positive
--------------------------------------------------------------------------------
Tweet ID: 636914085506904065
Tweet: Why is Rousey worried bout money may pay and I'm pretty sure they not even in the same tax bracket
Gold: neutral
Pred: negative
--------------------------------------------------------------------------------
Tweet ID: 639528821956460548
Tweet: I'm selling a lawn ticket for Jason Aldean this coming Friday! DM me if you're interested!
Gold: neutral
Pred: positive
--------------------------------------------------------------------------------


In [48]:
# ---------- IMPROVED SYSTEM ----------
# Use TF-IDF, unigrams + bigrams, ignore very rare terms
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"\b\w+\b",   # keep single-character tokens too
    ngram_range=(1, 2),
    min_df=2                    # drop n-grams seen in fewer than 2 training tweets
)

# Fit the vocabulary / IDF weights on the training split only, then reuse
# the fitted vectorizer for dev so no dev statistics leak into the features.
X_train_improved = tfidf_vectorizer.fit_transform(train_df["tweet"])
X_dev_improved   = tfidf_vectorizer.transform(dev_df["tweet"])

# random_state pins the solver's internal data shuffling so that re-running
# the notebook reproduces the same model; 42 matches the seed used for the
# train/dev split. C can still be tuned.
improved_clf = LinearSVC(C=1.0, random_state=42)

improved_clf.fit(X_train_improved, y_train)

y_train_pred_improved = improved_clf.predict(X_train_improved)
y_dev_pred_improved   = improved_clf.predict(X_dev_improved)


In [49]:
import os

# ---------- LOAD LABELLED TEST SET + FEATURES FOR BASELINE & IMPROVED ----------
test_path = "ttds_2025_cw2_test.txt"   # tab-separated test file
test_df = pd.read_csv(test_path, sep="\t")

# Same label filter as for the training data: keep the three target classes.
test_df = test_df.loc[test_df["sentiment"].isin(["positive", "negative", "neutral"])]
print("Test size:", test_df.shape[0])
print(test_df["sentiment"].value_counts())

# Gold labels as integer IDs.
y_test = test_df["sentiment"].map(sentiment2id).to_numpy()

# ---------- baseline: BOW counts over the training vocabulary ----------
test_tokens = list(map(tokenize, test_df["tweet"]))
X_test_baseline = convert_to_bow_matrix(test_tokens, word2id)
y_test_pred_baseline = baseline_clf.predict(X_test_baseline)

# ---------- improved: TF-IDF from the already-fitted vectorizer ----------
X_test_improved = tfidf_vectorizer.transform(test_df["tweet"])
y_test_pred_improved = improved_clf.predict(X_test_improved)

print("Baseline test predictions shape:", y_test_pred_baseline.shape)
print("Improved test predictions shape:", y_test_pred_improved.shape)




Test size: 4662
sentiment
neutral     2197
positive    1495
negative     970
Name: count, dtype: int64
Baseline test predictions shape: (4662,)
Improved test predictions shape: (4662,)


In [50]:
def compute_scores(y_true, y_pred):
    """
    Per-class and macro-averaged precision / recall / F1.

    Returns a dict whose keys appear in this order:
    p-pos,r-pos,f-pos,p-neg,r-neg,f-neg,p-neu,r-neu,f-neu,p-macro,r-macro,f-macro

    Macro values are the unweighted mean over the three classes. A metric
    with a zero denominator is reported as 0 (zero_division=0).
    """
    # (short tag used in the key, label name) in the required output order
    class_tags = (("pos", "positive"), ("neg", "negative"), ("neu", "neutral"))
    label_ids = [sentiment2id[label_name] for _, label_name in class_tags]

    precision, recall, fscore, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=label_ids, zero_division=0
    )

    scores = {}
    for position, (tag, _) in enumerate(class_tags):
        scores[f"p-{tag}"] = precision[position]
        scores[f"r-{tag}"] = recall[position]
        scores[f"f-{tag}"] = fscore[position]

    scores["p-macro"] = precision.mean()
    scores["r-macro"] = recall.mean()
    scores["f-macro"] = fscore.mean()
    return scores


In [51]:
# Accumulates one score dict per (system, split); reset each time this cell runs.
results = []

def add_result_row(system_name, split_name, y_true, y_pred):
    """Score `y_pred` against `y_true` and append the labelled row to `results`.

    The row holds the "system" and "split" identifiers followed by every
    entry returned by compute_scores.
    """
    results.append(
        {"system": system_name, "split": split_name, **compute_scores(y_true, y_pred)}
    )

# Score both systems on train, dev and test. Rows are appended in this order,
# which is also the order they are written out: baseline first, then improved.
evaluation_runs = [
    ("baseline", "train", y_train, y_train_pred_baseline),
    ("baseline", "dev",   y_dev,   y_dev_pred_baseline),
    ("baseline", "test",  y_test,  y_test_pred_baseline),
    ("improved", "train", y_train, y_train_pred_improved),
    ("improved", "dev",   y_dev,   y_dev_pred_improved),
    ("improved", "test",  y_test,  y_test_pred_improved),
]
for run in evaluation_runs:
    add_result_row(*run)

# Column order of the output file: identifiers, then P/R/F per class, then macro.
header = [
    "system",
    "split",
    "p-pos", "r-pos", "f-pos",
    "p-neg", "r-neg", "f-neg",
    "p-neu", "r-neu", "f-neu",
    "p-macro", "r-macro", "f-macro",
]

# newline="" lets the csv module control line endings itself.
with open("classification.csv", "w", newline="", encoding="utf-8") as out_file:
    writer = csv.DictWriter(out_file, fieldnames=header)
    writer.writeheader()
    # rows keep their append order: baseline train/dev/test, then improved train/dev/test
    writer.writerows(results)

print("Saved classification.csv")


Saved classification.csv
