In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import shuffle as sk_shuffle

In [2]:
# Read the tab-separated training file from the notebook's working directory.
base_dir = os.getcwd()
train_path = os.path.join(base_dir, "train.tsv")
df = pd.read_csv(train_path, sep="\t")

# The 'Phrase' column is the text input; the 'Sentiment' column is the target.
X_text, y = df["Phrase"], df["Sentiment"]

In [None]:
def run_experiment(X_text, y, test_size, vectorizer, loss, lr, batch_mode,
                   n_epochs=5, batch_size=1024):
    """Train an SGDClassifier on vectorized text and score it on a held-out split.

    Parameters
    ----------
    X_text : collection of documents
        Passed to ``train_test_split`` and then to ``vectorizer``.
    y : array-like
        Labels aligned with ``X_text``. The full label set (``np.unique(y)``)
        is what ``partial_fit`` is told to expect in "mini-batch" mode.
    test_size : float or int
        Forwarded to ``train_test_split``.
    vectorizer : object with ``fit`` / ``transform``
        Fitted in place on the training split only; the caller's object is
        therefore refitted on every call.
    loss : str
        Forwarded to ``SGDClassifier(loss=...)``.
    lr : float
        Constant learning rate (``eta0``).
    batch_mode : {"batch", "mini-batch", "shuffle"}
        "batch"      - a single ``fit`` on the training rows in split order.
        "mini-batch" - ``n_epochs`` passes; rows are reshuffled each epoch and
                       fed to ``partial_fit`` in chunks of ``batch_size``.
        "shuffle"    - rows are permuted once, then a single ``fit``.
    n_epochs : int, default 5
        Used as ``max_iter`` for ``fit`` and as the number of passes in
        "mini-batch" mode.
    batch_size : int, default 1024
        Number of rows per ``partial_fit`` call in "mini-batch" mode.

    Returns
    -------
    (accuracy, macro_f1) : tuple of float
        Scores of the trained classifier on the test split.

    Raises
    ------
    ValueError
        If ``batch_mode`` is not one of the supported modes.
    """
    # Fail fast: reject a bad mode before paying for the split and vectorization.
    if batch_mode not in ("batch", "mini-batch", "shuffle"):
        raise ValueError("Unknown batch_mode")

    X_train_text, X_test_text, y_train, y_test = train_test_split(
        X_text, y, test_size=test_size, random_state=42)

    # The vocabulary is learned from the training split only and then reused
    # unchanged for the test split.
    vec = vectorizer.fit(X_train_text)
    X_train = vec.transform(X_train_text)
    X_test = vec.transform(X_test_text)

    # SGDClassifier's own `shuffle` option is left at its default in every mode.
    clf = SGDClassifier(loss=loss, learning_rate="constant", eta0=lr,
                        max_iter=n_epochs, tol=None, random_state=42)

    if batch_mode == "batch":
        clf.fit(X_train, y_train)
    elif batch_mode == "mini-batch":
        # Loop-invariant: the label set does not change between batches.
        classes = np.unique(y)
        # Plain array so the slices below are positional regardless of any
        # index the labels carried.
        y_train = np.asarray(y_train)
        for epoch in range(n_epochs):
            # A different seed per epoch gives a different row order each pass.
            X_train, y_train = sk_shuffle(X_train, y_train, random_state=epoch)
            for start in range(0, X_train.shape[0], batch_size):
                stop = start + batch_size
                clf.partial_fit(X_train[start:stop], y_train[start:stop],
                                classes=classes)
    else:  # "shuffle" (mode was validated above)
        X_train, y_train = sk_shuffle(X_train, y_train, random_state=42)
        clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    return acc, f1


# Experiment grid: every combination of feature extractor, test split size,
# loss, learning rate and batching mode is trained and scored once.
vectorizers = {
    "BoW": CountVectorizer(max_features=20000, ngram_range=(1, 1), min_df=5, max_df=0.75),
    "N_grams": CountVectorizer(max_features=20000, ngram_range=(1, 2), min_df=5, max_df=0.75),
}
test_sizes = [0.2, 0.15, 0.1]
losses = ["hinge", "log_loss"]
lrs = [1e-3, 1e-2, 1e-1]
batch_modes = ["batch", "mini-batch", "shuffle"]
results = []

for name, vectorizer in vectorizers.items():
    for test_size in test_sizes:
        for loss in losses:
            for lr in lrs:
                for batch_mode in batch_modes:
                    acc, f1 = run_experiment(
                        X_text,
                        y,
                        test_size,
                        vectorizer,
                        loss,
                        lr,
                        batch_mode,
                    )
                    # One record per configuration; the frame is built once
                    # after the loops.
                    results.append({
                        'Feature': name,
                        "test_size": test_size,
                        "Loss": loss,
                        "LR": lr,
                        "BatchMode": batch_mode,
                        "Accuracy": acc,
                        "F1": f1
                    })
                    print(
                        f"Feature={name} | TestSize={test_size} | Loss={loss} | LR={lr} | BatchMode={batch_mode} "
                        f" => "
                        f"Accuracy={acc:.4f}, F1={f1:.4f}"
                    )

df_results = pd.DataFrame(results)
# `save_path` was never defined in this notebook, so the original line raised
# NameError only after the whole grid had run. Write next to the input data,
# using the `base_dir` set in the data-loading cell.
results_path = os.path.join(base_dir, "experiment_machine_learning_results.xlsx")
df_results.to_excel(results_path, index=False)