In [32]:
###
# Import Libraries
###

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

In [33]:
np.random.seed(0)

In [34]:
###
# Import Data and some preparation
###

test_path = './data/test.csv'
train_path = './data/train.csv'
val_path = './data/val.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
val_data = pd.read_csv(val_path)

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomasli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
### 
# text_preprocess
###

def preprocess_text(text):
    # Lowercase and remove special characters
    if text is None or text == "":
        return ""
    if not isinstance(text, str):
        return ""
    # text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    # Remove stop words
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text

In [36]:
train_data['Phrase'] = train_data['Phrase'].apply(preprocess_text)
test_data['Phrase'] = test_data['Phrase'].apply(preprocess_text)
val_data['Phrase'] = val_data['Phrase'].apply(preprocess_text)

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

#run MVNB on the labeled data
def run_MVNB( labeled_train, labeled_val, test_data):
    # Create a count vectorizer
    count_vectorizer = CountVectorizer(binary=True, min_df=6)
    X_train = count_vectorizer.fit_transform(labeled_train['Phrase'])
    y_train = labeled_train['Sentiment']
    X_val = count_vectorizer.transform(labeled_val['Phrase'])
    y_val = labeled_val['Sentiment']
    X_test = count_vectorizer.transform(test_data['Phrase'])

    # Train a Multinomial Naive Bayes classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print("Accuracy: ", accuracy_score(y_val, y_pred))
    print("F1 Score: ", f1_score(y_val, y_pred, average='weighted'))
    test_data['Sentiment'] = clf.predict(X_test)
    return test_data[['PhraseID', 'Sentiment']]

# Run the model and save the output
test_results = run_MVNB(labeled_train, labeled_val, test_data)
test_results.to_csv('test_sentiment_output.csv', index=False)



Accuracy:  0.8799879600963192
F1 Score:  0.8802267627301972


In [213]:
###
# vectorize the phrase (1 for bag of words, 2 for N-gram)
###

def phrase_vectorize(vectorize_method, train_data):
    if (vectorize_method == 1):
        vectorizer = CountVectorizer(binary=True, min_df=5)
    elif(vectorize_method == 2):
        vectorizer = CountVectorizer(binary=True, min_df=2, ngram_range=(2,2))
    else:
        raise ValueError("vectorize_method error")
    vectorizer.fit(train_data['Phrase'])
    return vectorizer

In [214]:
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from scipy.sparse import vstack
import numpy as np
from scipy.optimize import linear_sum_assignment

def unsup_learn(learn_method, K, X_train_unlabeled, X_train_labeled, y_train_labeled):
    # Apply PCA only on the feature data
    pca = PCA(n_components=200, random_state=42)
    X_train_unlabeled_reduced = pca.fit_transform(X_train_unlabeled)
    X_train_labeled_reduced = pca.transform(X_train_labeled)

    # Select and fit the clustering model on the reduced unlabeled data
    if learn_method == 1:
        learn_model = KMeans(n_clusters=K, random_state=42)
    elif learn_method == 2:
        learn_model = GaussianMixture(n_components=K, covariance_type='diag', random_state=42, max_iter=50, tol=1e-3)
    else:
        raise ValueError("Invalid 'learn_method' specified. Use 1 for KMeans or 2 for GaussianMixture.")
    
    # Generate pseudo-labels for the unlabeled data
    pseudo_labels = learn_model.fit_predict(X_train_unlabeled_reduced)

    # Map clusters to labels using the Hungarian algorithm for best alignment with original labels
    cost_matrix = np.zeros((K, K))
    for cluster_id in range(K):
        labeled_in_cluster = y_train_labeled[learn_model.predict(X_train_labeled_reduced) == cluster_id]
        for label in range(K):
            cost_matrix[cluster_id, label] = np.sum(labeled_in_cluster != label)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = {row: col for row, col in zip(row_ind, col_ind)}
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])


    # Combine the original labeled and pseudo-labeled data in the original feature dimension
    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_train_labeled, pseudo_labels_remapped))  # Labels are kept in original form

    return X_combined, y_combined


In [215]:
###
# supervised learning (1 for Multinomial Naive Bayes, 2 for Softmax Regression)
###

def sup_learn(learn_method, X_combined, y_combined, X_val, y_val):

    if (learn_method == 1):
        learn_model = MultinomialNB(alpha=1) 
    elif (learn_method == 2):
        learn_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    else:
        raise ValueError("supervise learn_method error")

    learn_model.fit(X_combined, y_combined)

    y_pred_val = learn_model.predict(X_val)

    val_accuracy = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average="macro")
    return learn_method, val_accuracy, f1


In [216]:
train_data['Phrase'] = train_data['Phrase'].apply(preprocess_text)
test_data['Phrase'] = test_data['Phrase'].apply(preprocess_text)
val_data['Phrase'] = val_data['Phrase'].apply(preprocess_text)

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
unlabeled_train= train_data[train_data['Sentiment'] == -100]
labeled_val = val_data[val_data['Sentiment'] != -100]
unlabeled_val = val_data[val_data['Sentiment'] == -100]

In [217]:
def final_training(preprocess_method, unsupervised_method, supervised_method):
    # Vectorize the data
    my_vectorizer = phrase_vectorize(preprocess_method, train_data)
    
    X_train_labeled = my_vectorizer.transform(labeled_train['Phrase']).toarray()  # Ensuring dense format
    X_val_labeled = my_vectorizer.transform(labeled_val['Phrase']).toarray()
    X_unlabeled = my_vectorizer.transform(unlabeled_train['Phrase']).toarray()
    
    y_train_labeled = labeled_train['Sentiment']
    y_val = labeled_val['Sentiment']
    K = len(np.unique(y_train_labeled))  # number of classes
    X_combined, y_combined = unsup_learn(unsupervised_method, K, X_unlabeled, X_train_labeled, y_train_labeled)
    final_model, val_accuracy, f1 = sup_learn(supervised_method, X_combined, y_combined, X_val_labeled, y_val)
    
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"F1 Score on Validation Set: {f1:.4f}")

In [218]:
for i in range (1, 3):
    for j in range (1, 3):
        for k in range (1, 3):
            prepocess = ["bag of words", "N-gram"]
            unsupervised = ["K-means", "GMM"]
            supervised = ["MNB", "Softmax"]
            model_string = f'\n{prepocess[i-1]}, {unsupervised[j-1]}, {supervised[k-1]}'
            print(model_string)
            final_training(i, j, k)
            print("\n")


bag of words, K-means, MNB
Validation Accuracy: 0.7587
F1 Score on Validation Set: 0.7511



bag of words, K-means, Softmax
Validation Accuracy: 0.7469
F1 Score on Validation Set: 0.7463



bag of words, GMM, MNB
Validation Accuracy: 0.8068
F1 Score on Validation Set: 0.8058



bag of words, GMM, Softmax
Validation Accuracy: 0.6726
F1 Score on Validation Set: 0.6732



N-gram, K-means, MNB
Validation Accuracy: 0.6938
F1 Score on Validation Set: 0.7004



N-gram, K-means, Softmax
Validation Accuracy: 0.6061
F1 Score on Validation Set: 0.6224



N-gram, GMM, MNB
Validation Accuracy: 0.7116
F1 Score on Validation Set: 0.7081



N-gram, GMM, Softmax
Validation Accuracy: 0.6494
F1 Score on Validation Set: 0.6566


