In [51]:
###
# Import Libraries
###

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [52]:
###
# Import Data and some preparation
###

# Locations of the three CSV splits, relative to the notebook's working directory.
train_path = './data/train.csv'
test_path = './data/test.csv'
val_path = './data/val.csv'

# Read the splits in train / test / val order, one DataFrame per file.
train_data, test_data, val_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, val_path)
)

# Make sure the NLTK resources are available locally. WordNet backs the
# lemmatizer created below; the 'stopwords' corpus is downloaded as well,
# although the preprocessing cell filters with sklearn's ENGLISH_STOP_WORDS.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haomiao_xu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haomiao_xu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
### 
# text_preprocess
###

def preprocess_text(text):
    """Normalise one phrase into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace, strip digit runs, lemmatize each token with the
    module-level ``lemmatizer``, then drop tokens found in
    ``ENGLISH_STOP_WORDS``.

    Args:
        text: The raw phrase. ``None``, the empty string and any non-string
            value (for example a float NaN from a missing CSV cell) are
            accepted.

    Returns:
        The cleaned phrase, or ``""`` when ``text`` is ``None``, empty or not
        a ``str``.
    """
    # Anything that is not a non-empty string collapses to the empty string.
    if text is None or text == "" or not isinstance(text, str):
        return ""

    # Lowercase, then remove punctuation/special characters, then digits.
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Lemmatize token by token and re-join into one string.
    lemmatized = ' '.join(map(lemmatizer.lemmatize, without_digits.split()))

    # Stop words are filtered after lemmatization, so the lemma is what gets compared.
    return " ".join(
        token for token in lemmatized.split() if token not in ENGLISH_STOP_WORDS
    )

In [54]:
###
# vectorize the phrases (1 for binary bag of words, 2 for binary bigrams)
###

def phrase_vectorize(vectorize_method, train_data):
    """Build a binary ``CountVectorizer`` and fit it on the training phrases.

    Args:
        vectorize_method: ``1`` selects single-word features with
            ``min_df=6``; ``2`` selects bigram-only features
            (``ngram_range=(2, 2)``) with ``min_df=4``.
        train_data: Object whose ``'Phrase'`` entry holds the documents the
            vocabulary is learned from.

    Returns:
        The fitted vectorizer.

    Raises:
        ValueError: If ``vectorize_method`` is neither ``1`` nor ``2``.
    """
    # Pick the per-method settings first; both variants record presence/absence
    # (binary=True) rather than raw counts.
    if vectorize_method == 1:
        settings = {"min_df": 6}
    elif vectorize_method == 2:
        settings = {"min_df": 4, "ngram_range": (2, 2)}
    else:
        raise ValueError("vectorize_method error")

    vectorizer = CountVectorizer(binary=True, **settings)
    vectorizer.fit(train_data['Phrase'])
    return vectorizer

In [55]:
###
# unsupervised learning (1 for K-means, 2 for GMM)
###

def _to_dense(features):
    """Return ``features`` as a dense array when it is a sparse matrix, else unchanged."""
    return features.toarray() if hasattr(features, "toarray") else features


def unsup_learn(learn_method, vectorizer, K, unlabeled_train, X_train_labeled, y_train_labeled):
    """Pseudo-label the unlabeled phrases by clustering and merge them with the labeled set.

    The unlabeled phrases are vectorized and clustered into ``K`` groups.
    Each cluster is then matched to one class label by solving an assignment
    problem whose cost is the number of labeled samples in the cluster that
    do NOT carry that label. The remapped cluster ids become pseudo-labels.

    Args:
        learn_method: ``1`` for K-means, ``2`` for a Gaussian mixture model.
        vectorizer: Fitted vectorizer exposing ``transform``.
        K: Number of clusters. The cost matrix compares against
            ``range(K)``, so class labels are assumed to be the integers
            ``0 .. K-1``.
        unlabeled_train: Object whose ``'Phrase'`` entry holds the unlabeled
            documents.
        X_train_labeled: Feature matrix of the labeled samples.
        y_train_labeled: Labels of the labeled samples (array-like).

    Returns:
        ``(X_combined, y_combined)``: labeled rows followed by unlabeled rows,
        and the true labels followed by the pseudo-labels.

    Raises:
        ValueError: If ``learn_method`` is neither ``1`` nor ``2``.
    """
    X_train_unlabeled = vectorizer.transform(unlabeled_train['Phrase'])

    if learn_method == 1:
        learn_model = KMeans(n_clusters=K, random_state=42)
        X_cluster_unlabeled = X_train_unlabeled
        X_cluster_labeled = X_train_labeled
    elif learn_method == 2:
        # GaussianMixture names this parameter `n_components` (passing
        # `n_clusters` raises TypeError) and it does not accept sparse input,
        # so the vectorizer output is densified for this branch only.
        # Densifying can be memory-hungry for large vocabularies.
        learn_model = GaussianMixture(n_components=K, random_state=42)
        X_cluster_unlabeled = _to_dense(X_train_unlabeled)
        X_cluster_labeled = _to_dense(X_train_labeled)
    else:
        raise ValueError("unsupervise learn_method error")

    pseudo_labels = learn_model.fit_predict(X_cluster_unlabeled)

    # Cluster assignment of the labeled data does not depend on the loop
    # variable, so compute it once instead of once per cluster.
    y_labeled = np.asarray(y_train_labeled)
    labeled_clusters = learn_model.predict(X_cluster_labeled)

    cost_matrix = np.zeros((K, K))
    for cluster_id in range(K):
        labeled_in_cluster = y_labeled[labeled_clusters == cluster_id]
        for label in range(K):
            # Cost of mapping this cluster to `label` = number of mismatches.
            cost_matrix[cluster_id, label] = np.sum(labeled_in_cluster != label)

    # Minimum-cost one-to-one matching between clusters and labels.
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = dict(zip(row_ind, col_ind))
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

    # The returned matrix keeps the original (sparse) representation.
    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_labeled, pseudo_labels_remapped))
    return X_combined, y_combined


In [56]:
###
# supervised learning (1 for Multinomial Naive Bayes, 2 for Softmax Regression)
###

def _build_supervised_model(learn_method):
    """Return an unfitted classifier for the given method id (1 = NB, 2 = softmax)."""
    if learn_method == 1:
        return MultinomialNB(alpha=1)
    if learn_method == 2:
        return LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    raise ValueError("supervise learn_method error")


def sup_learn(learn_method, X_combined, y_combined, X_val, y_val):
    """Train a classifier on the combined data and score it on the validation set.

    Args:
        learn_method: ``1`` for Multinomial Naive Bayes, ``2`` for
            multinomial (softmax) logistic regression.
        X_combined: Training feature matrix.
        y_combined: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        ``(learn_method, val_accuracy, f1)``: the method id that was passed
        in, the validation accuracy, and the macro-averaged F1 score.

    Raises:
        ValueError: If ``learn_method`` is neither ``1`` nor ``2``.
    """
    # NOTE(review): the first returned element is the `learn_method` selector,
    # not the fitted estimator, yet the notebook binds it to `final_model`.
    # Confirm which one is intended before relying on it downstream.
    classifier = _build_supervised_model(learn_method)
    classifier.fit(X_combined, y_combined)

    val_predictions = classifier.predict(X_val)

    return (
        learn_method,
        accuracy_score(y_val, val_predictions),
        f1_score(y_val, val_predictions, average="macro"),
    )


In [57]:
# Clean the 'Phrase' column of every split with preprocess_text.
# The column is overwritten in place, so the raw text is no longer available
# after this cell has run.
val_data['Phrase'] = val_data['Phrase'].apply(preprocess_text)
test_data['Phrase'] = test_data['Phrase'].apply(preprocess_text)
train_data['Phrase'] = train_data['Phrase'].apply(preprocess_text)

# Rows whose 'Sentiment' equals -100 are treated as unlabeled; everything
# else is treated as labeled.
unlabeled_train = train_data[train_data['Sentiment'].eq(-100)]
labeled_train = train_data[train_data['Sentiment'].ne(-100)]
unlabeled_val = val_data[val_data['Sentiment'].eq(-100)]
labeled_val = val_data[val_data['Sentiment'].ne(-100)]

In [58]:
# Learn the vocabulary (method 1) from all training phrases.
my_vectorizer = phrase_vectorize(1, train_data)

# Feature matrices for the labeled training and validation rows.
X_train_labeled = my_vectorizer.transform(labeled_train['Phrase'])
X_val_labeled = my_vectorizer.transform(labeled_val['Phrase'])

# Targets, and the number of distinct classes among the labeled training rows.
y_train_labeled = labeled_train['Sentiment']
y_val = labeled_val['Sentiment']
K = len(np.unique(y_train_labeled))  # number of classes

# Pseudo-label the unlabeled training rows with clustering method 1 and merge
# them with the labeled rows.
X_combined, y_combined = unsup_learn(
    learn_method=1,
    vectorizer=my_vectorizer,
    K=K,
    unlabeled_train=unlabeled_train,
    X_train_labeled=X_train_labeled,
    y_train_labeled=y_train_labeled,
)

# Train supervised method 1 on the merged data and score it on validation.
# NOTE: sup_learn returns the method id as its first element, so `final_model`
# holds that id here rather than a fitted estimator.
final_model, val_accuracy, f1 = sup_learn(
    learn_method=1,
    X_combined=X_combined,
    y_combined=y_combined,
    X_val=X_val_labeled,
    y_val=y_val,
)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"F1 Score on Validation Set: {f1:.4f}")



Validation Accuracy: 0.8077
F1 Score on Validation Set: 0.8056


In [None]:
# Size of the fitted vocabulary: one feature name per column of the matrices.
vocab_size = my_vectorizer.get_feature_names_out().shape[0]
print(f"Total number of features (vocabulary size): {vocab_size}")

# Peek at the first few feature names as a sanity check.
print(f"First 10 words in the vocabulary: {my_vectorizer.get_feature_names_out()[:10]}")

Total number of features (vocabulary size): 4496
First 10 words in the vocabulary: ['abandoned' 'ability' 'ablaze' 'able' 'abo' 'absolutely' 'absorb'
 'absorbs' 'absurd' 'absurdity']
