In [152]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [153]:
# Locations of the three CSV splits, relative to the notebook's working directory.
train_path = './data/train.csv'
val_path = './data/val.csv'
test_path = './data/test.csv'

# Read every split into its own DataFrame.
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    """Normalize a single phrase for bag-of-words vectorization.

    The phrase is lowercased, every character that is neither a word
    character nor whitespace is removed, and tokens found in
    ``ENGLISH_STOP_WORDS`` are dropped. The remaining tokens are joined
    with single spaces.

    Parameters
    ----------
    text : object
        The raw phrase. Anything that is not a ``str`` (``None``, ``NaN``,
        ``pd.NA``, numbers, arrays, ...) is treated as missing.

    Returns
    -------
    str
        The cleaned phrase, or ``""`` for missing / empty input.
    """
    # Type-check FIRST. Comparing an arbitrary object with "" may produce a
    # non-boolean result (pd.NA, an element-wise array, ...) whose truth value
    # raises, so the equality test must never run on non-string input.
    if not isinstance(text, str):
        return ""
    # Lowercase and remove special characters
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Remove stop words (an empty string yields no tokens and returns "")
    return " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

# Clean the 'Phrase' column of every split (the column is overwritten).
for split_frame in (train_data, test_data, val_data):
    split_frame['Phrase'] = split_frame['Phrase'].apply(preprocess_text)


# Separate labeled and unlabeled data: rows whose 'Sentiment' equals -100
# go to the unlabeled frames, all other rows to the labeled ones.
train_is_unlabeled = train_data['Sentiment'] == -100
val_is_unlabeled = val_data['Sentiment'] == -100

labeled_train = train_data[~train_is_unlabeled]
unlabeled_train = train_data[train_is_unlabeled]
labeled_val = val_data[~val_is_unlabeled]
unlabeled_val = val_data[val_is_unlabeled]




In [154]:
from sklearn.feature_extraction.text import CountVectorizer


# Alternative configuration (bigrams only, lower document-frequency cutoff):
# vectorizer = CountVectorizer(binary=True, min_df=4, ngram_range=(2,2))

# Binary bag-of-words features; the vocabulary is learned from every training
# phrase (labeled and unlabeled rows alike), keeping terms with min_df=6.
vectorizer = CountVectorizer(binary=True, min_df=6).fit(train_data['Phrase'])

# Feature matrices and targets for the labeled subsets.
X_train_labeled = vectorizer.transform(labeled_train['Phrase'])
y_train_labeled = labeled_train['Sentiment']

X_val_labeled = vectorizer.transform(labeled_val['Phrase'])
y_val = labeled_val['Sentiment']


# Report the size of the learned vocabulary and a small sample of it.
feature_names = vectorizer.get_feature_names_out()
vocab_size = len(feature_names)
print(f"Total number of features (vocabulary size): {vocab_size}")

print(f"First 10 words in the vocabulary: {feature_names[:10]}")

Total number of features (vocabulary size): 5066
First 10 words in the vocabulary: ['0s' '10' '100' '1000' '10000' '100000' '1010' '10am' '10k' '10minute']


In [155]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment

# Step 1: Generate pseudo-labels with KMeans on unlabeled data
X_train_unlabeled = vectorizer.transform(unlabeled_train['Phrase'])
X_val = vectorizer.transform(labeled_val['Phrase'])

# NOTE(review): everything below indexes classes with range(K), i.e. it
# assumes the sentiment labels are exactly the integers 0..K-1 — TODO confirm.
K = len(np.unique(y_train_labeled))  # number of classes
kmeans = KMeans(n_clusters=K, random_state=42)
pseudo_labels = kmeans.fit_predict(X_train_unlabeled)

# Step 2: Create cost matrix for Hungarian Algorithm
# The cluster assignment of the labeled rows does not depend on the cluster
# being inspected, so it is predicted once instead of once per cluster.
labeled_clusters = kmeans.predict(X_train_labeled)

cost_matrix = np.zeros((K, K))
for cluster_id in range(K):
    # True labels of the labeled rows that fall into this cluster
    labeled_in_cluster = y_train_labeled[labeled_clusters == cluster_id]
    for label in range(K):
        # Cost of mapping cluster -> label = number of rows that would mismatch
        cost_matrix[cluster_id, label] = np.sum(labeled_in_cluster != label)

# Step 3: Apply Hungarian Algorithm for optimal mapping
row_ind, col_ind = linear_sum_assignment(cost_matrix)

# Step 4: Remap pseudo-labels using the optimal assignment
cluster_to_label_map = {row: col for row, col in zip(row_ind, col_ind)}
pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

# Step 5: Combine labeled and remapped pseudo-labeled data
X_combined = vstack([X_train_labeled, X_train_unlabeled])
y_combined = np.concatenate((y_train_labeled, pseudo_labels_remapped))

# Step 6: Calculate psies and phis using the combined data
n_combined = X_combined.shape[0]
d = X_combined.shape[1]

psies_combined = np.zeros((K, d))  # P(X|Y=k) for each feature in each class
phis_combined = np.zeros(K)         # P(Y=k) for each class
alpha = 1  # Laplace smoothing

for k in range(K):
    # Keep the class slice sparse: only per-feature column sums are needed,
    # so densifying every row of the class would only cost memory.
    X_k = X_combined[y_combined == k]
    nk = X_k.shape[0]  # number of samples in class k
    feature_counts = np.asarray(X_k.sum(axis=0)).ravel()

    # Smoothed Bernoulli estimate: two outcomes per feature -> 2 * alpha
    psies_combined[k] = (feature_counts + alpha) / (nk + 2 * alpha)
    phis_combined[k] = nk / float(n_combined)

# Step 7: Define function for Naive Bayes predictions
def nb_predictions(x, psies, phis):
    """Score samples with a Bernoulli Naive Bayes model and pick the best class.

    Parameters
    ----------
    x : numpy.ndarray or sparse matrix, shape (n_samples, n_features)
        Feature matrix. Anything that is not an ``ndarray`` is converted with
        its ``.toarray()`` method.
    psies : array-like, shape (n_classes, n_features)
        Per-class feature probabilities P(x_j = 1 | y = k).
    phis : array-like, shape (n_classes,)
        Class priors P(y = k).

    Returns
    -------
    predictions : numpy.ndarray, shape (n_samples,)
        Index of the highest-scoring class for every sample.
    logpyx : numpy.ndarray, shape (n_samples, n_classes)
        ``log P(x | y = k) + log P(y = k)`` for every sample / class pair.
    """
    if not isinstance(x, np.ndarray):
        x = x.toarray()

    psies = np.asarray(psies).clip(1e-14, 1 - 1e-14)  # Avoid log(0) errors

    # Take the number of classes from `phis` itself rather than from a
    # notebook-level global, so the function works on its own.
    logpy = np.log(np.asarray(phis)).reshape(1, -1)
    # Present features contribute log(psi), absent ones log(1 - psi)
    logpxy = x @ np.log(psies.T) + (1 - x) @ np.log(1 - psies.T)
    logpyx = logpxy + logpy
    return logpyx.argmax(axis=1), logpyx

# Step 8: Score the validation set with the combined Naive Bayes parameters
y_pred_val = nb_predictions(X_val, psies_combined, phis_combined)[0]

# Accuracy = share of validation rows whose predicted class equals the true one
is_correct = y_pred_val == y_val
val_accuracy = is_correct.mean()
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Macro-averaged F1 on the same predictions
f1 = f1_score(y_val, y_pred_val, average="macro")
print(f"F1 Score on Validation Set: {f1:.4f}")


Validation Accuracy: 0.8394
F1 Score on Validation Set: 0.8388
