In [406]:
###
# Import Libraries
###

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

In [407]:
np.random.seed(0)

In [408]:
###
# Import Data and some preparation
###

test_path = './data/test.csv'
train_path = './data/train.csv'
val_path = './data/val.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
val_data = pd.read_csv(val_path)

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomasli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [409]:
### 
# text_preprocess
###

def preprocess_text(text):
    # Lowercase and remove special characters
    if text is None or text == "":
        return ""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\d+', '', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    # Remove stop words
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text

In [410]:
train_data['Phrase'] = train_data['Phrase'].apply(preprocess_text)
test_data['Phrase'] = test_data['Phrase'].apply(preprocess_text)
val_data['Phrase'] = val_data['Phrase'].apply(preprocess_text)

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

#run MVNB on the labeled data
def run_supervised( labeled_train, labeled_val, test_data):
    # Create a count vectorizer
    count_vectorizer = CountVectorizer(binary=True, min_df=5)
    X_train = count_vectorizer.fit_transform(labeled_train['Phrase'])
    y_train = labeled_train['Sentiment']
    X_val = count_vectorizer.transform(labeled_val['Phrase'])
    y_val = labeled_val['Sentiment']
    X_test = count_vectorizer.transform(test_data['Phrase'])

    # Train a Multinomial Naive Bayes classifier
    # clf = MultinomialNB(alpha=0.5)
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print("Accuracy: ", accuracy_score(y_val, y_pred))
    print("F1 Score: ", f1_score(y_val, y_pred, average='weighted'))
    test_data['Sentiment'] = clf.predict(X_test)
    return test_data[['PhraseID', 'Sentiment']]

# Run the model and save the output
test_results = run_supervised(labeled_train, labeled_val, test_data)
test_results.to_csv('test_sentiment_output.csv', index=False)



Accuracy:  0.8708720330237358
F1 Score:  0.8706522565078751
