In [None]:
import os
import random
import json
from enum import Enum

import numpy as np
import pandas as pd
import gensim.downloader
import matplotlib.pyplot as plt
import nltk
import torch
import torch.nn.functional as functional
import sklearn.metrics as sk_metrics
from wordcloud import WordCloud
from scipy.spatial import distance
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split

## Preparation

In [None]:
# Load the raw review table. Every column is read as a string and empty cells
# stay "" (na_filter=False), so ratings are compared against str(rating) below.
data_path = "../Data/Restaurant reviews.csv"
df = pd.read_csv(data_path, dtype=str, na_filter=False)

# Number of reviews for each integer star rating. len() counts rows, whereas
# DataFrame.size would report rows * columns.
for rating in range(1, 6):
    df_score_k = df[df["Rating"] == str(rating)]
    print(rating, len(df_score_k))
    # NOTE(review): read_reviews_of_rating() loads files with exactly this name;
    # this export appears to be what produces them - confirm before a fresh run.
    # df_score_k.to_csv(f"../Data/reviews_rating_{rating}.csv")

# Distinct restaurant names and, for each one, the reviews that belong to it.
restaurant_names = set(df["Restaurant"])
for restaurant_name in restaurant_names:
    df_by_name = df[df["Restaurant"] == restaurant_name]
    # print(restaurant_name, len(df_by_name))

In [None]:
# Fixed seed so that the split below is identical on every run.
SPLIT_SEED = 42

# df["Rating"] = df["Rating"].replace("1.5", "2")
# df["Rating"] = df["Rating"].replace("2.5", "3")
# df["Rating"] = df["Rating"].replace("3.5", "4")
# df["Rating"] = df["Rating"].replace("4.5", "5")
print(df["Rating"].unique(), len(df))

# Split every integer rating on its own (20% test, then 20% of the remainder
# for validation) so each subset contains reviews of all five ratings.
train_parts, valid_parts, test_parts = [], [], []
for rating in range(1, 6):
    df_score_k = df[df["Rating"] == str(rating)]
    step_train_df, step_test_df = train_test_split(
        df_score_k, test_size=0.2, random_state=SPLIT_SEED)
    step_train_df, step_valid_df = train_test_split(
        step_train_df, test_size=0.2, random_state=SPLIT_SEED)

    train_parts.append(step_train_df)
    valid_parts.append(step_valid_df)
    test_parts.append(step_test_df)

    # step_train_df.to_csv(f"../Data/review_train_data_{rating}.csv", index=False)
    print(len(step_train_df), len(df_score_k))

# Concatenate once per subset. ignore_index=True renumbers the rows 0..n-1;
# calling reset_index() without assigning its result leaves the frame untouched.
train_df = pd.concat(train_parts, axis=0, ignore_index=True)
valid_df = pd.concat(valid_parts, axis=0, ignore_index=True)
test_df = pd.concat(test_parts, axis=0, ignore_index=True)
# train_df.to_csv(f"../Data/review_train_data.csv", index=False)
# valid_df.to_csv(f"../Data/review_valid_data.csv", index=False)
# test_df.to_csv(f"../Data/review_test_data.csv", index=False)
print(len(train_df), len(valid_df), len(test_df))

In [None]:
def read_reviews_of_rating(rating, data_dir="../Data"):
    """Load the per-rating review export ``reviews_rating_{rating}.csv``.

    Args:
        rating: Star rating that identifies the file to read.
        data_dir: Directory that holds the per-rating CSV files. Defaults to
            the ``../Data`` folder used throughout the notebook.

    Returns:
        A DataFrame in which every column is read as ``str`` and empty cells
        are kept as ``""`` instead of being converted to NaN.
    """
    csv_path = f"{data_dir}/reviews_rating_{rating}.csv"
    return pd.read_csv(csv_path, dtype=str, na_filter=False)


def filter_reviews_by_restaurant_name(df, restaurant_name):
    """Restrict a review table to a single restaurant.

    Args:
        df: DataFrame with a ``"Restaurant"`` column.
        restaurant_name: Name to keep, or ``None`` to apply no filtering.

    Returns:
        ``df`` itself when ``restaurant_name`` is ``None``; otherwise the rows
        whose ``"Restaurant"`` value equals ``restaurant_name``.
    """
    if restaurant_name is not None:
        is_requested_restaurant = df["Restaurant"] == restaurant_name
        return df[is_requested_restaurant]
    return df

## Task 1

In [None]:
# One generator is reused for all ratings; generate() rebuilds the cloud from
# the text it is given on every call.
cloud_generator = WordCloud(
    max_words=200,
    stopwords=None,
    collocations=False,
    normalize_plurals=True,
)

# Draw one word cloud per star rating from all reviews with that rating.
for rating in range(1, 6):
    # Use a dedicated name: rebinding the global `df` here would leave the
    # full review table replaced by a single-rating subset.
    rating_df = read_reviews_of_rating(rating)

    reviews = rating_df["Review"]
    cloud = cloud_generator.generate("\n".join([str(review) for review in reviews]))

    plt.imshow(cloud, interpolation="bilinear")
    plt.title(f"Rating {rating}")
    plt.axis("off")
    plt.show()

## Task 4+5

In [None]:
# Pretrained static word vectors, keyed by the short backend name that
# WordEmbeddings uses for lookup. Each model is fetched through gensim's
# downloader, in the order listed.
gensim_word_embeddings = {
    short_name: gensim.downloader.load(model_id)
    for short_name, model_id in (
        ("fasttext", "fasttext-wiki-news-subwords-300"),
        ("glove", "glove-wiki-gigaword-300"),
        ("word2vec", "word2vec-google-news-300"),
    )
}

In [None]:
# Transformer encoder and matching tokenizer used for the "bert" embeddings.
# Both are loaded from the same checkpoint name. To switch to another BERT
# checkpoint, or to RoBERTa, swap in one of the commented alternatives below.
bert_checkpoint = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)
# bert_model = BertModel.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
# bert_tokenizer = BertTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
# bert_model = RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# bert_tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [None]:
EmbeddingName = Enum("Embedding_Name", ["fasttext", "glove", "word2vec", "bert"])


class WordEmbeddings:
    """Word- and sentence-level embeddings from one of several pretrained models.

    ``model_name`` selects the backend and may be reassigned after construction.
    The gensim backends look words up in the module-level
    ``gensim_word_embeddings`` dict (keyed by the enum member's name), while
    ``EmbeddingName.bert`` runs the module-level ``bert_model`` /
    ``bert_tokenizer`` pair.
    """

    # Longest id sequence, special tokens included, handed to the transformer.
    MAX_BERT_TOKENS = 512

    def __init__(self, model_name: EmbeddingName):
        """Bind the shared models and put the transformer on the best device.

        Args:
            model_name: Backend used by the embedding methods.
        """
        self.gensim_word_embeddings = gensim_word_embeddings
        self.bert_tokenizer = bert_tokenizer
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.model_name = model_name

        # Inference only: move the encoder to the device and switch to eval mode.
        self.bert_model = bert_model.to(self.device)
        self.bert_model.eval()

    def get_word_embedding(self, word: str):
        """Return the embedding vector of a single word.

        For the transformer backend the word is embedded as a one-word
        sentence; for the gensim backends a missing word makes the underlying
        lookup fail, so call ``check_word_exists`` first.
        """
        if self.model_name == EmbeddingName.bert:
            return self.get_sentence_embedding(word)
        return self.gensim_word_embeddings[self.model_name.name].get_vector(word)

    def check_word_exists(self, word: str):
        """Tell whether the active backend can embed ``word``.

        Always ``True`` for the transformer backend; for the gensim backends
        it reports whether the word is in the model's vocabulary.
        """
        if self.model_name == EmbeddingName.bert:
            return True
        return self.gensim_word_embeddings[self.model_name.name].has_index_for(word)

    def get_sentence_embedding(self, sentence: str):
        """Embed a whole sentence as one vector.

        Transformer backend: the hidden state of the first token of the
        model's first output. Gensim backends: the mean of the vectors of all
        NLTK tokens that are in the vocabulary.

        Args:
            sentence: Text to embed.

        Returns:
            A numpy array holding the sentence embedding.

        Raises:
            ValueError: For a gensim backend, when no token of ``sentence`` is
                in the vocabulary.
        """
        if self.model_name == EmbeddingName.bert:
            # Let the tokenizer insert the special tokens itself so that they
            # count towards the length limit. Truncating the bare tokens to the
            # limit and wrapping them afterwards could produce
            # MAX_BERT_TOKENS + 2 ids for long reviews.
            token_ids = self.bert_tokenizer.encode(
                sentence,
                max_length=self.MAX_BERT_TOKENS,
                truncation=True,
                add_special_tokens=True,
            )
            bert_inputs = torch.tensor([token_ids]).to(self.device)
            with torch.no_grad():
                # Output [0] holds the per-token hidden states; index 0 along
                # the sequence axis selects the first token.
                first_token_state = self.bert_model(bert_inputs)[0][:, 0]
            return first_token_state.squeeze().detach().cpu().numpy()

        tokens = nltk.word_tokenize(sentence)
        embeddings = [
            self.get_word_embedding(token)
            for token in tokens
            if self.check_word_exists(token)
        ]
        if not embeddings:
            raise ValueError(f"Invalid sentence: '{sentence}'!")
        return np.mean(embeddings, axis=0)

In [None]:
embedding_generator = WordEmbeddings(EmbeddingName.bert)
restaurant_name = None
num_samples = 200
# Seed for drawing review triplets. The generator is re-created for every
# embedding, so all embeddings are scored on the same sequence of draws.
SAMPLING_SEED = 0

# Review texts for each star rating, optionally restricted to one restaurant.
(
    reviews_of_rating_1,
    reviews_of_rating_2,
    reviews_of_rating_3,
    reviews_of_rating_4,
    reviews_of_rating_5,
) = [
    filter_reviews_by_restaurant_name(
        read_reviews_of_rating(rating),
        restaurant_name=restaurant_name,
    )["Review"]
    for rating in range(1, 6)
]

print(len(reviews_of_rating_1), len(reviews_of_rating_2), len(reviews_of_rating_4), len(reviews_of_rating_5))

# For every embedding, estimate how often a 1-star review is closer (cosine
# distance) to a 2-star review than to a 4-star review.
for embedding_name in EmbeddingName:
    i = 0
    true_count = 0
    embedding_generator.model_name = embedding_name
    rng = np.random.RandomState(SAMPLING_SEED)

    while i < num_samples:
        random_review_rating_1 = reviews_of_rating_1.sample(1, random_state=rng).squeeze()
        random_review_rating_2 = reviews_of_rating_2.sample(1, random_state=rng).squeeze()
        random_review_rating_4 = reviews_of_rating_4.sample(1, random_state=rng).squeeze()

        try:
            embedding_review_rating_1 = embedding_generator.get_sentence_embedding(random_review_rating_1)
            embedding_review_rating_2 = embedding_generator.get_sentence_embedding(random_review_rating_2)
            embedding_review_rating_4 = embedding_generator.get_sentence_embedding(random_review_rating_4)
        except ValueError:
            # A review with no embeddable token: draw a fresh triplet instead
            # of counting this one.
            continue

        same_sentiment_distance = distance.cosine(embedding_review_rating_1, embedding_review_rating_2)
        diff_sentiment_distance = distance.cosine(embedding_review_rating_1, embedding_review_rating_4)
        if same_sentiment_distance < diff_sentiment_distance:
            true_count += 1
        # print(f"same_sentiment ({same_sentiment_distance}) < diff_sentiment ({diff_sentiment_distance})")
        i += 1

    print(embedding_name, true_count / num_samples)
# fasttext 0.49, glove 0.494, word2vec 0.532, bert 0.525, bert 0.926, roberta 0.936

In [None]:
embedding_generator = WordEmbeddings(EmbeddingName.bert)


def get_embedding_of_dataframe(reviews):
    """Average the sentence embeddings of a collection of reviews.

    Every embedding is divided by the number of reviews before it is added to
    the running total, so the total ends up as the mean embedding.

    Args:
        reviews: Sized iterable of review texts.

    Returns:
        The mean embedding, or ``None`` when ``reviews`` is empty.

    Raises:
        ValueError: Re-raised from ``get_sentence_embedding`` after printing
            the review that could not be embedded.
    """
    review_count = len(reviews)
    mean_embedding = None

    for review_text in reviews:
        try:
            vector = embedding_generator.get_sentence_embedding(review_text)
        except ValueError:
            print(review_text)
            raise
        share = vector / review_count
        mean_embedding = share if mean_embedding is None else mean_embedding + share
    return mean_embedding


# Mean review embedding for each star rating.
embedding_by_rating = {}
for score in range(1, 6):
    rating_reviews = read_reviews_of_rating(score)["Review"]
    embedding_by_rating[score] = get_embedding_of_dataframe(rating_reviews)

# Pairwise cosine distances; row/column k corresponds to rating k + 1.
distance_matrix = torch.zeros(5, 5)
for row in range(5):
    for col in range(5):
        distance_matrix[row, col] = distance.cosine(
            embedding_by_rating[row + 1],
            embedding_by_rating[col + 1],
        )

for matrix_row in distance_matrix:
    print(matrix_row)

## Task 9: Model Comparisons

In [None]:
# Directory that holds the stored predictions of every method.
WEIGHTS_DIR = "../../weights"
# The only method for which a confusion matrix is drawn.
CONFUSION_MATRIX_METHOD = "roberta_fc-False_pooler-False_transformer-True_d0.0_wd1e-4_lr1e-4"
# Rating substituted for JSON outputs without a usable 1-5 rating.
NEUTRAL_RATING = 3

test_df = pd.read_csv("../Data/review_test_data.csv")
targets = test_df["Rating"].astype(int).values

method_names = [
    "vicuna",
    # "bert_fc-False_pooler-False_transformer-False_d0.2_wd1e-4_lr1e-4",
    # "bert_fc-False_pooler-False_transformer-True_d0.0_wd1e-5_lr1e-4",
    # "bert_fc-True_pooler-False_transformer-False_d0.0_wd1e-4_lr1e-3",
    # "bert_fc-True_pooler-True_transformer-False_d0.0_wd1e-5_lr1e-4",
    # "roberta_fc-False_pooler-False_transformer-False_d0.2_wd1e-5_lr1e-4",
    # "roberta_fc-True_pooler-False_transformer-False_d0.0_wd1e-4_lr1e-4",
    # "roberta_fc-True_pooler-True_transformer-False_d0.0_wd1e-5_lr1e-4",
    CONFUSION_MATRIX_METHOD,
]

for method_name in method_names:
    if method_name == "vicuna":
        # Vicuna results are stored as JSON records with a "rating" field.
        output_path = os.path.join(WEIGHTS_DIR, "results.json")
        with open(output_path) as output_file:
            raw_outputs = json.load(output_file)
        # If the output is ill-defined (missing or outside 1-5), use the
        # neutral rating.
        outputs = np.array([
            (out.get("rating") if out.get("rating") in range(1, 6) else NEUTRAL_RATING)
            for out in raw_outputs
        ])
    else:
        # Every other method stores its predictions as a numpy array.
        output_path = os.path.join(WEIGHTS_DIR, method_name, "outputs.npy")
        outputs = np.load(output_path)

    accuracy = sk_metrics.accuracy_score(targets, outputs)
    f1_score = sk_metrics.f1_score(targets, outputs, average="weighted")
    precision = sk_metrics.precision_score(targets, outputs, average="weighted")
    recall = sk_metrics.recall_score(targets, outputs, average="weighted")
    print(f"{method_name}: {accuracy:.5f} {f1_score:.5f} {precision:.5f} {recall:.5f}")

    if method_name == CONFUSION_MATRIX_METHOD:
        confusion_matrix = sk_metrics.ConfusionMatrixDisplay.from_predictions(targets, outputs)
        # confusion_matrix.plot()