In [None]:
# Mount Google Drive at /content/drive; the data paths used by the later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Read the preprocessed review table from the mounted Drive folder.
preprocessed_csv_path = '/content/drive/MyDrive/소캡디/data/preprocessed_data.csv'
data = pd.read_csv(preprocessed_csv_path)

In [None]:
# Discard every row that has at least one missing value (pandas defaults, spelled out).
data = data.dropna(axis=0, how='any')

In [None]:
# Display the rows of `data` whose '리뷰' column contains the substring '채소'.
data[data['리뷰'].str.contains('채소')]

In [None]:
# Read the raw review CSV and display the rows whose '리뷰' column contains '채소'.
raw_csv_path = '/content/drive/MyDrive/소캡디/data/skin_top10_review.csv'
raw_data = pd.read_csv(raw_csv_path)
raw_data[raw_data['리뷰'].str.contains('채소')]

In [None]:
# Remove noise tokens wherever they occur inside a cell's text.
# Without regex=True, DataFrame.replace only substitutes cells whose ENTIRE value
# equals the token, so a token embedded in a longer review would be left untouched
# (the following cells look for '채소' as a substring, so substring removal is intended).
# The tokens contain no regex metacharacters, so they match literally.
for noise_token in ('올리브영', '채소', '증인', '넘버'):
    raw_data = raw_data.replace(noise_token, '', regex=True)

In [None]:
# Display `raw_data` to inspect it after the replacement cell.
raw_data

In [None]:
# Display the rows of `raw_data` whose '리뷰' column still contains the substring '채소'.
raw_data[raw_data['리뷰'].str.contains('채소')]

In [None]:
# model.py

# -*- coding: utf-8 -*-
import numpy as np
import torch
from torch.nn import init
from torch.nn.parameter import Parameter


class SelfAttention(torch.nn.Module):
    """
        Attention over the word vectors of a sentence: every word is scored
        against the (linearly transformed) mean word vector of its sentence and
        the scores are normalised with a softmax over the sentence positions.
    """

    def __init__(self, wv_dim: int, maxlen: int):
        """
        :param wv_dim: word vector size
        :param maxlen: max sentence length (2nd dim of a batch); only stored and
                       reported by `extra_repr`, not used in `forward`
        """
        super().__init__()
        self.wv_dim = wv_dim
        self.maxlen = maxlen

        # square (wv_dim, wv_dim) transformation applied to the mean embedding
        self.M = Parameter(torch.empty(size=(wv_dim, wv_dim)))
        init.kaiming_uniform_(self.M.data)

        # softmax over the last axis, i.e. over the words of a sentence
        self.attention_softmax = torch.nn.Softmax(dim=-1)

    def forward(self, input_embeddings):
        """
        :param input_embeddings: word vectors, (b, maxlen, wv)
        :return: attention weights, (b, maxlen)
        """
        # sentence average: (b, maxlen, wv) -> (b, wv) -> (b, wv, 1)
        sentence_mean = input_embeddings.mean(dim=1).unsqueeze(2)

        # (wv, wv) x (b, wv, 1) -> (b, wv, 1)
        transformed_mean = self.M.matmul(sentence_mean)

        # (b, maxlen, wv) x (b, wv, 1) -> (b, maxlen, 1) -> (b, maxlen)
        word_scores = input_embeddings.matmul(transformed_mean).squeeze(2)

        return self.attention_softmax(word_scores)

    def extra_repr(self):
        """Extra fields shown in the module's printed representation."""
        return 'wv_dim={}, maxlen={}'.format(self.wv_dim, self.maxlen)


class ABAE(torch.nn.Module):
    """
        The model described in the paper ``An Unsupervised Neural Attention Model for Aspect Extraction''
        by He, Ruidan and  Lee, Wee Sun  and  Ng, Hwee Tou  and  Dahlmeier, Daniel, ACL2017
        https://aclweb.org/anthology/papers/P/P17/P17-1036/
    """

    def __init__(self, wv_dim: int = 200, asp_count: int = 30,
                 ortho_reg: float = 0.1, maxlen: int = 201, init_aspects_matrix=None):
        """
        Initializing the model
        :param wv_dim: word vector size
        :param asp_count: number of aspects
        :param ortho_reg: coefficient for tuning the ortho-regularizer's influence
        :param maxlen: sentence max length taken into account
        :param init_aspects_matrix: None or init. matrix for aspects, shape (asp_count, wv_dim)
        :raises ValueError: if `init_aspects_matrix` does not have shape (asp_count, wv_dim)
        """
        super(ABAE, self).__init__()
        self.wv_dim = wv_dim
        self.asp_count = asp_count
        self.ortho = ortho_reg
        self.maxlen = maxlen

        self.attention = SelfAttention(wv_dim, maxlen)
        self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
        self.softmax_aspects = torch.nn.Softmax(dim=-1)
        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

        if init_aspects_matrix is None:
            # in-place initializer; the name without the trailing underscore is deprecated
            torch.nn.init.xavier_uniform_(self.aspects_embeddings)
        else:
            init_values = torch.as_tensor(np.asarray(init_aspects_matrix).T)
            if tuple(init_values.shape) != (wv_dim, asp_count):
                raise ValueError(
                    "init_aspects_matrix must have shape (asp_count, wv_dim) = (%d, %d), got %s"
                    % (asp_count, wv_dim, tuple(init_values.shape[::-1])))
            # copy_ keeps the parameter's own dtype and storage: a float64 matrix no
            # longer turns the parameter into float64, and the parameter does not share
            # memory with the caller's numpy array
            with torch.no_grad():
                self.aspects_embeddings.copy_(init_values)

    def get_aspects_importances(self, text_embeddings):
        """
            Takes embeddings of a sentence as input, returns a tuple
            (attention weights, aspects importances, attention-weighted sentence embedding)
        """

        # compute attention scores, looking at text embeddings average
        attention_weights = self.attention(text_embeddings)

        # multiplying text embeddings by attention scores -- and summing
        # (matmul: we sum every word embedding's coordinate with attention weights)
        # only the singleton axis 1 is squeezed, so a batch of size 1 keeps its batch axis
        weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1),  # (batch, 1, sentence)
                                         text_embeddings  # (batch, sentence, wv_dim)
                                         ).squeeze(1)

        # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
        raw_importances = self.linear_transform(weighted_text_emb)

        # computing 'aspects distribution in a sentence'
        aspects_importances = self.softmax_aspects(raw_importances)

        return attention_weights, aspects_importances, weighted_text_emb

    def forward(self, text_embeddings, negative_samples_texts):
        """
            Computes the training loss for a batch: hinged reconstruction loss plus the
            weighted ortho-regularizer, one value per sentence, shape (batch, 1)
        :param text_embeddings: word vectors of the sentences, (batch, sentence, wv_dim)
        :param negative_samples_texts: word vectors of the negative samples; averaged over
               dim 2 -- assumes (batch, negatives, sentence, wv_dim), TODO confirm with caller
        """

        # negative samples are averaged
        averaged_negative_samples = torch.mean(negative_samples_texts, dim=2)

        # encoding: words embeddings -> sentence embedding, aspects importances
        _, aspects_importances, weighted_text_emb = self.get_aspects_importances(text_embeddings)

        # decoding: aspects embeddings matrix, aspects_importances -> recovered sentence embedding
        # (wv_dim, asp) x (batch, asp, 1) -> (batch, wv_dim, 1) -> (batch, wv_dim)
        recovered_emb = torch.matmul(self.aspects_embeddings, aspects_importances.unsqueeze(2)).squeeze(2)

        # loss
        reconstruction_triplet_loss = ABAE._reconstruction_loss(weighted_text_emb,
                                                                recovered_emb,
                                                                averaged_negative_samples)
        max_margin = torch \
            .max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss)) \
            .unsqueeze(dim=-1)

        return self.ortho * self._ortho_regularizer() + max_margin

    @staticmethod
    def _reconstruction_loss(text_emb, recovered_emb, averaged_negative_emb):
        """
            Sum over the negative samples of (1 - <text, recovered> + <negative, recovered>)
        :param text_emb: (batch, wv_dim)
        :param recovered_emb: (batch, wv_dim)
        :param averaged_negative_emb: (batch, negatives, wv_dim)
        :return: un-hinged loss, (batch,)
        """

        # explicit squeeze axes: a batch of one sentence or a single negative sample
        # must not lose its axis
        # (batch, 1, wv) x (batch, wv, 1) -> (batch, 1, 1) -> (batch, 1)
        positive_dot_products = torch.matmul(text_emb.unsqueeze(1), recovered_emb.unsqueeze(2)).squeeze(2)
        # (batch, negatives, wv) x (batch, wv, 1) -> (batch, negatives, 1) -> (batch, negatives)
        negative_dot_products = torch.matmul(averaged_negative_emb, recovered_emb.unsqueeze(2)).squeeze(2)
        reconstruction_triplet_loss = torch.sum(1 - positive_dot_products + negative_dot_products, dim=1)

        return reconstruction_triplet_loss

    def _ortho_regularizer(self):
        """Norm of (A^T A - I) for the aspects matrix A of shape (wv_dim, asp_count)."""
        gram = torch.matmul(self.aspects_embeddings.t(), self.aspects_embeddings)
        # identity built on the parameter's device/dtype so the model also works off-CPU
        identity = torch.eye(self.asp_count, dtype=gram.dtype, device=gram.device)
        return torch.norm(gram - identity)

    def get_aspect_words(self, w2v_model, logger, topn=15):
        """
            For every aspect, the `topn` vocabulary words with the largest scalar product
            between their word vector and the aspect embedding
        :param w2v_model: object exposing `wv.vectors` and `wv.index_to_key`
        :param logger: unused, kept for interface compatibility
        :param topn: number of words returned per aspect
        :return: list (one entry per aspect) of lists of words
        """
        words = []

        # getting aspects embeddings (moved to CPU first so this also works for a GPU model)
        aspects = self.aspects_embeddings.detach().cpu().numpy()

        # getting scalar products of word embeddings and aspect embeddings;
        # to obtain the ``probabilities'', one should also apply softmax
        words_scores = w2v_model.wv.vectors.dot(aspects)

        for row in range(aspects.shape[1]):
            argmax_scalar_products = np.argsort(- words_scores[:, row])[:topn]
            words.append([w2v_model.wv.index_to_key[i] for i in argmax_scalar_products])

        return words

In [None]:
import codecs
import sys

import gensim
from tqdm import tqdm

# Load the saved Word2Vec model from Drive into the notebook-level name `model`.
# NOTE(review): this cell runs before the `!pip install gensim==4.0.0` cell below; if that
# pinned version is needed to load the file, the install has to come first -- TODO confirm.
model = gensim.models.Word2Vec.load('/content/drive/MyDrive/소캡디/data/w2v_embedding.model')

In [None]:
!pip install gensim==4.0.0

In [None]:
# reader.py

# -*- coding: utf-8 -*-
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans


def read_data_batches(path: str, batch_size: int=50, minlength: int=5):
    """
        Reading batched texts of given min. length
    :param path: path to the text file ``one line -- one normalized sentence''
    :param batch_size: number of sentences per yielded batch (the last batch may be smaller)
    :param minlength: lines with fewer whitespace-separated tokens are skipped
    :return: batches iterator; every batch is a list of token lists
    """
    batch = []

    # the context manager closes the file when the generator is exhausted or closed early
    with open(path, encoding="utf-8") as lines:
        for line in lines:
            tokens = line.strip().split()

            # lines with less than `minlength` words are omitted
            if len(tokens) >= minlength:
                batch.append(tokens)
                if len(batch) >= batch_size:
                    yield batch
                    batch = []

    # leftover sentences that did not fill a whole batch
    if len(batch) > 0:
        yield batch


def text2vectors(text: list, w2v_model, maxlen: int, vocabulary):
    """
        Token sequence -- to a list of exactly `maxlen` word vectors;
        if token not in vocabulary, it is skipped; the rest of
        the slots up to `maxlen` are replaced with zeroes
    :param text: list of tokens
    :param w2v_model: gensim w2v model (its `wv` lookup and `vector_size` are used)
    :param maxlen: max. length of the sentence; the rest is just cut away
    :param vocabulary: None, or a container of allowed tokens
    :return: list of `maxlen` vectors
    """

    acc_vecs = []

    for word in text:
        # the docstring promises a cut at `maxlen`: stop as soon as all slots are filled
        if len(acc_vecs) >= maxlen:
            break
        if word in w2v_model.wv and (vocabulary is None or word in vocabulary):
            acc_vecs.append(w2v_model.wv[word])

    # padding for consistent length with ZERO vectors; one fresh array per slot so that
    # the padding rows do not all alias the same object
    missing = maxlen - len(acc_vecs)
    if missing > 0:
        acc_vecs.extend(np.zeros(w2v_model.vector_size) for _ in range(missing))

    return acc_vecs


def get_w2v(path):
    """
        Loads a saved gensim Word2Vec model from `path` and returns it
    """
    load_word2vec = gensim.models.Word2Vec.load
    return load_word2vec(path)


def read_data_tensors(path, word_vectors_path=None,
                      batch_size=50, vocabulary=None,
                      maxlen=100, pad_value=0, min_sent_length=5):
    """
        Turns a text file into batches of word-vector sequences for training the NN
    :param path: text file with one sentence per line
    :param word_vectors_path: path of the saved word2vec model, loaded once up front
    :param batch_size: sentences per batch
    :param vocabulary: None, or a container of tokens allowed to be embedded
    :param maxlen: number of word vectors kept per sentence
    :param pad_value: accepted but not used by this function
    :param min_sent_length: shorter sentences are skipped by the batch reader
    :return: iterator over (float32 array stacked along axis 0, list of token lists)
    """
    w2v_model = get_w2v(word_vectors_path)

    for sentences in read_data_batches(path, batch_size, min_sent_length):
        # one float32 array per sentence, cut to at most `maxlen` vectors
        embedded = [
            np.asarray(text2vectors(tokens, w2v_model, maxlen, vocabulary)[:maxlen],
                       dtype=np.float32)
            for tokens in sentences
        ]

        yield np.stack(embedded, axis=0), list(sentences)


def get_centroids(w2v_model, aspects_count, random_state=None):
    """
        Clustering all word vectors with K-means and returning L2-normalized
        cluster centroids; used for ABAE aspects matrix initialization
    :param w2v_model: gensim w2v model; every key of `wv.key_to_index` is clustered
    :param aspects_count: number of clusters (aspects)
    :param random_state: optional seed forwarded to MiniBatchKMeans for reproducible centroids
    :return: array of shape (aspects_count, vector size) with unit-norm rows
    """

    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100,
                         random_state=random_state)

    # plain ndarray instead of the deprecated np.matrix (which recent scikit-learn rejects)
    word_vectors = np.asarray([w2v_model.wv[k] for k in w2v_model.wv.key_to_index])

    km.fit(word_vectors)
    clusters = km.cluster_centers_

    # L2 normalization; an all-zero centroid is left as zeros instead of becoming NaN
    norms = np.linalg.norm(clusters, axis=-1, keepdims=True)
    norm_aspect_matrix = clusters / np.where(norms == 0, 1, norms)

    return norm_aspect_matrix


if __name__ == "__main__":

    # smoke run: print the array shape and the first two token lists of every batch
    for vectors, texts in read_data_tensors("/content/drive/MyDrive/소캡디/data/data.txt", '/content/drive/MyDrive/소캡디/data/w2v_embedding.model', batch_size=3):
        print(vectors.shape, texts[:2])

In [None]:
# NOTE(review): `import hydra` is commented out in the training cell, so this install looks unused.
# The distribution that provides the Hydra config framework is presumably `hydra-core`
# (installed by the next cell), not `hydra` -- TODO confirm and drop this cell.
!pip install hydra

In [None]:
!pip install hydra-core --upgrade

In [None]:
# -*- coding: utf-8 -*-
import logging

import pathlib
#import hydra
import numpy as np
import torch
import os
import argparse

#from model import ABAE
#from reader import get_centroids, get_w2v, read_data_tensors


# Logger for the training cell, named after the current module (`__name__`).
logger = logging.getLogger(__name__)

#parser = argparse.ArgumentParser()
#args = parser.parse_args(args=[])
#@hydra.main("configs", "config")
def main(data_path='/content/drive/MyDrive/소캡디/data/data.txt',
         w2v_path='/content/drive/MyDrive/소캡디/data/w2v_embedding.model',
         aspects_count=5, batch_size=8, negative_count=5, maxlen=201,
         epochs=1, report_every=100):
    """
        Trains an ABAE model on a text file and periodically prints the aspect words
        and saves a checkpoint. Calling `main()` without arguments reproduces the
        previously hard-coded setup.
    :param data_path: text file with one sentence per line
    :param w2v_path: saved word2vec model
    :param aspects_count: number of aspects (also the number of K-means centroids)
    :param batch_size: sentences per batch; a smaller last batch is zero-padded to this size
    :param negative_count: negative samples drawn from the batch for every sentence
    :param maxlen: number of word vectors kept per sentence
    :param epochs: number of passes over the data
    :param report_every: report and checkpoint every this many batches
    """
    w2v_model = get_w2v(w2v_path)
    wv_dim = w2v_model.vector_size

    # the model's forward pass already returns a per-sentence loss; it is driven towards zero
    y = torch.zeros((batch_size, 1))

    model = ABAE(wv_dim=wv_dim,
                 asp_count=aspects_count,
                 maxlen=maxlen,
                 init_aspects_matrix=get_centroids(w2v_model, aspects_count=aspects_count))
    logger.debug(str(model))

    criterion = torch.nn.MSELoss(reduction="sum")
    optimizer = torch.optim.Adam(model.parameters())

    for t in range(epochs):

        logger.debug("Epoch %d/%d" % (t + 1, epochs))

        data_iterator = read_data_tensors(data_path, w2v_path,
                                          batch_size=batch_size, maxlen=maxlen)

        for item_number, (x, texts) in enumerate(data_iterator):
            if x.shape[0] < batch_size:  # pad with 0 if smaller than batch size
                x = np.pad(x, ((0, batch_size - x.shape[0]), (0, 0), (0, 0)))

            x = torch.from_numpy(x)

            # extracting bad samples from the very same batch; not sure if this is OK, so todo
            negative_samples = torch.stack(
                tuple(x[torch.randperm(x.shape[0])[:negative_count]]
                      for _ in range(batch_size)))

            # prediction
            y_pred = model(x, negative_samples)

            # error computation
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if item_number % report_every == 0:

                print("%d batches, and LR: %.5f" % (item_number, optimizer.param_groups[0]['lr']))

                for i, aspect in enumerate(model.get_aspect_words(w2v_model, logger)):
                    print(("[%d] %s" % (i + 1, " ".join([a for a in aspect]))))

                print("Loss: %.4f" % loss.item())

                checkpoint_name = "abae_%.2f_%06d.bin" % (loss.item(), item_number)
                try:
                    torch.save(model, checkpoint_name)
                    print("Model saved to %s" % checkpoint_name)
                except Exception as e:
                    # best effort: training goes on, but the reason is no longer hidden
                    print("Model saving failed: %s" % e)

# Run the training entry point when this cell is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# Disabled snippet kept for reference: choosing the optimizer from a `cfg` object
# (`cfg` is not defined in the visible cells, so the code cannot run as it stands).
# It was wrapped in a triple-quoted string preceded by stray characters, which made the
# cell a SyntaxError; it is kept as comments so the notebook runs top to bottom.
#
#     if cfg.optimizer.name == "adam":
#         optimizer = torch.optim.Adam(model.parameters())
#     elif cfg.optimizer.name == "sgd":
#         optimizer = torch.optim.SGD(model.parameters(), lr=cfg.optimizer.learning_rate)
#     elif cfg.optimizer.name == "adagrad":
#         optimizer = torch.optim.Adagrad(model.parameters())
#     elif cfg.optimizer.name == "asgd":
#         optimizer = torch.optim.ASGD(model.parameters(), lr=cfg.optimizer.learning_rate)
#     else:
#         raise Exception("Optimizer '%s' is not supported" % cfg.optimizer.name)
