In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Load the preprocessed CSV from Drive into a DataFrame.
# NOTE(review): `data` is not referenced by any other cell visible here.
data = pd.read_csv('/content/drive/MyDrive/소캡디/data/preprocessed_data.csv')

In [None]:
!pip install pyconll

In [None]:
# utils.py

"""Some utils."""
import pyconll
import numpy as np


def conll2text(paths, outpath):
    """
    Write the sentences of one or more CoNLL files to a plain-text file.

    Every non-empty sentence becomes one output line: the token forms joined
    by single spaces, lowercased, with non-printable characters removed.

    Parameters
    ----------
    paths : iterable of str
        Paths of the CoNLL files to read, in order.
    outpath : str
        Path of the text file to (over)write.
    """
    # Token forms may contain non-ASCII text, so the encoding is pinned
    # instead of depending on the platform default.
    with open(outpath, 'w', encoding="utf-8") as f:
        for path in paths:
            for sent in pyconll.iter_from_file(path):
                forms = [token.form for token in sent]
                if not forms:
                    # Sentences without tokens produce no output line.
                    continue
                line = " ".join(forms).lower()
                line = "".join(ch for ch in line if ch.isprintable())
                f.write(f"{line}\n")


def normalize(x):
    """
    Scale vectors to unit L2 norm, leaving zero vectors untouched.

    Parameters
    ----------
    x : array-like
        A single vector (1-D) or an array whose last axis holds the vectors.
        The input is never modified; a copy is normalized.

    Returns
    -------
    normalized : np.ndarray
        Array of the same shape as ``x``. Rows whose norm is 0 are returned
        unchanged instead of producing NaNs.
    """
    x = np.copy(x)
    # Integer (or boolean) input cannot hold the result of a division and
    # would make the in-place divide below raise, so promote it to float.
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)
    if np.ndim(x) == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    norm = np.linalg.norm(x, axis=-1)
    # Only divide the rows that have a non-zero norm.
    mask = norm > 0
    x[mask] /= norm[mask][:, None]
    return x

In [None]:
# fragments.py

"""Get nouns from conllu files"""
import pyconll
import re
import json

from tqdm import tqdm
from copy import copy
from collections import Counter, defaultdict
from itertools import chain


ARROW = re.compile(r"(<-|->)")


def get_fragments(path, from_pos, to_pos, max_length):
    """
    Collect the fragments of every sentence in one CoNLL file.

    Each sentence is converted with ``trees_from_conll`` and searched with
    ``search``; the matches are grouped under the sentence's identifier.

    Returns
    -------
    result : defaultdict of list
        Maps each identifier to the fragments found for it. Identifiers
        whose sentences yield no fragment still get an (empty) entry.
    """
    grouped = defaultdict(list)
    for sent_id, sent_tokens in filter(None, trees_from_conll(path)):
        matches = search(sent_tokens, from_pos, to_pos, max_length)
        grouped[sent_id].extend(matches)
    return grouped


def trees_from_conll(path):
    """Lazily yield the ``tree`` of each sentence read from a conll file."""
    yield from map(tree, pyconll.iter_from_file(path))


def tree(s):
    """
    Convert a parsed sentence into a sequence of linked token dicts.

    Every token becomes a dict with the keys ``"text"`` (lowercased form),
    ``"pos"`` (UPOS tag) and ``"id"``. Dependency links are stored as extra
    keys: the dependent gets ``"<-{deprel}<-"`` pointing at its head, and the
    head gets ``"->{deprel}->"`` pointing at the dependent. Because the
    relation name is the key, a head with several dependents of the same
    relation only keeps the last one.

    Parameters
    ----------
    s : sentence
        An iterable of tokens exposing ``id``, ``form``, ``upos``, ``head``
        and ``deprel``, and itself carrying an ``id`` string (a pyconll
        sentence in this file).

    Returns
    -------
    sent_id : str
        The part of ``s.id`` before the first ``"."``.
    tokens : tuple of dict, or empty list
        The token dicts in sentence order. An empty list is returned when the
        sentence has no tokens or when a token refers to a head that is not
        part of the sentence.
    """
    sent_id = s.id.split(".")[0]
    tokens = {t.id: {"text": t.form.lower(),
                     "pos": t.upos,
                     "id": t.id}
              for t in s}
    if not tokens:
        return sent_id, []
    for token in s:
        try:
            head = token.head
        except (ValueError, TypeError) as e:
            print(e, [x.form for x in s])
            # Without a head there is nothing to link; skipping avoids
            # reusing the head of the previous token.
            continue
        if head == "0":
            # The root has no parent inside the sentence.
            continue
        try:
            tokens[token.id][f"<-{token.deprel}<-"] = tokens[head]
            tokens[head][f"->{token.deprel}->"] = tokens[token.id]
        except KeyError as e:
            # The head id is not a token of this sentence: drop the sentence.
            print(e, tokens)
            return sent_id, []

    # Dicts keep insertion order, which is the sentence order. Sorting the
    # string ids instead would place "10" before "2".
    return sent_id, tuple(tokens.values())


def search(tokens, from_pos, to_pos, max_length):
    """
    Find, for each start token, one dependency pattern leading to `to_pos`.

    Parameters
    ----------
    tokens : list of dict
        Linked token dictionaries of one sentence.
    from_pos : string
        The POS tag a pattern has to start with.
    to_pos : string
        The POS tag a pattern has to end with.
    max_length : int
        The maximum number of dependency links to follow.

    Returns
    -------
    result : list
        A list of (word, pattern, word) triples, at most one per token tagged
        `from_pos`: among its valid patterns, the one whose pattern string
        sorts first.
    """
    found = []
    starts = (tok for tok in tokens if tok["pos"] == from_pos)
    for start in starts:
        candidates = []
        for path in _search(start, to_pos, 0, max_length, [], set()):
            labels, words = zip(*path)
            pattern = "".join(labels)
            parts = ARROW.split(pattern)
            counts = Counter(parts)
            # Reject paths that pass through a second from/to tag.
            if counts[from_pos] > 1 or counts[to_pos] > 1:
                continue
            if parts and parts[0] == from_pos and parts[-1] == to_pos:
                candidates.append((words[0], pattern, words[-1]))
        if candidates:
            # min() returns the first minimal element, like a stable sort.
            found.append(min(candidates, key=lambda cand: cand[1]))

    return found


def _search(token, to, length, max_length, path, visited, dep=None):
    """Recursive function for searching trees."""
    p = copy(path)
    if dep is None:
        p.append([f"{token['pos']}", token["text"]])
    else:
        p.append([f"{dep}{token['pos']}", token["text"]])

    visited.add(token["id"])
    paths = [p]
    if length < max_length:
        for k, v in [(k, v) for k, v in token.items()
                     if k not in {"id", "pos", "text"}]:
            if v["id"] in visited:
                continue
            paths.extend(_search(v, to, length+1, max_length, p, visited, k))
    return [x for x in paths if x]


def create_fragments(in_files, out_path, max_length):
    """
    Extract ADJ -> NOUN fragments from conllu files and store them as JSON.

    Parameters
    ----------
    in_files : iterable of str
        Paths of the conllu files to process.
    out_path : str
        Path of the JSON file to write; it maps each sentence identifier
        to its list of fragments.
    max_length : int
        The maximum number of dependency links per fragment.
    """
    fragments = defaultdict(list)
    for path in tqdm(in_files):
        found = get_fragments(path, "ADJ", "NOUN", max_length)
        for sent_id, sent_fragments in found.items():
            fragments[sent_id].extend(sent_fragments)

    # Close the file deterministically so the JSON is fully flushed.
    with open(out_path, 'w') as out_file:
        json.dump(fragments, out_file)


def load_fragments(path_to_json,
                   max_path_length=5,
                   words=None):
    """
    Loads fragments from a JSON file.

    Parameters
    ----------
    path_to_json : str
        The path to the json file extracted by create_fragments
    max_path_length : int, default 5
        The maximum path length (number of dependency links) to keep.
    words : iterable
        An iterable of words. If given and non-empty, only fragments whose
        first and last word are both in it are kept.

    Returns
    -------
    fragments : list of triples
        The (adjective, construction, noun) triples of all sentences, in
        file order. Triples are tuples when `words` filtering was applied
        and the lists decoded from JSON otherwise. A file without any
        sentence yields an empty list.
    """
    with open(path_to_json) as json_file:
        by_sentence = json.load(json_file)
    # Flatten the per-sentence lists; this also copes with an empty file.
    fragments = list(chain.from_iterable(by_sentence.values()))

    # Splitting on arrows gives one leading POS tag plus four parts
    # (arrow, relation, arrow, POS tag) for every link in the pattern.
    num_arrows = (max_path_length * 4) + 1
    fragments = [x for x in fragments
                 if len(ARROW.split(x[1])) <= num_arrows]

    if words:
        # A set gives constant-time membership tests for large vocabularies.
        vocab = set(words)
        fragments = [(x, y, z) for x, y, z in fragments if
                     x in vocab and z in vocab]

    return fragments


def create_noun_counts(in_files, out_path):
    """
    Count the nouns of all given conll files and store the counts as JSON.

    Parameters
    ----------
    in_files : iterable of str
        Paths of the conll files to read.
    out_path : str
        Path of the JSON file to write; it maps each lowercased noun form
        to its number of occurrences.
    """
    counts = Counter()
    for path in in_files:
        counts.update(nouns_from_conll(path))

    # Close the file deterministically so the JSON is fully flushed.
    with open(out_path, 'w') as out_file:
        json.dump(dict(counts), out_file)


def nouns_from_conll(path):
    """Yield the lowercased form of every token tagged NOUN in a conll file."""
    for sentence in pyconll.iter_from_file(path):
        yield from (tok.form.lower()
                    for tok in sentence
                    if tok.upos == "NOUN")

In [None]:
# plot.py

"""Plotting of attention distributions."""
from matplotlib import pyplot as plt


def plot_attention(attentions, texts):
    """
    Plot one heat-map row per attention distribution.

    Parameters
    ----------
    attentions : sequence of np.array
        One 1-D array of attention weights per instance.
    texts : sequence of sequence of str
        The tick labels (tokens) belonging to each attention array.

    Returns
    -------
    fig : matplotlib figure
        The figure holding one subplot per instance, each cell annotated
        with its weight rounded to two decimals.
    """
    assert len(attentions) == len(texts)
    fig, axes = plt.subplots(len(attentions), 1, figsize=(5, 3))

    # With a single row, subplots returns a bare Axes instead of an array.
    if len(attentions) == 1:
        axes = [axes]

    for ax, att, txt in zip(axes, attentions, texts):
        ax.imshow(att[None, :],
                  vmin=.0,
                  vmax=1.0,
                  cmap="Reds",
                  aspect="auto")
        ax.set_xticks(range(att.shape[0]))
        ax.set_xticklabels(txt, rotation=45)
        ax.set_yticks([])

        for col, weight in enumerate(att):
            label = str(weight.round(2))
            # Drop the leading zero of "0.xx" only; slicing blindly would
            # turn "1.0" into ".0" and strip the sign of negative values.
            if label.startswith("0."):
                label = label[1:]
            ax.text(col-.2, 0, label)

    fig.tight_layout()
    return fig

In [None]:
# simple.py

"""Simple method."""
import numpy as np
from collections import defaultdict
#from .utils import normalize
from sklearn.metrics.pairwise import rbf_kernel
from collections import Counter


def get_aspects(fragments, embeddings, n_adj_seed, n_nouns, min_count):
    """
    Score candidate aspect nouns from (adjective, pattern, noun) fragments.

    The `n_adj_seed` most frequent adjectives are used as seed words, and
    the scoring itself is delegated to ``candidate``.
    """
    adjectives, _, nouns = zip(*fragments)
    adjective_counts = Counter(adjectives)
    seeds, _ = zip(*adjective_counts.most_common(n_adj_seed))

    return candidate(embeddings,
                     adjectives,
                     nouns,
                     seeds,
                     n_nouns,
                     min_count)


def candidate(embeddings,
              adj,
              noun,
              seed_words,
              n_nouns,
              min_count):
    """
    Generates candidate aspects based on adjective co-occurrences

    Every distinct adjective is scored by its highest similarity to any seed
    word. A noun's score is the sum of the scores of the adjectives it
    occurs with, counted once per occurrence.

    Parameters
    ----------
    embeddings : Reach
        The word embeddings; only ``embeddings.similarity(a, b)`` is used,
        and it must return a 2-D array with one row per item of ``a``.
    adj : sequence of str
        The adjective of every fragment.
    noun : sequence of str
        The noun of every fragment, aligned with `adj`.
    seed_words : list of str
        A list of strings. All these words should be in vocab for the
        given embeddings model.
    n_nouns : int
        The amount of items to return. Values of 0 or less return nothing.
    min_count : int
        Only nouns occurring strictly more often than this are kept.

    Returns
    -------
    candidates : list of (str, score) tuples
        The `n_nouns` highest scoring nouns, sorted by ascending score.
    """
    unique_adj = list(set(adj))
    sims = embeddings.similarity(unique_adj, seed_words).max(1)
    adj_scores = dict(zip(unique_adj, sims))

    # noun -> [summed adjective score, number of occurrences]
    noun_scores = defaultdict(lambda: [0, 0])
    for adjective, head_noun in zip(adj, noun):
        entry = noun_scores[head_noun]
        entry[0] += adj_scores[adjective]
        entry[1] += 1

    kept = {k: v[0] for k, v in noun_scores.items()
            if v[1] > min_count}

    # A slice of [-0:] would return the whole list instead of nothing.
    if n_nouns <= 0:
        return []
    return sorted(kept.items(), key=lambda x: x[1])[-n_nouns:]


def rbf_attention(vec, memory, gamma, **kwargs):
    """
    Single-head attention using RBF kernel.

    The kernel between all tokens and all memory items is computed; a
    token's weight is its summed kernel value divided by the grand total.

    Parameters
    ----------
    vec : np.array
        an (N, D)-shaped array, representing the tokens of an instance.
    memory : np.array
        an (M, D)-shaped array, representing the memory items
    gamma : float
        the gamma of the RBF kernel.

    Returns
    -------
    attention : np.array
        A (1, N)-shaped array, representing a single-headed attention mechanism
    """
    kernel = rbf_kernel(vec, memory, gamma)
    total = kernel.sum()
    if total != 0:
        per_token = kernel.sum(1)
        return (per_token / total)[None, :]
    # Nothing to distribute: fall back to uniform weights.
    n_tokens = len(vec)
    return np.ones((1, n_tokens)) / n_tokens


def softmax(x, axis=1):
    """Return the softmax of `x` along `axis`, shifted by the max first."""
    shifted = x - np.max(x, axis, keepdims=True)
    exponentials = np.exp(shifted)
    totals = exponentials.sum(axis=axis, keepdims=True)
    return exponentials / totals


def attention(vec, memory, **kwargs):
    """
    Dot-product attention with one head per memory item.

    Parameters
    ----------
    vec : np.array
        an (N, D)-shaped array, representing the tokens of an instance.
    memory : np.array
        an (M, D)-shaped array, representing the memory items

    Returns
    -------
    attention : np.array
        A (M, N)-shaped array, representing the attention over all memories;
        each row is a softmax over the N tokens.
    """
    tokens_t = vec.T
    logits = memory.dot(tokens_t)
    return softmax(logits)


def mean(vec, aspect_vecs, **kwargs):
    """Return uniform (1, N) weights over the N tokens; aspects are ignored."""
    n_tokens = len(vec)
    return np.full((1, n_tokens), 1.0) / n_tokens


def get_scores(instances,
               aspects,
               r,
               labels,
               remove_oov=False,
               attention_func=attention,
               **kwargs):
    """
    Score every instance against every label.

    Each instance is summarised per attention head as the attention-weighted
    sum of its token vectors. The normalized summaries are compared with the
    normalized label vectors by dot product and summed over the heads.

    Parameters
    ----------
    instances : list of list of str
        The tokenized instances. A single instance given as a flat list of
        tokens is also accepted.
    aspects : list of list of str
        The aspect terms; the vectors of each inner list are averaged into
        one memory item.
    r : Reach
        The embeddings; ``items``, ``vectorize`` and ``transform`` are used.
    labels : iterable of str
        The label words, all of which must be in ``r.items``. The columns of
        the result follow the iteration order of this argument, so pass a
        list or tuple when a reproducible column order is needed.
    remove_oov : bool, default False
        Passed to ``r.transform`` for the instances.
    attention_func : callable
        Called as ``attention_func(vec, aspect_vecs, **kwargs)`` and expected
        to return an (n_heads, n_words) array.

    Returns
    -------
    scores : np.array
        An (n_instances, n_labels)-shaped array.
    """
    # Freeze one iteration order for both the check and the vectors.
    labels = list(labels)
    assert all(x in r.items for x in labels)
    label_vecs = normalize(r.vectorize(labels))
    aspect_vecs = np.stack([x.mean(0)
                            for x in r.transform(aspects,
                                                 remove_oov=False)])
    # Wrap only a flat token list. Testing len(instances) == 1 would nest
    # a list that already holds exactly one instance a second time.
    if len(instances) > 0 and isinstance(instances[0], str):
        instances = [instances]

    t = r.transform(instances, remove_oov=remove_oov)

    out = []
    for vec in t:
        att = attention_func(vec, aspect_vecs, **kwargs)
        # Att = (n_heads, n_words)
        z = att.dot(vec)
        # z = (n_heads, n_dim)
        x = normalize(z).dot(label_vecs.T)
        # x = (n_heads, n_labels)
        out.append(x.sum(0))
    return np.stack(out)

In [None]:
# preprocessing.py

"""Creating fragments takes a long time so we treat it as a
pre-processing step."""
import logging
import json

from gensim.models import Word2Vec
#from cat.fragments import create_noun_counts
#from cat.utils import conll2text
from collections import Counter

logging.basicConfig(level=logging.INFO)


if __name__ == "__main__":

    # Pipeline: count nouns, dump plain text, train word2vec, then store
    # the in-vocabulary nouns ordered by frequency as aspect words.
    paths = ["/content/drive/MyDrive/소캡디/data/data.txt"]

    create_noun_counts(paths,
                       "/content/drive/MyDrive/소캡디_ABAE_Pytorch/nouns.json")
    conll2text(paths, "/content/drive/MyDrive/소캡디_ABAE_Pytorch/all_txt.txt")
    # NOTE(review): the corpus is read from the same file that is parsed as
    # CoNLL above, not from the all_txt.txt just written - confirm intended.
    with open("/content/drive/MyDrive/소캡디/data/data.txt") as corpus_file:
        corpus = [x.lower().strip().split() for x in corpus_file]

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names;
    # gensim 4 renamed them to `vector_size` and `epochs`. Confirm the
    # installed version before running.
    f = Word2Vec(corpus,
                 sg=0,
                 negative=5,
                 window=10,
                 size=200,
                 min_count=2,
                 iter=5,
                 workers=10)

    f.wv.save_word2vec_format("/content/drive/MyDrive/소캡디_ABAE_Pytorch/my_word_vectors.vec")

    with open("/content/drive/MyDrive/소캡디_ABAE_Pytorch/nouns.json") as noun_file:
        d = json.load(noun_file)
    nouns = Counter()
    for k, v in d.items():
        # KeyedVectors has no `items` attribute; `in` is its vocabulary test.
        if k.lower() in f.wv:
            nouns[k.lower()] += v

    # Most frequent first; a comprehension also works when no noun survived.
    nouns = tuple(word for word, _ in sorted(nouns.items(),
                                             key=lambda x: x[1],
                                             reverse=True))

    with open("/content/drive/MyDrive/소캡디_ABAE_Pytorch/aspect_words.json", "w") as out_file:
        json.dump(nouns, out_file)

In [None]:
!pip install reach

In [None]:
# run.py

import json

#from cat.simple import get_scores, rbf_attention
from reach import Reach
from collections import defaultdict


GAMMA = .03
N_ASPECT_WORDS = 200

if __name__ == "__main__":

    scores = defaultdict(dict)
    # NOTE(review): these relative paths differ from the Drive paths the
    # preprocessing cell writes to - confirm where the files live.
    r = Reach.load("embeddings/my_word_vectors.vec",
                   unk_word="<UNK>")

    with open("data/aspect_words.json") as aspect_file:
        aspects = [[x] for x in json.load(aspect_file)]
    aspects = aspects[:N_ASPECT_WORDS]

    instances = ["text_1".split(), "text_2".split()]
    label_set = {"label1", "label2", "label3"}
    # A set has no defined order; sort it so that column i of the scores
    # (and therefore every index in `pred`) maps to labels[i].
    labels = sorted(label_set)

    s = get_scores(instances,
                   aspects,
                   r,
                   labels,
                   gamma=GAMMA,
                   remove_oov=False,
                   attention_func=rbf_attention)

    # Index into `labels` of the best scoring label per instance.
    pred = s.argmax(1)