<a href="https://colab.research.google.com/github/Shivam-Miglani/easdrl_datasets/blob/main/nltopddl_data_emb_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair datasets

In [15]:
import pandas as pd
import numpy as np
import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings

In [19]:
# Dataset prefix. The cells below read f"{DATASET}_act.pkl" and f"{DATASET}_arg.pkl"
# and write f"{DATASET}_act_<emb>.csv" and f"{DATASET}_arg_<emb>.csv".
DATASET = "win2k"

# ACT: embed the action dataset (`{DATASET}_act.pkl`)

In [20]:
# Embedding configurations to generate; one output CSV is written per entry,
# in this order.
emb_names = [
    "glove",
    "en",
    "bert-base-uncased",
    "roberta-base",
    "xlm-roberta-base",
]

def assign_rl_actions(x):
    """Build the per-token label vector for one document row.

    Every token starts with label 1. For each entry in ``x["acts"]`` the token
    at ``act_idx`` is relabelled to ``act_type + 1``; when several entries
    share an index, the last one wins.

    Args:
        x: Mapping-like row providing "tokens" (a sized sequence) and "acts"
            (an iterable of mappings with "act_idx" and "act_type").

    Returns:
        1-D ``np.int8`` array with one label per token.
    """
    labels = np.full(len(x["tokens"]), 1, dtype=np.int8)
    for entry in x["acts"]:
        position = entry["act_idx"]
        labels[position] = entry["act_type"] + 1
    return labels

def get_stacked_embedding(emb_name, mean=True):
    """Create the flair embedding object for one embedding configuration.

    - "glove" or "en": a plain ``WordEmbeddings`` of that name, not stacked.
    - anything else: ``emb_name`` is passed to ``TransformerWordEmbeddings``
      as the model name (e.g. "bert-base-uncased", "roberta-base") and the
      result is stacked after GloVe word embeddings.

    ``mean`` is forwarded as ``layer_mean`` for the four selected transformer
    layers ("-1,-2,-3,-4"). Sizes noted by the original author for bert-base:
    mean=True gives 768 + 100 (glove) = 868; mean=False gives
    768 * 4 + 100 (glove) = 3172.
    """
    # Classic word embeddings are returned on their own.
    for plain_name in ("glove", "en"):
        if emb_name == plain_name:
            return WordEmbeddings(plain_name)

    # Transformer models are always paired with GloVe (GloVe first).
    glove = WordEmbeddings("glove")
    transformer = TransformerWordEmbeddings(
        emb_name,
        layers="-1,-2,-3,-4",
        layer_mean=mean,
        subtoken_pooling="mean",
        use_context=True,
    )
    return StackedEmbeddings([glove, transformer])

def add_padding(x, num_words=None, word_dim=None):
    """Pad or truncate one document row to a fixed number of token rows.

    Rows with fewer than ``num_words`` tokens are extended: "rl_actions" gets
    label 1 appended and "doc_embs" gets all-zero vectors appended. Longer
    rows are cut to the first ``num_words`` entries. In both cases
    "rl_actions" is returned with shape ``(num_words, 1)`` (the original only
    reshaped it in the padding branch).

    Args:
        x: Mapping-like row (dict or pandas Series) with "tokens",
            "rl_actions" (1-D labels, one per token) and "doc_embs"
            (2-D array, one row per token).
        num_words: Target number of token rows. Defaults to the module-level
            ``NUM_WORDS``.
        word_dim: Width of the zero vectors used for padding. Defaults to the
            module-level ``WORD_DIM``; only consulted when padding is needed.

    Returns:
        The same ``x`` object, with "rl_actions" and "doc_embs" replaced.
    """
    if num_words is None:
        num_words = NUM_WORDS

    # The pad length is derived from the token count, as in the original code.
    pad_len = num_words - len(x["tokens"])
    if pad_len > 0:
        if word_dim is None:
            word_dim = WORD_DIM
        rl_actions = np.concatenate((x["rl_actions"], np.ones(pad_len, dtype=np.int32)))
        doc_embs = np.concatenate((x["doc_embs"], np.zeros([pad_len, word_dim])))
    else:
        rl_actions = np.asarray(x["rl_actions"][:num_words])
        doc_embs = x["doc_embs"][:num_words]

    # Same column-vector shape regardless of which branch ran.
    x["rl_actions"] = rl_actions.reshape(num_words, 1)
    x["doc_embs"] = doc_embs
    return x

def get_doc_embeddings(sentences):
    """Embed every token of a document, one sentence at a time.

    Each string in ``sentences`` is wrapped in a flair ``Sentence`` (with
    ``use_tokenizer=False``) and embedded with the module-level
    ``STACKED_EMBEDDING``. The token vectors of all sentences are collected
    in order.

    Returns:
        ``np.array`` built from the list of per-token embedding vectors.
    """
    token_vectors = []
    for text in sentences:
        flair_sentence = Sentence(text, use_tokenizer=False)
        STACKED_EMBEDDING.embed(flair_sentence)
        # Move each token embedding to CPU before converting it to numpy.
        token_vectors.extend(tok.embedding.cpu().numpy() for tok in flair_sentence)
    return np.array(token_vectors)

# For every embedding configuration: load the ACT pickle, embed each document,
# pad/truncate to NUM_WORDS rows and write f"{DATASET}_act_{EMB_NAME}.csv".
for EMB_NAME in emb_names:
    print(EMB_NAME)

    # Reload the raw data on every pass because the frame is reshaped below.
    act_df = pd.DataFrame(pd.read_pickle(f'{DATASET}_act.pkl'))
    act_df = act_df.rename(columns={"words": "tokens"})
    act_df["doc_len"] = act_df["tokens"].apply(len)
    act_df["exclusive_related_actions"] = act_df["acts"].apply(
        lambda doc_acts: {act["act_idx"]: act["related_acts"] for act in doc_acts}
    )
    act_df["rl_actions"] = act_df.apply(assign_rl_actions, axis=1)
    # Sentences are stored as token lists; join them into whitespace-separated strings.
    act_df["sents_"] = act_df["sents"].apply(
        lambda doc_sents: [" ".join(sent_tokens) for sent_tokens in doc_sents]
    )

    # get_doc_embeddings reads STACKED_EMBEDDING from module scope.
    STACKED_EMBEDDING = get_stacked_embedding(EMB_NAME)
    act_df["doc_embs"] = act_df["sents_"].apply(get_doc_embeddings)

    # add_padding reads NUM_WORDS and WORD_DIM from module scope.
    NUM_WORDS = 512
    WORD_DIM = act_df["doc_embs"].loc[0][0].shape[0]  # width of the first token vector of row 0

    act_df = act_df.apply(add_padding, axis=1)
    act_df = act_df.drop(columns=["sents_", "sent_acts", "word2sent", "acts", "doc_len"])

    # Final state: token embeddings with the label appended as the last column.
    act_df['state_with_label'] = act_df.apply(
        lambda row: np.concatenate(
            (row['doc_embs'], row['rl_actions'].reshape(NUM_WORDS, 1)), axis=1
        ),
        axis=1,
    )
    act_df = act_df.drop(columns=['sents', 'doc_embs', 'rl_actions'])

    # NOTE(review): array-valued cells are written as their string form; numpy
    # abbreviates large arrays with "..." by default, so the CSV probably does
    # not hold the full state matrix - confirm before relying on these files.
    act_df.to_csv(f"{DATASET}_act_{EMB_NAME}.csv", index=False)


glove
en
bert-base-uncased
roberta-base
xlm-roberta-base


# ARG: embed the argument dataset (`{DATASET}_arg.pkl`)

In [21]:
from tqdm import tqdm
# Fixed number of token rows per ARG sample: shorter inputs are padded,
# longer ones are truncated.
NUM_WORDS = 150

# Embedding configurations to generate; one output CSV is written per entry.
emb_names = [
    "glove",
    "en",
    "bert-base-uncased",
    "roberta-base",
    "xlm-roberta-base",
]

def get_stacked_embedding(emb_name, mean=True):
    """Create the flair embedding object for one embedding configuration.

    - "glove" or "en": a plain ``WordEmbeddings`` of that name, not stacked.
    - anything else: ``emb_name`` is passed to ``TransformerWordEmbeddings``
      as the model name (e.g. "bert-base-uncased", "roberta-base") and the
      result is stacked after GloVe word embeddings.

    ``mean`` is forwarded as ``layer_mean`` for the four selected transformer
    layers ("-1,-2,-3,-4"). Sizes noted by the original author for bert-base:
    mean=True gives 768 + 100 (glove) = 868; mean=False gives
    768 * 4 + 100 (glove) = 3172.
    """
    # Classic word embeddings are returned on their own.
    for plain_name in ("glove", "en"):
        if emb_name == plain_name:
            return WordEmbeddings(plain_name)

    # Transformer models are always paired with GloVe (GloVe first).
    glove = WordEmbeddings("glove")
    transformer = TransformerWordEmbeddings(
        emb_name,
        layers="-1,-2,-3,-4",
        layer_mean=mean,
        subtoken_pooling="mean",
        use_context=True,
    )
    return StackedEmbeddings([glove, transformer])

def gen_stacked_embedding(words):
    """Embed a list of words as a single whitespace-joined sentence.

    The words are joined with single spaces, wrapped in a flair ``Sentence``
    (with ``use_tokenizer=False``) and embedded with the module-level
    ``STACKED_EMBEDDING``.

    Returns:
        ``np.array`` built from the per-token embedding vectors, in order.
    """
    sentence = Sentence(" ".join(words), use_tokenizer=False)
    STACKED_EMBEDDING.embed(sentence)
    # Move each token embedding to CPU before converting it to numpy.
    return np.array([token.embedding.cpu().numpy() for token in sentence])

# For every embedding configuration: build one record per ground-truth action
# of every sample in the ARG pickle and write f"{DATASET}_arg_{EMB_NAME}.csv".
#
# Each record's state is a (NUM_WORDS, WORD_DIM + 2) matrix: token embeddings,
# then the distance of each token to the action index, then the tag column.
for EMB_NAME in emb_names:
    # gen_stacked_embedding reads STACKED_EMBEDDING from module scope.
    STACKED_EMBEDDING = get_stacked_embedding(EMB_NAME)
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    docs = pd.read_pickle(f"{DATASET}_arg.pkl")[-1]
    arg_sents = []

    # Distinct index names: the original reused `i` here and shadowed the outer loop.
    for doc_idx in tqdm(range(len(docs)), position=0, leave=True):
        doc = docs[doc_idx]
        for sample_idx in range(len(doc)):
            sample = doc[sample_idx]
            if len(sample) == 0:
                continue

            words = sample["last_sent"] + sample["this_sent"]
            sent_len = len(words)  # here sent len is last_sent + this_sent
            sample_acts = sample["acts"]
            act_inds = [a["act_idx"] for a in sample_acts if a["act_idx"] < NUM_WORDS]
            if len(sample_acts) == 0:
                # Nothing to emit; skip the (expensive) embedding step entirely.
                continue

            # The embedding depends only on `words`, so compute and pad it once
            # per sample instead of once per action.
            base_vec = gen_stacked_embedding(words)
            WORD_DIM = base_vec[0].shape[0]
            pad_len = NUM_WORDS - len(base_vec)
            if pad_len > 0:
                padded_vec = np.concatenate((base_vec, np.zeros([pad_len, WORD_DIM])))
                tokens = words
            else:
                padded_vec = base_vec[:NUM_WORDS]
                tokens = words[:NUM_WORDS]
            # Number of real (non-padding) positions that receive a distance value.
            n_real = min(sent_len, NUM_WORDS)

            # for each ground-truth action in each doc's each sample
            for act in sample_acts:
                act_ind = act["act_idx"]  # action index
                obj_inds = act["obj_idxs"]  # object index list

                # assign rl_action tags (1 = default)
                arg_tags = np.ones(sent_len, dtype=np.int32)
                if len(obj_inds[1]) == 0:
                    arg_tags[obj_inds[0]] = 2  # essential objects
                else:
                    arg_tags[obj_inds[0]] = 4  # exclusive objects
                    arg_tags[obj_inds[1]] = 4  # exclusive objects

                # distance of every token position to the action position
                distance = np.abs(np.arange(sent_len) - act_ind)

                # Pad tags with the default tag, or truncate, to NUM_WORDS rows.
                if pad_len > 0:
                    tags = np.concatenate((arg_tags, np.ones(pad_len, dtype=np.int32)))
                else:
                    tags = arg_tags[:NUM_WORDS]
                tags = tags.reshape(NUM_WORDS, 1)

                # Distance column; padding positions keep distance 0.
                distance_col = np.zeros([NUM_WORDS, 1])
                distance_col[:n_real, 0] = distance[:n_real]

                # RL State for arg DQN (concatenate allocates a fresh array per action)
                sent_vec = np.concatenate((padded_vec, distance_col), axis=1)

                # Key order determines the column order of the output CSV.
                arg_sents.append(
                    {
                        "tokens": tokens,
                        "tags": tags,
                        "act_ind": act_ind,
                        "distance": distance,
                        "act_inds": act_inds,
                        "obj_inds": obj_inds,
                        "sent_vec": sent_vec,
                    }
                )

    arg_df = pd.DataFrame(arg_sents)
    arg_df['state_with_label'] = arg_df.apply(
        lambda row: np.concatenate((row['sent_vec'], row['tags']), axis=1), axis=1
    )
    arg_df = arg_df.drop(columns=['tags', 'sent_vec', 'distance'])

    # NOTE(review): array-valued cells are written as their string form; numpy
    # abbreviates large arrays with "..." by default, so the CSV probably does
    # not hold the full state matrix - confirm before relying on these files.
    arg_df.to_csv(f"{DATASET}_arg_{EMB_NAME}.csv", index=False)

100%|██████████| 154/154 [00:02<00:00, 75.75it/s]
100%|██████████| 154/154 [00:02<00:00, 72.74it/s]
100%|██████████| 154/154 [00:22<00:00,  6.81it/s]
100%|██████████| 154/154 [00:22<00:00,  6.74it/s]
100%|██████████| 154/154 [00:22<00:00,  6.86it/s]
