In [21]:
from transformers import BertTokenizer, FlavaModel
import spacy
import torch

# Single source of truth for the checkpoint: the model and its tokenizer are
# loaded from the same identifier, so keeping it in one constant prevents the
# two from drifting apart when the checkpoint is changed.
FLAVA_CHECKPOINT = "facebook/flava-full"

model = FlavaModel.from_pretrained(FLAVA_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(FLAVA_CHECKPOINT)
# spaCy pipeline; later cells read dependency labels and heads from its output.
nlp = spacy.load("en_core_web_sm")

`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.


In [35]:
sentence = "SpaCy provides rich linguistic annotations, and you can explore various properties."

# Encode the sentence and show the raw ids produced by the tokenizer.
inputs = tokenizer(sentence)
input_ids = inputs["input_ids"]
print(input_ids)

# Turn the ids back into token strings, leaving out the "[CLS]" and "[SEP]"
# markers so that only the sentence's own pieces remain.
# TODO: add these back in in the graph
tokens = [
    piece
    for piece in tokenizer.convert_ids_to_tokens(input_ids)
    if piece not in ("[CLS]", "[SEP]")
]
print(tokens)

# Run the spaCy pipeline on the same sentence and list, for every word, its
# dependency label and the word it attaches to (with both positions).
edges = nlp(sentence)
for word in edges:
    print(f"{word.text} [{word.i}] -- {word.dep_} --> {word.head.text} [{word.head.i}]")

class Graph:
    """Directed graph on ``n`` nodes, stored as a dense n-by-n matrix of edge labels.

    ``mat[u][v]`` holds the label of the edge u -> v, or ``None`` when no edge
    has been set.
    """

    def __init__(self, n):
        """Create a graph with ``n`` nodes and no edges."""
        self.n = n
        # Build every row separately so rows never alias one another.
        self.mat = [[None] * n for _ in range(n)]

    def add_edge(self, u, v, edge_type):
        """Store ``edge_type`` as the label of edge u -> v, replacing any previous label."""
        row = self.mat[u]
        row[v] = edge_type

    def get_01_graph(self):
        """Return a new n-by-n matrix with 1 where a label is set (not ``None``), else 0."""
        binary = []
        for i in range(self.n):
            binary.append([0 if self.mat[i][j] is None else 1 for j in range(self.n)])
        return binary
    

# One graph node per wordpiece token (special tokens were filtered out earlier).
graph = Graph(len(tokens))
sentence_arr = [token.text for token in edges]

# Align every spaCy word with the wordpiece indices it was split into: a word
# owns its first piece plus each immediately following continuation piece.
sentence_to_token_idxs = []
i = 0
for word in edges:
    token_idxs = [i]
    i += 1
    # "##" marks a continuation piece as a *prefix*, so test the prefix rather
    # than looking for the marker anywhere inside the piece.
    while i < len(tokens) and tokens[i].startswith("##"):
        token_idxs.append(i)
        i += 1
    sentence_to_token_idxs.append(token_idxs)

# The walk above only works when spaCy and the wordpiece tokenizer agree on
# word boundaries. If they disagree, pieces are left over (or indices run past
# the end) and every edge below would land on the wrong node, so fail loudly
# here instead of building a silently misaligned graph.
if i != len(tokens):
    raise ValueError(
        f"spaCy/wordpiece alignment failed: {len(sentence_to_token_idxs)} words "
        f"consumed {i} wordpieces but the tokenizer produced {len(tokens)}"
    )

# Add one edge per (dependent piece, head piece) pair, labelled with the
# dependency type and pointing from the dependent word towards its head.
# A word that is its own head therefore yields self-edges on its pieces.
for word in edges:
    print(f"{word.text} {[tokens[idx] for idx in sentence_to_token_idxs[word.i]]}")
    from_nodes = sentence_to_token_idxs[word.i]
    to_nodes = sentence_to_token_idxs[word.head.i]
    for from_node in from_nodes:
        for to_node in to_nodes:
            graph.add_edge(from_node, to_node, word.dep_)

# print 01 matrix
for row in graph.get_01_graph():
    print(row)

# print type matrix
for row in graph.mat:
    print(row)


[101, 12403, 5666, 3640, 4138, 12158, 5754, 17287, 9285, 1010, 1998, 2017, 2064, 8849, 2536, 5144, 1012, 102]
['spa', '##cy', 'provides', 'rich', 'linguistic', 'ann', '##ota', '##tions', ',', 'and', 'you', 'can', 'explore', 'various', 'properties', '.']
SpaCy [0] -- nsubj --> provides [1]
provides [1] -- ROOT --> provides [1]
rich [2] -- amod --> annotations [4]
linguistic [3] -- amod --> annotations [4]
annotations [4] -- dobj --> provides [1]
, [5] -- punct --> provides [1]
and [6] -- cc --> provides [1]
you [7] -- nsubj --> explore [9]
can [8] -- aux --> explore [9]
explore [9] -- conj --> provides [1]
various [10] -- amod --> properties [11]
properties [11] -- dobj --> explore [9]
. [12] -- punct --> explore [9]
SpaCy ['spa', '##cy']
provides ['provides']
rich ['rich']
linguistic ['linguistic']
annotations ['ann', '##ota', '##tions']
, [',']
and ['and']
you ['you']
can ['can']
explore ['explore']
various ['various']
properties ['properties']
. ['.']
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
tokenizer.