<a href="https://colab.research.google.com/github/NikolasGialitsis/DependencyParsing/blob/main/DependencyParsingDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import pprint
import string

import networkx as nx
import spacy
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans(dict.fromkeys(string.punctuation))


def preprocess_word(word):
    """Return ``word`` with all ``string.punctuation`` characters removed.

    Args:
        word: The string to clean.

    Returns:
        str: ``word`` without punctuation; may be empty if ``word`` consisted
        only of punctuation.
    """
    # Reading a module-level name requires no ``global`` declaration.
    return word.translate(translator)

def preprocess_sentence(text):
    """Lowercase ``text``, split it on whitespace and strip punctuation from each word.

    Tokens that consist solely of punctuation (for example a free-standing
    "-") are reduced to the empty string by ``preprocess_word``. They are
    dropped here so that no empty "word" ends up in the returned list.

    Args:
        text: The raw sentence.

    Returns:
        list of str: The cleaned, non-empty words in their original order.
    """
    cleaned_words = (preprocess_word(word) for word in text.lower().split())
    return [word for word in cleaned_words if word]

In [None]:

# Toy corpus used by the rest of the notebook.
sentences = [
    'nick had a little lamp',
    'cognihub is the best place on earth',
    #...
]

# One list of cleaned words per sentence.
preprocessed_sentences = [preprocess_sentence(raw_sentence) for raw_sentence in sentences]

# Vocabulary: every distinct word that occurs anywhere in the corpus.
wordset = {word for words in preprocessed_sentences for word in words}

print(wordset)

{'is', 'little', 'place', 'cognihub', 'had', 'best', 'earth', 'a', 'nick', 'lamp', 'the', 'on'}


In [None]:
nlp = spacy.load("en_core_web_sm")

# Sort the vocabulary so that every adjacency matrix uses the same row/column
# order on every run. Iterating over the `wordset` set directly yields an
# arbitrary order that can differ between interpreter sessions.
node_order = sorted(wordset)

# Graph with one isolated node per vocabulary word; copied for each sentence.
base_graph = nx.Graph()
base_graph.add_nodes_from(node_order)


def _print_graph_size(graph):
    """Print the number of nodes and the number of edges of ``graph``."""
    print(nx.number_of_nodes(graph), 'nodes in sentence graph')
    print(nx.number_of_edges(graph), 'edges in sentence graph')


# Maps sentence index -> sparse adjacency matrix of that sentence's graph.
# NOTE(review): this name shadows the builtin `repr()` for the rest of the
# session. It is kept because cells outside this view may refer to it.
repr = {}
for sentence_id, sentence_contents in enumerate(sentences):
    sentence_graph = base_graph.copy()
    processed_sentence = nlp(' '.join(preprocess_sentence(sentence_contents)))

    print('\n',processed_sentence)
    _print_graph_size(sentence_graph)

    for token in processed_sentence:
        nodeA = token.text
        nodeB = token.head.text
        if nodeA not in wordset or nodeB not in wordset:
            # spaCy may tokenize differently from the whitespace split that
            # built the vocabulary. Adding such a token would grow this graph
            # and give its matrix a different shape from all the others.
            print('\tskipping edge between', nodeA, 'and', nodeB, '(token not in vocabulary)')
            continue
        print('\tadding edge between', nodeA, 'and', nodeB)
        # The sentence root is its own head in spaCy, so it contributes a self-loop.
        sentence_graph.add_edge(nodeA, nodeB)
        # Recomputed after every edge only to report how the matrix fills up.
        sentence_representation = nx.adjacency_matrix(sentence_graph, nodelist=node_order) #sparse matrix
        print('\t\t sparse matrix has ',sentence_representation.count_nonzero(),'nonzero elements')

    _print_graph_size(sentence_graph)

    # Store the matrix once per sentence, after all edges are in place. Doing
    # it here rather than inside the token loop also covers a sentence that
    # yields no tokens, which would otherwise get no entry at all.
    repr[sentence_id] = nx.adjacency_matrix(sentence_graph, nodelist=node_order)


print('\n\nsaving representations')
with open('sentence_representations_adjmat.p','wb') as fw:
    pickle.dump(repr, fw)
    



 nick had a little lamp
12 nodes in sentence graph
0 edges in sentence graph
	adding edge between nick and had
		 sparse matrix has  2 nonzero elements
	adding edge between had and had
		 sparse matrix has  3 nonzero elements
	adding edge between a and lamp
		 sparse matrix has  5 nonzero elements
	adding edge between little and lamp
		 sparse matrix has  7 nonzero elements
	adding edge between lamp and had
		 sparse matrix has  9 nonzero elements
12 nodes in sentence graph
5 edges in sentence graph

 cognihub is the best place on earth
12 nodes in sentence graph
0 edges in sentence graph
	adding edge between cognihub and is
		 sparse matrix has  2 nonzero elements
	adding edge between is and is
		 sparse matrix has  3 nonzero elements
	adding edge between the and place
		 sparse matrix has  5 nonzero elements
	adding edge between best and place
		 sparse matrix has  7 nonzero elements
	adding edge between place and is
		 sparse matrix has  9 nonzero elements
	adding edge between on a