In [None]:
# Import lxml's XML parser, used here to read the TEI-encoded play (its text and metadata).
# The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.7 to 3.9. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ.
import lxml.etree
# Shrew.xml has to be downloaded from GitHub beforehand for this demo to run
SHREW_XML_PATH = 'Shrew.xml'
tree = lxml.etree.parse(SHREW_XML_PATH)

# Printing the parsed tree object confirms that the file was loaded
print(tree)

In [None]:
# Serialize the parsed tree back to text and show only its first 500 characters
xml_text = lxml.etree.tostring(tree).decode()
print(xml_text[:500])

In [None]:
# The root is the outermost element of the data tree and its children are the
# elements nested directly inside it (for a TEI file the root is expected to
# be the TEI element itself).
root = tree.getroot()
print(root.tag)

# Number of direct children of the root
print(len(root))

# Walk two levels down, printing each element's tag and attributes
for child in root:
    print(child.tag, child.attrib)
    for nested in child:
        print(nested.tag, nested.attrib)

In [None]:
# Step past the header: take the second child of the root, then that element's
# second child. Presumably this is the element holding the actual play text --
# confirm against the tags printed by the previous cell.
second_child = tree.getroot()[1]
start = second_child[1]
print(start.tag)
# List the tags of the elements directly below the starting point
for child in start:
    print(child.tag)

In [None]:
# A bare tag name finds nothing here: the element names carry a namespace
# (visible in the tags printed earlier), so this lookup is expected to print None.
print(tree.getroot().find('title'))

# Namespace map: binds the 'tei' prefix to the TEI namespace URI so that
# queries can navigate the tree using prefixed tag names.
NSMAP = {'tei': 'http://www.tei-c.org/ns/1.0'}
title_element = tree.getroot().find('.//tei:title', namespaces=NSMAP)
print(title_element.text)

In [None]:
# Builds a character relationship network for the play.
# Each node is an identified speaker; an edge links two speakers when one speaks directly after the other.
def character_network(tree):
    """Build a weighted speaker-adjacency network for a play.

    Every scene (a ``tei:div2`` element with ``type="scene"``) is scanned
    for its speech elements (``tei:sp``) in document order. Each pair of
    consecutive speeches within a scene contributes one interaction
    between the two speakers named in their ``who`` attributes.

    Arguments:
        tree: An lxml tree or element supporting ``iterfind``, holding
            one of the XML files in the Folger Shakespeare collection.

    Returns:
        An undirected NetworkX Graph whose nodes are speaker IDs and
        whose edge attribute ``weight`` counts how often the two
        speakers' speeches were adjacent.

    """
    def speaker_id(speech):
        # Keep the part of the `who` value before the first underscore and
        # drop any '#' characters; raises KeyError when `who` is absent.
        return speech.attrib['who'].split('_')[0].replace('#', '')

    network = nx.Graph()
    for scene in tree.iterfind('.//tei:div2[@type="scene"]', NSMAP):
        speeches = scene.findall('.//tei:sp', NSMAP)
        # Pair every speech with the one that immediately follows it
        for current, following in zip(speeches, speeches[1:]):
            try:
                source = speaker_id(current)
                target = speaker_id(following)
            except KeyError:
                # One of the two speeches names no speaker: skip the pair
                continue
            if network.has_edge(source, target):
                # Known pair: bump the existing interaction count
                network[source][target]['weight'] += 1
            else:
                # First interaction between these two speakers
                network.add_edge(source, target, weight=1)
    return network

# Speakers are the nodes; each edge is weighted by the count of the two speakers' interactions.

In [None]:
# Import the graph and plotting libraries, then build the character network for the play.
import networkx as nx
import matplotlib.pyplot as plt

# Build the network from the root element of the parsed play and report its size
play_root = tree.getroot()
G = character_network(play_root)
print(f"N nodes = {G.number_of_nodes()}, N edges = {G.number_of_edges()}")

In [None]:
import collections

# Total interaction weight per speaker, accumulated over every edge
interactions = collections.Counter()

for source, target, attrs in G.edges(data=True):
    # Credit the edge's weight to both of its endpoints
    for speaker in (source, target):
        interactions[speaker] += attrs['weight']

# One marker size per node, in the graph's node order: five times the
# speaker's interaction total
nodesizes = [5 * interactions[name] for name in G.nodes]

In [None]:
# Create an empty 15x15-inch figure with a single set of axes to draw on
fig, ax = plt.subplots(figsize=(15, 15))
# Compute the positions of the nodes using the spring layout algorithm.
# The layout starts from random positions, so a fixed seed is passed to
# make the figure identical on every run of the notebook.
pos = nx.spring_layout(G, k=0.5, iterations=200, seed=42)
# Then, add the edges to the visualization
nx.draw_networkx_edges(G, pos, alpha=0.4, ax=ax)
# Subsequently, add the nodes, sized by each speaker's interaction total
nx.draw_networkx_nodes(G, pos, node_size=nodesizes, alpha=0.4, ax=ax)
# Finally, add the labels (i.e. the speaker IDs) to the visualization
nx.draw_networkx_labels(G, pos, ax=ax)
# Hide the axis frame and ticks; the trailing ';' suppresses the cell's output repr
ax.axis('off');

In [None]:


import json
from networkx.readwrite import json_graph

# Convert the graph to node-link data and store it on disk as JSON
with open('shrew.json', 'w') as f:
    graph_payload = json_graph.node_link_data(G)
    f.write(json.dumps(graph_payload))

# Read the stored JSON back in ...
with open('shrew.json') as f:
    d = json.loads(f.read())

# ... and rebuild the graph from it (this rebinds G to the reloaded graph)
G = json_graph.node_link_graph(d)
print(f"Graph with {len(G.nodes())} nodes and {len(G.edges())} edges.")

