In [1]:
import networkx as nx
from pathlib import Path
from bs4 import BeautifulSoup
from itertools import combinations
import matplotlib
import csv
import copy
from shakespear_extra import SHAKESPEAR_GENRES

SHAKEDRACOR ANALYSIS

In [4]:
# helper function if network is split into disconnected parts
def get_largest_G(input_G):
    """Return the largest connected component of ``input_G`` as a graph.

    Parameters
    ----------
    input_G : networkx.Graph
        Undirected graph to reduce.

    Returns
    -------
    networkx.Graph
        ``input_G`` itself (same object, not a copy) when it is already
        connected. Otherwise a new, independent graph containing only the
        nodes and edges of the largest connected component; ``input_G`` is
        not modified.

    Notes
    -----
    When several components tie for the largest size, exactly one of them is
    kept (the first one yielded by ``nx.connected_components``), so the
    result is always connected and can be passed to ``nx.diameter``.

    Any exception raised by ``nx.is_connected`` (e.g. for a graph without
    nodes) propagates unchanged.
    """
    if nx.is_connected(input_G):
        return input_G
    # max(..., key=len) picks a single component even on ties; filtering on
    # "size != max size" would keep every tied component and leave the graph
    # disconnected.
    largest_component = max(nx.connected_components(input_G), key=len)
    # subgraph() is a read-only view; copy() detaches it from input_G.
    return input_G.subgraph(largest_component).copy()

In [5]:
# open xml files and parse soups
# Maps each file stem (e.g. 'coriolanus') to the parsed TEI document.
shake_soups = {}
# sorted() makes the play order reproducible; Path.glob() yields files in an
# arbitrary, filesystem-dependent order, and that order later becomes the row
# order of the CSV.
for xml_path in sorted(Path('shakedracor/tei').glob('*.xml')):
    # Hand the parser raw bytes so it decodes with the encoding declared in the
    # XML prolog instead of the platform's default text encoding.
    shake_soups[xml_path.stem] = BeautifulSoup(xml_path.read_bytes(), 'lxml-xml')

In [6]:
# get significant characters who have their own dialogue - THIS IS FOR FURTHER ANALYSIS, NOT FOR MATCHING WHOLE DRACOR NETWORK
# A character counts as "significant" when at least one <sp> element lists it
# as the only speaker in its `who` attribute.
significant_characters = []
for sp in shake_soups['coriolanus'].find_all('sp', {'who': True}):
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so an empty or oddly spaced `who` value never produces an empty-string id
    # (split(' ') would turn who="" into [''] and record '' as a character).
    speakers = sp['who'].split()
    if len(speakers) == 1 and speakers[0] not in significant_characters:
        significant_characters.append(speakers[0])

In [7]:
# Build the co-occurrence networks: two characters are linked when both speak in
# the same <div type="scene">. For every play we build the network of the whole
# play ('whole') and five variants with one act left out ('wo1' ... 'wo5').

# Set to True FOR FURTHER ANALYSIS USING ONLY SIGNIFICANT CHARACTERS: every
# character who never has a speech of their own (i.e. only ever appears in a
# multi-speaker `who` attribute) is then removed from each network.
ONLY_SIGNIFICANT_CHARACTERS = False

# Act numbers to leave out in turn; they determine the 'wo<n>' keys in G_list.
ACT_NUMBERS = [1, 2, 3, 4, 5]


def build_scene_network(scenes, only_significant=False):
    """Build a character co-occurrence graph from a collection of scenes.

    Parameters
    ----------
    scenes : iterable of bs4 tags
        Scene elements; every ``<sp who="...">`` inside a scene contributes the
        whitespace-separated ids of its ``who`` attribute as speakers.
    only_significant : bool, default False
        When True, drop every node that is never the sole speaker of an
        ``<sp>`` element within ``scenes``.

    Returns
    -------
    (networkx.Graph, list)
        The graph, and the list of node ids removed because of
        ``only_significant`` (empty when the flag is False).
    """
    graph = nx.Graph()
    significant_characters = set()

    for scene in scenes:
        # Distinct speakers of this scene, in order of first appearance. A list
        # (not a set) keeps node order independent of string hashing, so
        # 'kept_characters' is identical from run to run.
        scene_characters = []
        for sp in scene.find_all('sp', {'who': True}):
            # split() ignores irregular whitespace and yields [] for who="".
            speakers = sp['who'].split()
            if len(speakers) == 1:
                significant_characters.add(speakers[0])
            for speaker in speakers:
                if speaker not in scene_characters:
                    scene_characters.append(speaker)

        # Adding the nodes explicitly keeps a character who is alone in a scene
        # (no matter how many speeches they have there) even when they never
        # share a scene with anybody else.
        graph.add_nodes_from(scene_characters)
        # Edges are computed once per scene from this scene's speakers only, so
        # a scene without speakers contributes nothing.
        graph.add_edges_from(combinations(scene_characters, 2))

    removed_nodes = []
    if only_significant:
        # remove those who have no independent dialogue
        removed_nodes = [node for node in graph.nodes
                         if node not in significant_characters]
        graph.remove_nodes_from(removed_nodes)

    return graph, removed_nodes


G_list = {}
for name, soup in shake_soups.items():
    # network of the whole play: every scene, wherever it sits in the document
    whole, removed_nodes = build_scene_network(
        soup.find_all('div', {'type': 'scene'}),
        only_significant=ONLY_SIGNIFICANT_CHARACTERS,
    )

    title_tag = soup.find('title')
    G_list[name] = {'whole': whole,
                    # fall back to the file stem if the document has no <title>
                    'title_pretty': title_tag.get_text(strip=True) if title_tag is not None else name,
                    'kept_characters': ', '.join(whole.nodes()),
                    'character_count': whole.number_of_nodes(),
                    'removed_characters': ', '.join(removed_nodes),
                    'removed_characters_count': len(removed_nodes),
                    }

    # networks with one act left out
    for n in ACT_NUMBERS:
        # Only scenes nested inside a <div type="act"> are considered here, so
        # scenes outside any act (if the TEI contains such) are part of 'whole'
        # but of none of the 'wo<n>' networks. An act without an `n` attribute
        # is never the one left out.
        remaining_scenes = [scene
                            for act in soup.find_all('div', {'type': 'act'})
                            if act.get('n') != str(n)
                            for scene in act.find_all('div', {'type': 'scene'})]
        G_list[name]['wo' + str(n)] = build_scene_network(
            remaining_scenes,
            only_significant=ONLY_SIGNIFICANT_CHARACTERS,
        )[0]

**CSV WRITING**

In [8]:
# Descriptive columns, written once per play.
column_names = [
    'title', 'title_pretty', 'genre',
    'kept_characters', 'character_count',
    'removed_characters', 'removed_characters_count',
]
# Metrics computed for every network variant of a play.
metric_names = ['density', 'diameter', 'average_clustering']
# One column per (network variant, metric) pair, e.g. 'wo3_diameter'.
for w in ('whole', 'wo1', 'wo2', 'wo3', 'wo4', 'wo5'):
    for n in metric_names:
        column_names.append(f'{w}_{n}')

In [11]:
def network_metrics(graph):
    """Return density, diameter and average clustering of ``graph`` as a dict.

    The diameter is measured on the largest connected component (via
    ``get_largest_G``) because ``nx.diameter`` is undefined for disconnected
    graphs. For a graph without nodes every metric is reported as an empty
    string, so one empty network does not abort the whole export.
    """
    if graph.number_of_nodes() == 0:
        return {'density': '', 'diameter': '', 'average_clustering': ''}
    return {'density': nx.density(graph),
            'diameter': nx.diameter(get_largest_G(graph)),
            'average_clustering': nx.average_clustering(graph)}


# Explicit UTF-8 so non-ASCII characters in titles are written identically on
# every platform; newline='' is what the csv module requires for its own
# line-ending handling.
with open('shakedracore_metrics8_misitest.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_names)
    writer.writeheader()

    for name, graph_dict in G_list.items():
        drama_dict = {'title': name,
                      'title_pretty': graph_dict['title_pretty'],
                      'genre': SHAKESPEAR_GENRES[name],
                      'kept_characters': graph_dict['kept_characters'],
                      'character_count': graph_dict['character_count'],
                      # Filled only when the network-building step recorded
                      # removed characters; otherwise the cells stay empty.
                      'removed_characters': graph_dict.get('removed_characters', ''),
                      'removed_characters_count': graph_dict.get('removed_characters_count', ''),
                      }

        # Same three metrics for the whole play and each "without act n" network.
        for w in ['whole', 'wo1', 'wo2', 'wo3', 'wo4', 'wo5']:
            for metric, value in network_metrics(graph_dict[w]).items():
                drama_dict[w + '_' + metric] = value

        writer.writerow(drama_dict)

In [None]:
# results
# G_list maps each play to a dict of networks and metadata; the figures printed
# here are for the whole-play network stored under 'whole'.
for name, graph_dict in G_list.items():
    whole_graph = graph_dict['whole']
    print(f'{name:>15} DENSITY:', nx.density(whole_graph))
    # diameter is only defined for connected graphs, so measure the largest component
    print(f'{name:>15} DIAMETER:', nx.diameter(get_largest_G(whole_graph)))
    print(f'{name:>15} AVERAGE CLUSTERING:', nx.average_clustering(whole_graph), '\n')