In [9]:
import csv
from pathlib import Path
from itertools import combinations
from collections import defaultdict

import networkx as nx
from bs4 import BeautifulSoup

In [11]:
# Map drama name -> normalized genre, restricted to the two genres analysed here.
comedies_and_tragedies = {}
# newline='' is what the csv module expects for files handed to a reader;
# the explicit encoding avoids depending on the platform default.
with open('fredracor-metadata.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['normalizedGenre'] in ('Tragedy', 'Comedy'):
            comedies_and_tragedies[row['name']] = row['normalizedGenre']

### Files

In [3]:
# Directory that is searched for the TEI '*.xml' files of the corpus.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
fredracor_tei_files = '/home/misinagy/Projects/fredracor/tei'

### FREDRACOR CUMULATIVE

##### Helpers

In [33]:
def one_appearance_unit_edge_list(unit, lonely_nodes):
    """
    Take unit of BS4 tag and extract all connections between speakers in unit.

    Parameters
    ----------
    unit : BS4 tag
        Element whose ``<sp who="...">`` descendants are inspected.
    lonely_nodes : list
        Accumulator of speakers that are alone in a unit. It is extended
        in place and also returned.

    Returns
    -------
    tuple
        ``(edge_list, lonely_nodes)`` where ``edge_list`` holds one 2-tuple
        per unordered pair of distinct speakers found in the unit.
    """
    speakers = set()

    for sp in unit.find_all('sp', {'who': True}):
        # `who` can reference several speakers; split() without an argument
        # also copes with repeated blanks, which split(' ') would turn into
        # empty-string "speakers".
        speakers.update(sp['who'].split())

    # sorted() keeps the orientation and order of the pairs deterministic
    edge_list = list(combinations(sorted(speakers), 2))

    # if only one character in the unit, add lonely node; counted on the
    # distinct speakers so a character speaking several times alone is kept
    if len(speakers) == 1:
        lonely_nodes.extend(speakers)

    return edge_list, lonely_nodes


def get_largest_G(input_G, name=None):
    """
    Return the largest connected part of an nx Graph object.

    A connected graph is handed back unchanged (the very same object);
    otherwise a copy reduced to the biggest connected component is returned.
    `name` only appears in the error raised for a graph without nodes.
    """
    if input_G.number_of_nodes() == 0:
        raise ValueError(f'ZERO NODE GRAPH PASSED TO get_largest_G - {input_G}: {name}')

    if nx.is_connected(input_G):
        return input_G

    # keep the biggest component only, working on a copy of the input
    biggest_component = max(nx.connected_components(input_G), key=len)
    outside_nodes = set(input_G.nodes) - biggest_component
    trimmed = input_G.copy()
    trimmed.remove_nodes_from(outside_nodes)
    return trimmed

##### Collection

In [5]:
# SETTINGS
# Adjust the values below if needed!

# Dramas left out of the collection (bad/different structure)
BLACKLIST = [
    'anonyme-vende',
]

# <div type="..."> values that count as acts
ACT_TAGS = [
    'act',
    'acte',
    'prologue',
    'ate',
    'partie',
    'critique',
    'tableau',
    'intermede',
]

# <div type="..."> values that count as scenes
SCENE_TAGS = [
    'scene',
    'ecene',
    'zcene',
    'scne',
    'type',
    'vaudeville',
    'ballet',
    'divertissement',
    'epilogue',
    'couplet',
    'couplets',
    'marche',
]


In [6]:
# Open the XML files of the tragedies and comedies and keep the parsed soups that have
# exactly 5 act-type divs and more than 5 entries in the cast list.

def div_type_tag_count(s: BeautifulSoup, tag_names: list):
    """Return how many <div> elements of `s` have a `type` listed in `tag_names` (summed per name)."""
    return sum(
        len(s.find_all('div', {'type': div_type}))
        for div_type in tag_names
    )

# Parsed soups of the selected dramas, keyed by file stem (the drama name).
fre_soups = {}
# sorted() makes the processing order, and so the order of fre_soups, reproducible
for xml_path in sorted(Path(fredracor_tei_files).glob('*.xml')):

    # Genres handled according to dracor metadata csv for consistency
    if xml_path.stem not in comedies_and_tragedies or xml_path.stem in BLACKLIST:
        continue

    # Read bytes so the XML parser determines the encoding itself instead of
    # relying on the platform's default text encoding.
    with open(xml_path, 'rb') as fh:
        soup = BeautifulSoup(fh.read(), 'lxml-xml')

    # Keep only dramas with exactly 5 act-type divs and more than 5 cast entries
    acts_len = div_type_tag_count(soup, ACT_TAGS)

    # find() returns None for a missing element (find_all() never does), so
    # the cast list container has to be checked before it is searched.
    profile_desc = soup.find('profileDesc')
    list_person = profile_desc.find('listPerson') if profile_desc is not None else None
    if list_person is None:
        raise ValueError(f'No cast list found in {xml_path.name}!')
    cast_list = list_person.find_all('person')

    if acts_len == 5 and len(cast_list) > 5:
        fre_soups[xml_path.stem] = soup
print(f'{len(fre_soups)} collected')

400 collected


In [7]:
# CREATE THE MAIN EXTRACTOR FUNCTION
# (the caller decides which act segments are passed in, e.g. the first n acts of a drama)
def edge_list_extractor(list_of_soup_segments, name=None):
    """
    Takes list of soup elements, returns edge list for shared scenes, and if no scenes, just shared acts.

    The docstring used to describe this as GerDracor specific; in this
    notebook it is applied to the FreDraCor dramas collected above
    (5 act-type divs, more than 5 cast entries, genre Tragedy or Comedy).

    Parameters
    ----------
    list_of_soup_segments : list
        Act-level BS4 tags to process. ``None`` entries are skipped when they
        are the first or the last item.
    name : str, optional
        Drama name used in error messages. When omitted, the notebook-level
        variable ``name`` is used if it exists.

    Returns
    -------
    tuple
        ``(edge_list, lonely_nodes)``: the speaker pairs of all processed
        units and the speakers that are alone in a unit.

    Raises
    ------
    ValueError
        If a segment other than the first or last one is ``None``, or if the
        act-level sanity check on a scene-less act fails.
    """
    if name is None:
        # Backward compatibility: existing callers do not pass the name and
        # the messages used to read the global `name` directly, which raised
        # NameError instead of the intended ValueError when it was not set.
        name = globals().get('name', '<unknown drama>')

    lonely_nodes = []
    edge_list_in_iteration = []
    n_segments = len(list_of_soup_segments)

    # iterate over n acts
    for c, act in enumerate(list_of_soup_segments, start=1):
        if act is None:
            # a missing first or last segment is tolerated, a gap in between is not
            if c == 1 or c == n_segments:
                continue
            raise ValueError(f'ACT {c} IS NONE IN {name}')

        # a list as attribute filter matches any of the listed div types
        all_scene_type_tags = act.find_all('div', {'type': SCENE_TAGS})

        # IF IT HAS SCENES
        if len(all_scene_type_tags) > 0:
            for scene_tag in all_scene_type_tags:
                if scene_tag.find('div', {'type': 'scene'}) is not None:
                    # The wrapper is skipped; the nested scene divs are in
                    # all_scene_type_tags themselves because find_all searches
                    # all descendants.
                    print('Scene tag has scene tags')
                else:
                    scene_edge_list, lonely_nodes = one_appearance_unit_edge_list(scene_tag, lonely_nodes)
                    edge_list_in_iteration += scene_edge_list

        else:
            # NOTE(review): act.find(elem) looks for an element *named* like the
            # act types, not for a div with that `type` - confirm the intent.
            if any(act.find(elem) is not None for elem in ACT_TAGS):
                raise ValueError(f'Unaccounted div type in {name} !')
            # no scene-type divs: the whole act counts as one unit
            act_edge_list, lonely_nodes = one_appearance_unit_edge_list(act, lonely_nodes)
            edge_list_in_iteration += act_edge_list

    return edge_list_in_iteration, lonely_nodes

In [27]:
# MAIN CUMULATIVE NETWORKS GENERATION FOR FREDRACOR

def _network_from_edge_list(edge_list, lonely_nodes):
    """Build a graph from the speaker pairs and add the speakers that only appear alone."""
    graph = nx.from_edgelist(set(edge_list))
    # add those with independent scenes (nodes already present are left as they are)
    graph.add_nodes_from(lonely_nodes)
    return graph


# drama name -> {'soup', 'genre', 'title_pretty', 'whole', 'acts_1', 'acts_1-2', ...}
cumulative_G_list_fre = defaultdict(dict)

# NOTE: the loop variable has to stay `name`; the error messages of
# edge_list_extractor refer to the notebook-level variable of that name.
for name, soup in fre_soups.items():

    # Metadata annotation for dict
    cumulative_G_list_fre[name]['soup'] = soup
    cumulative_G_list_fre[name]['genre'] = comedies_and_tragedies[name]
    cumulative_G_list_fre[name]['title_pretty'] = soup.find('title').get_text(strip=True)

    # These are used to determine what structural elements there are.
    # One query with all act types returns the divs in document order;
    # collecting them type by type grouped them by type instead, which put
    # e.g. a prologue after the acts in the cumulative prefixes below.
    all_acts = list(soup.find_all('div', {'type': ACT_TAGS}))

    # create network of the whole drama
    edge_list, lonely_nodes_main = edge_list_extractor(all_acts)
    whole_drama = _network_from_edge_list(edge_list, lonely_nodes_main)
    cumulative_G_list_fre[name]['whole'] = whole_drama

    # Cumulative calculation
    print(f'Processing {name}', (50-(len(name)))*' ', end='')

    # create iterations for 1, 1-2, 1-2-3, ... acts
    for iteration_round in range(1, len(all_acts)+1):
        # take first n acts from drama
        acts_included = all_acts[:iteration_round]
        print(f'-{len(acts_included)}', end='')

        edge_list_in_iteration_out, lonely_nodes_main = edge_list_extractor(acts_included)
        n_acts_whole = _network_from_edge_list(edge_list_in_iteration_out, lonely_nodes_main)

        # e.g. 'acts_1-2-3' for the first three acts
        label = f"acts_{'-'.join(str(i) for i in range(1, iteration_round+1))}"

        cumulative_G_list_fre[name][label] = n_acts_whole
    print('\n---------------')

Processing magnon-sejanus                                     -1-2-3-4-5
---------------
Processing tristan-mort-de-chrispe                            -1-2-3-4-5
---------------
Processing genlis-amant-anonyme                               -1-2-3-4-5
---------------
Processing crebillon-idomenee                                 -1-2-3-4-5
---------------
Processing du-ryer-saul                                       -1-2-3-4-5
---------------
Processing corneillep-pulcherie                               -1-2-3-4-5
---------------
Processing la-calprenede-hermenegilde                         -1-2-3-4-5
---------------
Processing soret-ceciliade                                    -1-2-3-4-5
---------------
Processing quinault-bellerophon                               -1-2-3-4-5
---------------
Processing scarron-fausse-apparence                           -1-2-3-4-5
---------------
Processing voltaire-irene                                     -1-2-3-4-5
---------------
Processing boyer-arta

##### CSV WRITING FOR FREDRACOR RESULTS

In [34]:
# Write csvs

csv_filename = 'fredracor_cumulative.csv'
column_names = ['title', 'title_pretty', 'genre']
metric_names = ['density', 'diameter', 'average_clustering']
# NOTE(review): not used in this cell. It counts every key of the per-drama
# dict (metadata entries included) minus one - confirm before relying on it.
most_acts = max([len(v) for v in cumulative_G_list_fre.values()]) - 1

# Number of cumulative act prefixes written per drama (acts_1 ... acts_1-2-3-4-5)
N_ACTS = 5
act_prefix_labels = [
    f"acts_{'-'.join(str(i) for i in range(1, last_act + 1))}"
    for last_act in range(1, N_ACTS + 1)
]

for acts_name in act_prefix_labels:
    for metric in metric_names:
        column_names.append(f'{acts_name}_{metric}')

# Compute every row before the file is opened, so that a failing metric
# does not leave a truncated csv behind.
ger_result_dict = defaultdict(dict)
for name, drama_data in cumulative_G_list_fre.items():
    drama_dict = {'title': name,
                  'title_pretty': drama_data['title_pretty'],
                  'genre': drama_data['genre']}

    for acts_name in act_prefix_labels:
        if acts_name not in drama_data:
            raise ValueError(f'{acts_name} not in {name}')
        graph = drama_data[acts_name]
        drama_dict[f'{acts_name}_density'] = nx.density(graph)
        # the diameter is computed on the largest connected part of the graph
        drama_dict[f'{acts_name}_diameter'] = nx.diameter(get_largest_G(graph, name))
        drama_dict[f'{acts_name}_average_clustering'] = nx.average_clustering(graph)
    ger_result_dict[name] = drama_dict

# explicit encoding: the titles are written independently of the platform default
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_names)
    writer.writeheader()
    writer.writerows(ger_result_dict.values())