In [1]:
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import csv

In [2]:
# Root folder that the loading loop walks (sub-folder -> sub-folder -> *.bess.xml files).
# NOTE(review): this is a machine-specific absolute Windows path; point it at the
# local copy of the corpus before running the notebook.
path_overall = 'C:\\Users\\arvra\\Documents\\UVa files\\Classes\\Fall_18\\Capstone Project\\books\\books'

### Function to extract event-related BESS tags from an XML file

In [3]:
def bess_to_dataframe(bess_file_path):
    """Parse one BESS XML file into a DataFrame of event annotations.

    Each row is one sub-tag of an ``<event>`` element found under
    ``<analysis>``, paired with the paragraph number it refers to and with
    the file-level metadata read from ``<textAnalyzed>``.

    Parameters
    ----------
    bess_file_path : str
        Path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when no line of the file contains the text ``"BESS"`` or when
        the document has no ``<textAnalyzed>`` element. Otherwise a frame with
        the columns ``'para no'``, ``'Event'``, ``'Content'``,
        ``'collectionID'``, ``'biographyID'``, ``'URI'``, ``'author'``,
        ``'title'`` and ``'personaName'`` and a fresh 0..n-1 index. A file
        that yields no event rows still produces a single row holding the
        metadata, with NaN in the three event columns.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file passes the textual pre-check but is not well-formed XML.
    AttributeError
        If ``collectionID``, ``biographyID``, ``URI`` or ``personaName`` is
        missing from ``<textAnalyzed>``.

    Notes
    -----
    ``'para no'`` holds the reference text (a ``str``) for events tagged with
    ``textUnitReference`` and an ``int`` for paragraphs expanded from a
    ``textUnitRangeReference``.
    """
    # Cheap textual check so files without BESS markup never reach the parser.
    if not _mentions_bess(bess_file_path):
        return None

    root = ET.parse(bess_file_path).getroot()

    # No <textAnalyzed> means there is no metadata to attach: nothing to return.
    textAnalyzed = root.find('textAnalyzed')
    if textAnalyzed is None:
        return None

    # Author and title are optional and fall back to "Unknown" when the
    # element is absent; the other four fields are required.
    author_element = textAnalyzed.find('author')
    title_element = textAnalyzed.find('title')
    metadata = {
        'collectionID': textAnalyzed.find('collectionID').text,
        'biographyID': textAnalyzed.find('biographyID').text,
        'URI': textAnalyzed.find('URI').text,
        'author': "Unknown" if author_element is None else author_element.text,
        'title': "Unknown" if title_element is None else title_element.text,
        'personaName': textAnalyzed.find('personaName').text,
    }

    # Gather plain tuples first and build the frame once; concatenating a
    # one-row frame per tag is quadratic in the number of rows.
    rows = []
    analysis = root.find('analysis')
    if analysis is not None:
        for member in analysis:
            # Only <event> annotations are extracted here.
            if member.tag == 'event':
                rows.extend(_event_rows(member))

    if not rows:
        # Keep a single placeholder row so the file's metadata is still reported.
        missing = float('nan')
        rows.append((missing, missing, missing))

    events = pd.DataFrame(rows, columns=['para no', 'Event', 'Content'])
    # Broadcast the file-level metadata explicitly onto every event row.
    for column, value in metadata.items():
        events[column] = value
    return events


def _mentions_bess(file_path):
    """Return True when any line of the file contains the text "BESS"."""
    # "BESS" is plain ASCII, so undecodable bytes can be replaced safely; the
    # context manager closes the handle even when the scan stops early.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
        return any("BESS" in line for line in handle)


def _event_rows(event):
    """Return the (para no, tag, text) tuples described by one <event> element."""
    # list(element) replaces Element.getchildren(), removed in Python 3.9.
    children = list(event)
    rows = []
    para_number = None

    for child in children:
        if len(child) == 0:
            # Leaf tag: either the paragraph reference or an annotation of it.
            if child.text is None:
                continue
            if child.tag == 'textUnitReference':
                para_number = child.text
                continue
            if para_number is None:
                # An annotation seen before any paragraph reference: stop
                # reading this event.
                break
            rows.append((para_number, child.tag, child.text))
        elif child.tag == 'textUnitRangeReference':
            # The event spans several paragraphs; the first two children are
            # read as the inclusive start and end paragraph numbers.
            start_para = int(child[0].text)
            end_para = int(child[1].text)
            for group_para in range(start_para, end_para + 1):
                # Every child after the first (assumed to be the range
                # reference itself) is repeated for each paragraph.
                rows.extend((group_para, sibling.tag, sibling.text)
                            for sibling in children[1:])

    return rows

### Looping through all the folders to get the data frames for each xml file

In [4]:
# Collect one frame per parsed file and concatenate once at the end:
# re-concatenating the growing frame for every file is quadratic in the
# total number of rows.
bess_frames = []

### Looping through each of the collective biography
for a_files in os.listdir(path_overall):
    # os.path.join keeps the walk independent of the platform's path separator.
    collection_dir = os.path.join(path_overall, a_files)
    if not os.path.isdir(collection_dir):
        continue

    ### Looping through each of the individual biography
    for bio_files in os.listdir(collection_dir):
        biography_dir = os.path.join(collection_dir, bio_files)
        if not os.path.isdir(biography_dir):
            continue

        files = os.listdir(biography_dir)
        bess_files = [each_file for each_file in files if ".bess.xml" in each_file]

        ### Looping through each of the valid file in an individual biography
        for each_bess_file in bess_files:
            bess_file_path = os.path.join(biography_dir, each_bess_file)
            bess_df_current = bess_to_dataframe(bess_file_path)
            # The parser returns None for files without usable BESS content.
            if bess_df_current is not None:
                bess_frames.append(bess_df_current)

# Single concatenation with a fresh 0..n-1 index; fall back to an empty frame
# when no file produced any rows.
if bess_frames:
    bess_df_all = pd.concat(bess_frames, axis=0, ignore_index=True)
else:
    bess_df_all = pd.DataFrame()

In [5]:
# Sanity check: (rows, columns) of the combined frame built by the loading loop.
bess_df_all.shape

(83183, 9)

In [6]:
# Preview the first rows to confirm the event columns and the per-file metadata
# columns are both populated.
bess_df_all.head()

Unnamed: 0,Content,Event,para no,URI,author,biographyID,collectionID,personaName,title
0,seduction,type,1,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
1,suicide,type,1,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
2,"death, persona's",type,1,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
3,"male adult, other",agentType,1,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
4,"lover, male, unnamed",agentType,1,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
