In [14]:
pip install python-gedcom

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install tabulate


Note: you may need to restart the kernel to use updated packages.


In [16]:
### gedcom library

from gedcom.parser import Parser
from gedcom.element.individual import IndividualElement
from gedcom.element.family import FamilyElement
import pandas as pd
import numpy as np
from tabulate import tabulate
import datetime
from datetime import date

import warnings
warnings.filterwarnings("ignore")

In [17]:
def calculateAge(birthDate, deathDate):
    
      age = deathDate.year - birthDate.year - ((deathDate.month, deathDate.day) <
            (birthDate.month, birthDate.day))
         
 
      return age

In [18]:
def getIndividuals(file):
    
    ### dataframe to store individuals details
    individuals = pd.DataFrame(columns=['ID','Name','Gender','Birthday','Age','Alive','Death','Child','Spouse'])
    
    ### Parse the gedcom file
    parser = Parser()
    parser.parse_file(file)
    
    ### Iterate through each element and pull an individual data based on level and tag conditions
    for ele in parser.get_element_list():
        
        if(IndividualElement.is_individual(ele) == True):
            
            famSpouses = set()
            famChilds = set()            
            id = ele.get_pointer().replace('@', '')
            deathDate = 'NA'
            alive = 'True'
            
            ### For each individual element, below extracting the personal data
            child_elements = IndividualElement.get_child_elements(ele)
            for child in child_elements:
                if(child.get_tag() == 'NAME'):
                    name = child.get_value()
                    # print(name)
                elif(child.get_tag() == 'SEX'):
                    gender = child.get_value()
                    # print(gender)
                elif(child.get_tag() == 'BIRT'):
                    birthDate = datetime.datetime.strptime(child.get_child_elements()[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    # print(birthDate)
                    age = calculateAge(date(int(birthDate.split('-')[0]),int(birthDate.split('-')[1]),int(birthDate.split('-')[2])), 
                            date.today())
                elif(child.get_tag() == 'FAMS'):
                    famSpouses.add(child.get_value().replace('@', ''))
                    # print('Spouse ', famSpouses)
                elif(child.get_tag() == 'FAMC'):
                    famChilds.add(child.get_value().replace('@', ''))
                    # print('Childs ', famChilds)
                elif(child.get_tag() == 'DEAT'):
                    deathDate = datetime.datetime.strptime(child.get_child_elements()[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    # print(deathDate)
                    alive = 'False'
                    age = calculateAge(date(int(birthDate.split('-')[0]),int(birthDate.split('-')[1]),int(birthDate.split('-')[2])), 
                           date(int(deathDate.split('-')[0]),int(deathDate.split('-')[1]),int(deathDate.split('-')[2])))
                    
        
            # print('*********')  
        ### Appending each individual to dataframe
        
            if not famChilds:
                famChilds = {}
            if not famSpouses:
                famSpouses = {}
                
            row = {'ID': id, 'Name': name, 'Gender': gender, 
                'Birthday': birthDate, 'Age': age, 'Alive': alive,
                'Death': deathDate, 'Child': famChilds, 'Spouse': famSpouses}
            
            individuals = individuals.append(row, ignore_index = True)
            
    return individuals


In [19]:
def getFamilies(file):
    
    ### dataframe to store individuals details
    families = pd.DataFrame(columns=['ID','Married','Divorced','Husband ID','Husband Name','Wife ID','Wife Name','Children'])
    
    ### Parse the gedcom file
    parser = Parser()
    parser.parse_file(file)
    
    for ele in parser.get_element_list():

        if(FamilyElement.is_family(ele) == True):
                
                childrens = set()
                id = ele.get_pointer().replace('@', '')
                divorce = 'NA'
                childs = IndividualElement.get_child_elements(ele)
                for child in childs:
                    
                    if(child.get_tag() == 'HUSB'):
                        husbID = child.get_value().replace('@', '')
                    elif(child.get_tag() == 'WIFE'):
                        wifeID = child.get_value().replace('@', '')
                    elif(child.get_tag() == 'CHIL'):
                        childrens.add(child.get_value().replace('@', ''))
                    elif(child.get_tag() == 'MARR'):
                        marriage = datetime.datetime.strptime(IndividualElement.get_child_elements(child)[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    elif(child.get_tag() == 'DIV'):
                        divorce = datetime.datetime.strptime(IndividualElement.get_child_elements(child)[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                        
                        
                individuals = getIndividuals(file)
                
                husbName = individuals.loc[individuals['ID'] == husbID, 'Name'].iloc[0]
                wifeName = individuals.loc[individuals['ID'] == wifeID, 'Name'].iloc[0]
                ### Appending each individual to dataframe
                row = {'ID': id, 'Married': marriage, 'Divorced': divorce, 
                    'Husband ID': husbID, 'Husband Name': husbName, 'Wife ID': wifeID,
                    'Wife Name': wifeName, 'Children': childrens}
                
                families = families.append(row, ignore_index = True)
    
    return families

In [28]:
### Enter your file name

file = input("Enter the file name ")

individuals = getIndividuals(file)
print(tabulate(individuals, headers='keys', tablefmt='psql'))

families = getFamilies(file)
print(tabulate(families, headers='keys', tablefmt='psql'))


with open('output_file.txt', 'w') as f:
    f.write(tabulate(individuals, headers='keys', tablefmt='psql')+'\n')
    f.write(tabulate(families, headers='keys', tablefmt='psql')+'\n')

+----+------+-------------------+----------+------------+-------+---------+------------+---------+--------------+
|    | ID   | Name              | Gender   | Birthday   |   Age | Alive   | Death      | Child   | Spouse       |
|----+------+-------------------+----------+------------+-------+---------+------------+---------+--------------|
|  0 | I1   | Alex /Rubin/      | M        | 1950-10-14 |    59 | False   | 2010-05-11 | {}      | {'F1'}       |
|  1 | I2   | Michelle /Rubin/  | F        | 1955-06-08 |    -5 | False   | 1950-09-04 | {}      | {'F1', 'F2'} |
|  2 | I3   | Jhon /Rubin/      | M        | 1980-08-08 |    42 | True    | NA         | {'F1'}  | {'F3'}       |
|  3 | I4   | Emily /Rubin/     | F        | 1984-11-13 |    -5 | False   | 1980-11-09 | {'F1'}  | {}           |
|  4 | I5   | Crystal /Rubin/   | F        | 1985-06-08 |    38 | True    | NA         | {}      | {'F3'}       |
|  5 | I6   | Leo /Rubin/       | M        | 2010-05-07 |    13 | True    | NA         |

In [29]:
### US 03 - Birth before death
def verify_birth(individuals):
    
    errors = []
    
    if(type(individuals) is not pd.DataFrame):
        
        print('The data should be Dataframe to process')
    elif(individuals is None):
        
        print('The data is empty')
    else:
        for i in range(0,len(individuals)):
            
            if(individuals.iloc[i]['Birthday'] != 'NA' and individuals.iloc[i]['Death'] != 'NA'):    
                
                if(datetime.datetime.strptime(individuals.iloc[i]['Birthday'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.iloc[i]['Death'], '%Y-%m-%d')):
                    
                    errors.append('ERROR: INDIVIDUAL: US03 ' + individuals.iloc[i]['ID'] + ' Died ' + individuals.iloc[i]['Death'] + ' before born '+individuals.iloc[i]['Birthday'])
                    print('ERROR: INDIVIDUAL: US03 ' + individuals.iloc[i]['ID'] + ' Died ' + individuals.iloc[i]['Death'] + ' before born '+individuals.iloc[i]['Birthday'])
                    
    return errors

In [30]:
errors = verify_birth(individuals)

with open("output_file.txt", 'a') as f:
    for e in errors:
        f.write(e+'\n')

ERROR: INDIVIDUAL: US03 I2 Died 1950-09-04 before born 1955-06-08
ERROR: INDIVIDUAL: US03 I4 Died 1980-11-09 before born 1984-11-13


In [31]:
### US 05 - Marriage before death
def verify_marriage(families, individuals):
    
    errors = []
    
    if(type(families) is not pd.DataFrame):
        
        print('The data should be Dataframe to process')
    elif(families is None):
        
        print('The data is empty')
    else:
        for i in range(0,len(families)):
            
            if(individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0] != 'NA' and families.iloc[i]['Married'] != 'NA'):    
                
                if(datetime.datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0], '%Y-%m-%d')):
                    
                    errors.append('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married'+ families.iloc[i]['Married'] + ' after husband/s '+families.iloc[i]['Husband ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0])
                    print('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married'+ families.iloc[i]['Married'] + ' after husband/s '+families.iloc[i]['Husband ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0])
            
            elif(individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0] != 'NA' and families.iloc[i]['Married'] != 'NA'):
                
                if(datetime.datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0], '%Y-%m-%d')):
                    
                    errors.append('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married '+ families.iloc[i]['Married'] + ' after wife/s '+families.iloc[i]['Wife ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0])
                    print('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married '+ families.iloc[i]['Married'] + ' after wife/s '+families.iloc[i]['Wife ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0])
    return errors

In [32]:
errors = verify_marriage(families, individuals)

with open("output_file.txt", 'a') as f:
    for e in errors:
        f.write(e+'\n')

ERROR: Family: US05 F2: Married 2015-08-15 after wife/s I2 death on 1950-09-04
