In [None]:
pip install python-gedcom

In [None]:
pip install tabulate


In [3]:
### gedcom library

from gedcom.parser import Parser
from gedcom.element.individual import IndividualElement
from gedcom.element.family import FamilyElement
import pandas as pd
import numpy as np
from tabulate import tabulate
import datetime
from datetime import date
from datetime import timedelta


import warnings
warnings.filterwarnings("ignore")

In [4]:
def calculateAge(birthDate, deathDate):
    
      age = deathDate.year - birthDate.year - ((deathDate.month, deathDate.day) <
            (birthDate.month, birthDate.day))
         
 
      return age

In [5]:
def getIndividuals(file):
    
    ### dataframe to store individuals details
    individuals = pd.DataFrame(columns=['ID','Name','Gender','Birthday','Age','Alive','Death','Child','Spouse'])
    
    ### Parse the gedcom file
    parser = Parser()
    parser.parse_file(file)
    
    ### Iterate through each element and pull an individual data based on level and tag conditions
    for ele in parser.get_element_list():
        
        if(IndividualElement.is_individual(ele) == True):
            
            famSpouses = set()
            famChilds = set()            
            id = ele.get_pointer().replace('@', '')
            deathDate = 'NA'
            alive = 'True'
            
            ### For each individual element, below extracting the personal data
            child_elements = IndividualElement.get_child_elements(ele)
            for child in child_elements:
                if(child.get_tag() == 'NAME'):
                    name = child.get_value()
                    # print(name)
                elif(child.get_tag() == 'SEX'):
                    gender = child.get_value()
                    # print(gender)
                elif(child.get_tag() == 'BIRT'):
                    birthDate = datetime.datetime.strptime(child.get_child_elements()[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    # print(birthDate)
                    age = calculateAge(date(int(birthDate.split('-')[0]),int(birthDate.split('-')[1]),int(birthDate.split('-')[2])), 
                            date.today())
                elif(child.get_tag() == 'FAMS'):
                    famSpouses.add(child.get_value().replace('@', ''))
                    # print('Spouse ', famSpouses)
                elif(child.get_tag() == 'FAMC'):
                    famChilds.add(child.get_value().replace('@', ''))
                    # print('Childs ', famChilds)
                elif(child.get_tag() == 'DEAT'):
                    deathDate = datetime.datetime.strptime(child.get_child_elements()[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    # print(deathDate)
                    alive = 'False'
                    age = calculateAge(date(int(birthDate.split('-')[0]),int(birthDate.split('-')[1]),int(birthDate.split('-')[2])), 
                           date(int(deathDate.split('-')[0]),int(deathDate.split('-')[1]),int(deathDate.split('-')[2])))
                    
        
            # print('*********')  
        ### Appending each individual to dataframe
        
            if not famChilds:
                famChilds = {}
            if not famSpouses:
                famSpouses = {}
                
            row = {'ID': id, 'Name': name, 'Gender': gender, 
                'Birthday': birthDate, 'Age': age, 'Alive': alive,
                'Death': deathDate, 'Child': famChilds, 'Spouse': famSpouses}
            
            #individuals = individuals.append(row, ignore_index = True)#

            individuals = pd.concat([individuals, pd.DataFrame([row])], ignore_index=True)
            
    return individuals


In [6]:
def getFamilies(file):
    
    ### dataframe to store individuals details
    families = pd.DataFrame(columns=['ID','Married','Divorced','Husband ID','Husband Name','Wife ID','Wife Name','Children'])
    
    ### Parse the gedcom file
    parser = Parser()
    parser.parse_file(file)
    
    for ele in parser.get_element_list():

        if(FamilyElement.is_family(ele) == True):
                
                childrens = set()
                id = ele.get_pointer().replace('@', '')
                divorce = 'NA'
                childs = IndividualElement.get_child_elements(ele)
                for child in childs:
                    
                    if(child.get_tag() == 'HUSB'):
                        husbID = child.get_value().replace('@', '')
                    elif(child.get_tag() == 'WIFE'):
                        wifeID = child.get_value().replace('@', '')
                    elif(child.get_tag() == 'CHIL'):
                        childrens.add(child.get_value().replace('@', ''))
                    elif(child.get_tag() == 'MARR'):
                        marriage = datetime.datetime.strptime(IndividualElement.get_child_elements(child)[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                    elif(child.get_tag() == 'DIV'):
                        divorce = datetime.datetime.strptime(IndividualElement.get_child_elements(child)[0].get_value(),'%d %b %Y').strftime('%Y-%m-%d')
                        
                        
                individuals = getIndividuals(file)
                
                husbName = individuals.loc[individuals['ID'] == husbID, 'Name'].iloc[0]
                wifeName = individuals.loc[individuals['ID'] == wifeID, 'Name'].iloc[0]
                ### Appending each individual to dataframe
                row = {'ID': id, 'Married': marriage, 'Divorced': divorce, 
                    'Husband ID': husbID, 'Husband Name': husbName, 'Wife ID': wifeID,
                    'Wife Name': wifeName, 'Children': childrens}
                
                #families = families.append(row, ignore_index = True)#

                families = pd.concat([families, pd.DataFrame([row])], ignore_index=True)
    
    return families

In [7]:
### Enter your file name

file = input("Enter the file name ")

individuals = getIndividuals(file)
print(tabulate(individuals, headers='keys', tablefmt='psql'))

families = getFamilies(file)
print(tabulate(families, headers='keys', tablefmt='psql'))


with open('output_file.txt', 'w') as f:
    f.write(tabulate(individuals, headers='keys', tablefmt='psql')+'\n')
    f.write(tabulate(families, headers='keys', tablefmt='psql')+'\n')

+----+------+-------------------+----------+------------+-------+---------+------------+---------+--------------+
|    | ID   | Name              | Gender   | Birthday   |   Age | Alive   | Death      | Child   | Spouse       |
|----+------+-------------------+----------+------------+-------+---------+------------+---------+--------------|
|  0 | I1   | Alex /Rubin/      | M        | 1950-10-14 |    72 | False   | 2023-05-17 | {}      | {'F1'}       |
|  1 | I2   | Michelle /Rubin/  | F        | 1955-06-08 |    -5 | False   | 1950-09-04 | {}      | {'F1', 'F2'} |
|  2 | I3   | Jhon /Rubin/      | M        | 1980-08-08 |    42 | True    | NA         | {'F1'}  | {'F3'}       |
|  3 | I4   | Emily /Rubin/     | F        | 1984-11-13 |    -5 | False   | 1980-11-09 | {'F1'}  | {}           |
|  4 | I5   | Crystal /Rubin/   | F        | 1985-06-08 |    38 | True    | NA         | {}      | {'F3'}       |
|  5 | I6   | Leo /Rubin/       | M        | 2010-05-07 |    13 | True    | NA         |

In [8]:
### US 03 - Birth before death
def verify_birth(individuals):
    
    errors = []
    
    if(type(individuals) is not pd.DataFrame):
        
        print('The data should be Dataframe to process')
    elif(individuals is None):
        
        print('The data is empty')
    else:
        for i in range(0,len(individuals)):
            
            if(individuals.iloc[i]['Birthday'] != 'NA' and individuals.iloc[i]['Death'] != 'NA'):    
                
                if(datetime.datetime.strptime(individuals.iloc[i]['Birthday'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.iloc[i]['Death'], '%Y-%m-%d')):
                    
                    errors.append('ERROR: INDIVIDUAL: US03 ' + individuals.iloc[i]['ID'] + ' Died ' + individuals.iloc[i]['Death'] + ' before born '+individuals.iloc[i]['Birthday'])
                    print('ERROR: INDIVIDUAL: US03 ' + individuals.iloc[i]['ID'] + ' Died ' + individuals.iloc[i]['Death'] + ' before born '+individuals.iloc[i]['Birthday'])
                    
    return errors

In [9]:
errors = verify_birth(individuals)

with open("output_file.txt", 'a') as f:
    for e in errors:
        f.write(e+'\n')

ERROR: INDIVIDUAL: US03 I2 Died 1950-09-04 before born 1955-06-08
ERROR: INDIVIDUAL: US03 I4 Died 1980-11-09 before born 1984-11-13


In [10]:
### US 05 - Marriage before death
def verify_marriage(families, individuals):
    
    errors = []
    
    if(type(families) is not pd.DataFrame):
        
        print('The data should be Dataframe to process')
    elif(families is None):
        
        print('The data is empty')
    else:
        for i in range(0,len(families)):
            
            if(individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0] != 'NA' and families.iloc[i]['Married'] != 'NA'):    
                
                if(datetime.datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0], '%Y-%m-%d')):
                    
                    errors.append('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married'+ families.iloc[i]['Married'] + ' after husband/s '+families.iloc[i]['Husband ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0])
                    print('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married'+ families.iloc[i]['Married'] + ' after husband/s '+families.iloc[i]['Husband ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0])
            
            elif(individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0] != 'NA' and families.iloc[i]['Married'] != 'NA'):
                
                if(datetime.datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d') >= datetime.datetime.strptime(individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0], '%Y-%m-%d')):
                    
                    errors.append('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married '+ families.iloc[i]['Married'] + ' after wife/s '+families.iloc[i]['Wife ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0])
                    print('ERROR: Family: US05 ' + families.iloc[i]['ID'] + ': Married '+ families.iloc[i]['Married'] + ' after wife/s '+families.iloc[i]['Wife ID']+ ' death on '+individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0])
    return errors

In [11]:
errors = verify_marriage(families, individuals)

with open("output_file.txt", 'a') as f:
    for e in errors:
        f.write(e+'\n')

ERROR: Family: US05 F2: Married 2015-08-15 after wife/s I2 death on 1950-09-04


In [12]:
#US04 - Marriage before divorce
def parsedate(date_str):
    try:
        return datetime.datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        return None

def MarriageBeforeDivorce(individuals, families):
    errors = []
    for i in range(0,len(families)):
        if (families.iloc[i]['Divorced'] == 'NA' or families.iloc[i]['Married'] == 'NA'):
            continue
        marriageDate = parsedate(families.iloc[i]['Married'])
        divorceDate = parsedate(families.iloc[i]['Divorced'])
        if (marriageDate and divorceDate and divorceDate < marriageDate):
            errors.append(f"ERROR US04: {families.iloc[i]['ID']} Divorce happened before the date of marriage")
    return errors if errors else None

#US06 - Divorce before death
def DivorceBeforeDeath(individual, maritalRecord):
    
    for i in range(0,len(maritalRecord)):
        if(maritalRecord.iloc[i]['Divorced'] != 'NA'):
            husbandDeath = individual.loc[individual['ID'] == maritalRecord.iloc[i]['Husband ID'], 'Death'].iloc[0]
            wifeDeath = individual.loc[individual['ID'] == maritalRecord.iloc[i]['Wife ID'], 'Death'].iloc[0]
            if(husbandDeath != 'NA'):
                if(parsedate(husbandDeath) < parsedate(maritalRecord.iloc[i]['Divorced'])):
                    print("Error US06: Person: " , individual.iloc[i]['ID'] , ", " , ": Divorce of " , maritalRecord.iloc[i]['Divorced'], "occur after death ", husbandDeath)
            elif(wifeDeath != 'NA'):
                if(parsedate(wifeDeath) < parsedate(maritalRecord.iloc[i]['Divorced'])):
                    print("Error US06: Person: " , individual.iloc[i]['ID'] , ", " , ": Divorce of " , maritalRecord.iloc[i]['Divorced'], "occur after death ", wifeDeath)
        else:
            result = False
    
    if result is False:
        print('No errors for US06')



In [13]:
errors = MarriageBeforeDivorce(individuals, families)
with open("output_file.txt", 'a') as f:
    for e in errors:
        f.write(e+'\n')

In [14]:
DivorceBeforeDeath(individuals, families)

No errors for US06


In [15]:
def getLivingMarried(individuals, families):
    
    for i in range(0,len(families)):
        
        if (families.iloc[i]['Married'] != 'NA' and families.iloc[i]['Divorced'] == 'NA'):
            husbandDeath = individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Death'].iloc[0]
            wifeDeath = individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0]
            
            if(husbandDeath != ' NA' and wifeDeath != 'NA'):
                print("INFO US30: Person: " , families.iloc[i]['Husband Name'] , " is" , " living married to " , families.iloc[i]['Wife Name'])


In [16]:
getLivingMarried(individuals, families)

INFO US30: Person:  Jim /Hernandez/  is  living married to  Michelle /Rubin/


In [17]:
def getRecentDeaths(individuals):
    
    for i in range(0,len(individuals)):
        
        if (individuals.iloc[i]['Death'] != 'NA'):
            
            if(datetime.datetime.today() - timedelta(days=30) <= parsedate(individuals.iloc[i]['Death'])):
                print("INFO US36: Person: " , individuals.iloc[i]['Name'] , " is" , " dead within the last 30 days")


In [18]:
getRecentDeaths(individuals)

In [19]:
### US 28 ---- List of ordered siblings

def getSiblings(individuals, families):
    
    siblingsID = [item for sublist in families["Children"] for item in sublist]
    # print(siblingsID)
    
    siblingsInfo = individuals[individuals.ID.isin(siblingsID)]    
    siblingsOrdered = siblingsInfo.sort_values(['Birthday'],ascending=True)
    
    print(tabulate(siblingsOrdered,headers='keys', tablefmt = 'psql'))
    
    return tabulate(siblingsOrdered,headers='keys', tablefmt = 'psql')

    
with open("output_file.txt", 'a') as f:
    f.write("******US28: List of ordered siblings\n")
    f.write(getSiblings(individuals, families)+'\n')

+----+------+-------------------+----------+------------+-------+---------+------------+---------+----------+
|    | ID   | Name              | Gender   | Birthday   |   Age | Alive   | Death      | Child   | Spouse   |
|----+------+-------------------+----------+------------+-------+---------+------------+---------+----------|
|  2 | I3   | Jhon /Rubin/      | M        | 1980-08-08 |    42 | True    | NA         | {'F1'}  | {'F3'}   |
|  3 | I4   | Emily /Rubin/     | F        | 1984-11-13 |    -5 | False   | 1980-11-09 | {'F1'}  | {}       |
|  5 | I6   | Leo /Rubin/       | M        | 2010-05-07 |    13 | True    | NA         | {'F3'}  | {}       |
|  6 | I7   | Rina /Rubin/      | F        | 2013-11-12 |     9 | True    | NA         | {'F3'}  | {}       |
|  8 | I9   | Yosef /Hernandez/ | M        | 2016-12-08 |     6 | True    | NA         | {'F2'}  | {}       |
+----+------+-------------------+----------+------------+-------+---------+------------+---------+----------+


In [20]:
### US 29 ---- List of deceased

def getListDeceased(individuals):
    
    
    return individuals[['ID', 'Name']].where(individuals['Alive'] == "False").dropna()


with open("output_file.txt", 'a') as f:
    f.write("******US29: List of deceased\n")
    f.write(tabulate(getListDeceased(individuals), headers='keys', tablefmt='psql')+'\n')

In [21]:
#US10 - Marriage after 14
def MarriageAfter14(individuals, families):
    marriedAfter14 = []
    for i in range(len(families)):
        if families.iloc[i]["Husband ID"] and families.iloc[i]["Wife ID"]:
            husbandId = individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID']]
            wifeId = individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID']]

            husband = individuals.loc[husbandId]
            wife = individuals.loc[wifeId]

            if not husband.empty and not wife.empty:
                husbandBirthDate = husband["Birthday"].iloc[0]
                wifeBirthDate = wife["Birthday"].iloc[0]

                if husbandBirthDate and wifeBirthDate:
                    husband_age_at_marriage = families.iloc[i]["Married"] - husbandBirthDate
                    wife_age_at_marriage = families.iloc[i]["Married"] - wifeBirthDate

                    if husband_age_at_marriage >= 14 and wife_age_at_marriage >= 14:
                        marriedAfter14.append((families.iloc[i]['Husband ID'], families.iloc[i]['Wife ID']))
                        
    return marriedAfter14


     
    
#US11 - No Bigamy
def noBigamy(individuals, families):   
    living_married = getLivingMarried(individuals, families)    
    for c in living_married:
        for ind in c:
            if len(living_married) > 1:
                for cp in living_married.remove(c):
                    if ind in cp:
                        
                        print("ANOMALY: US 11: Individual " + ind + " is committing bigamy")
                        
            else:
                print("NORMAL: US 11: Individual " + ind + " is not committing bigamy")

In [22]:
def get_Large_age_Differences(families, individuals):
    result = 0; 
    for i in range(0,len(families)):
        husbandAge = individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Age'].iloc[0]
        wifeAge = individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Age'].iloc[0]
        ageDifference = husbandAge / wifeAge    
        if(ageDifference) >= 2 : 
                print(families[i]['Husband Name'])
                print(families[i]['Wife Name'])
        else:
                result = result + 1
    if(result == len(families)):
                print("No Major Age Difference Found")

In [23]:
get_Large_age_Differences(families, individuals)

No Major Age Difference Found


In [24]:
from datetime import datetime

def reject_Illegitimate_dates(families, individuals):
#verifying families dates
   for i in range(0,len(families)):
      divorceDate = families.iloc[i]['Divorced']
      marriedDate = families.iloc[i]['Married']
      try:

         dateObject = datetime.strptime(divorceDate, '%Y-%m-%d')
         return True
          

      except ValueError:
         
         return ("Illegitimate date inputted")
   
      try:

         dateObject = datetime.strptime(marriedDate, '%Y-%m-%d')
         return dateObject    

      except ValueError:
         
         return ("Illegitimate date inputted")
      
   for i in range(0,len(individuals)):
   #verifying for individuals dates
      birthDay = individuals.iloc[i]['Birthday']
      deathDate = individuals.iloc[i]['Death']
      try:

         dateObject = datetime.strptime(birthDay, '%Y-%m-%d')
         return True
          

      except ValueError:
         
         return ("Illegitimate date inputted")
   
      try:

         dateObject = datetime.strptime(deathDate, '%Y-%m-%d')
         return dateObject    

      except ValueError:
         
         return ("Illegitimate date inputted")


In [25]:
reject_Illegitimate_dates(families, individuals)

True

In [26]:
### US 38 - List upcoming birthdays occuring in the next 30 days

def upcomingBirthdays(individuals):
    
     
    today = date.today()
    listBirthdays = pd.DataFrame(columns=['Name', 'Birthday'])
    for i in range(0, len(individuals)):
        
        if (individuals.iloc[i]['Alive'] == 'True' and individuals.iloc[i]['Birthday'] != 'NA'):
            
            if (today <= datetime.strptime(individuals.iloc[i]['Birthday'], '%Y-%m-%d').date().replace(year=today.year) <= (today+timedelta(days=30))):
                
                ind = {'Name': individuals.iloc[i]['Name'].replace('/', ''), 
                       'Birthday': datetime.strptime(individuals.iloc[i]['Birthday'], '%Y-%m-%d').date().replace(year=today.year)}
                
                listBirthdays =  pd.concat([listBirthdays, pd.DataFrame([ind])], ignore_index=True)
    
    return listBirthdays



with open("output_file.txt", 'a') as f:
    f.write("******US38: List of upcoming birthdays\n")
    f.write(tabulate(upcomingBirthdays(individuals), headers='keys', tablefmt='psql')+'\n')

In [27]:
### US39 --- list marriage anniversaries occuring in the next 30 days

def marriageAnniversaries(families, individuals):
    
    today = datetime.today()
    anniversaries = pd.DataFrame(columns=['Husband', 'Wife', 'Marriage Anniversary'])
    for i in range(0, len(families)):
        
        if(individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Alive'].iloc[0] == True
           and individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0] == True
           and families.iloc[i]['Divorced'] == 'NA'):
            
            if (today <= datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d').date().replace(year=today.year) <= (today+timedelta(days=30))):
                
                family = {'Husband': individuals.loc[individuals['ID']== families.iloc[i]['Husband ID'], 'Name'].iloc[0],
                          'Wife': individuals.loc[individuals['ID']== families.iloc[i]['Wife ID'], 'Name'].iloc[0],
                          'Marriage Anniversary': (families.iloc[i]['Married']).date().replace(year = today.year)}
                
                anniversaries = pd.concat([anniversaries, pd.DataFrame(family)], ignore_index=True)

            
    return anniversaries

with open("output_file.txt", 'a') as f:
    f.write("******US39: List of upcoming marriage anniversaries\n")
    f.write(tabulate(marriageAnniversaries(families, individuals), headers='keys', tablefmt='psql')+'\n')    

In [28]:
#US13 - Siblings spacing
from datetime import datetime
from dateutil.relativedelta import relativedelta

def siblingSpace(individuals, families):
    for fam_index, fam_row in families.iterrows():
        siblingBirth = []
        for col_name, indiv in fam_row.iteritems():
            if col_name.startswith('Child') and not pd.isna(indiv):
                indi_row = individuals.loc[indiv]
                siblingBirth.append(datetime.strptime(indi_row['Birthday'], "%d %b %Y"))

        for i in range(len(siblingBirth)):
            for j in range(i + 1, len(siblingBirth)):
                diff = relativedelta(siblingBirth[j], siblingBirth[i])
                if (diff.years == 0 and diff.months < 8) or \
                   (diff.years == 0 and diff.months == 0 and diff.days >= 2):
                    print("Error: US13: Siblings not spaced out enough")
                    return

    print("US13: Siblings spaced out enough")


In [None]:
siblingSpace(individuals,families)

In [29]:
#US14 - Multiple births <= 5
from collections import Counter
from datetime import datetime, date

def getBirthDateUsingID(individuals, personID):
    return individuals.iloc[personID]['Birthday']

def multipleBirthslessThan5(individuals, families):
    multipleBirthsList = []

    for famID, family in families.items():
        children = family.get('Child', [])
        if len(children) >= 5:
            birthDates = [getBirthDateUsingID(individuals, child) for child in children]
            birthDateCounts = Counter(birthDates)
            if any(count > 5 for count in birthDateCounts.values()):
                multipleBirthsList.append(famID)

    if len(multipleBirthsList) != 0:
        print("ERROR: US14: Families with the following IDs had more than 5 siblings born at the same time:")
        print(multipleBirthsList)
    else:
        print("US14: All families have no more than 5 siblings born at the same time.")

    return len(multipleBirthsList)


In [None]:
multipleBirthslessThan5(individuals, families)

In [30]:
def getCurrentGender(families, individuals):

    for i in range(0,len(families)):
                                                                    
        if(families.iloc[i]['Husband ID']) != 'NA' and individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Gender'].iloc[0] == 'M':
            print('Husband in family ' + families.iloc[i]['ID'] + ' is indeed a male')

        if(families.iloc[i]['Wife ID']) != 'NA' and individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Gender'].iloc[0] == 'F':
            print('Wife in family ' + families.iloc[i]['ID'] + ' is indeed a female')
        
        else:
            print('N/A')

In [31]:
getCurrentGender(families, individuals)

Husband in family F1 is indeed a male
Wife in family F1 is indeed a female
Husband in family F2 is indeed a male
Wife in family F2 is indeed a female
Husband in family F3 is indeed a male
Wife in family F3 is indeed a female


In [32]:
def getDatesBeforeCurrentDate(families, individuals):
    
    for i in range(0,len(families)):

                if(families.iloc[i]['Married'] != 'NA' and date.today() > datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d').date()):
                    print("Invalid Marriage Date")
                if(families.iloc[i]['Divorced'] != 'NA' and date.today() > datetime.strptime(families.iloc[i]['Divorced'], '%Y-%m-%d').date()):
                    print("Invalid Divorced Date")

                else:
                      print("All Marriage and Divorce Dates are Valid")    
                      
    for i in range(0,len(individuals)):

                if(individuals.iloc[i]['Birthday'] != 'NA' and date.today() > datetime.strptime(individuals.iloc[i]['Birthday'], '%Y-%m-%d').date()):
                    print("Invalid Birthday Date")
                if(individuals.iloc[i]['Death'] != 'NA' and date.today() > datetime.strptime(individuals.iloc[i]['Death'], '%Y-%m-%d').date()):
                    print("Invalid Death Date")
            
                else:
                    print("All Birthday and Death Dates are Valid")

In [33]:
getDatesBeforeCurrentDate(families,individuals)

Invalid Marriage Date
Invalid Divorced Date
Invalid Marriage Date
All Marriage and Divorce Dates are Valid
Invalid Marriage Date
All Marriage and Divorce Dates are Valid
Invalid Birthday Date
Invalid Death Date
Invalid Birthday Date
Invalid Death Date
Invalid Birthday Date
All Birthday and Death Dates are Valid
Invalid Birthday Date
Invalid Death Date
Invalid Birthday Date
All Birthday and Death Dates are Valid
Invalid Birthday Date
All Birthday and Death Dates are Valid
Invalid Birthday Date
All Birthday and Death Dates are Valid
Invalid Birthday Date
All Birthday and Death Dates are Valid
Invalid Birthday Date
All Birthday and Death Dates are Valid


In [34]:
#US23 - Unique name and birth date
def DuplicateNameBirthDate(individuals):
    duplicates = []
    for i in range(len(individuals)):
        for j in range(i + 1, len(individuals)):
            if individuals.iloc[i]['Name'] == individuals.iloc[j]['Name'] and \
               individuals.iloc[i]['Birthday'] == individuals.iloc[j]['Birthday']:
                duplicates.append((individuals.iloc[i]['Name'], individuals.iloc[i]['Birthday']))
    return duplicates

def UniqueNameBirthDate(individuals, families):
    duplicates = DuplicateNameBirthDate(individuals)
    if duplicates:
        for name, birthday in duplicates:
            print("Error: US23: Found a duplicate: {} - {}".format(name, birthday))
        return False
    else:
        print("US23: No duplicates found!")
        return True


In [35]:
UniqueNameBirthDate(individuals, families)

US23: No duplicates found!


True

In [36]:
#US24 - No more than one family with the same spouses by name and the same marriage date should appear in a GEDCOM file
#Remodified version of US39
def UniqueFamilies(individuals, families):
    today = datetime.today()
    anniversaries = pd.DataFrame(columns=['Husband', 'Wife', 'Marriage Anniversary'])

    for i in range(len(families)):
        if (individuals.loc[individuals['ID'] == families.iloc[i]['Husband ID'], 'Alive'].iloc[0] == True
            and individuals.loc[individuals['ID'] == families.iloc[i]['Wife ID'], 'Death'].iloc[0] == True
            and families.iloc[i]['Divorced'] == 'NA'):
            
            if today <= datetime.strptime(families.iloc[i]['Married'], '%Y-%m-%d').date().replace(year=today.year):

                family = {'Husband': individuals.loc[individuals['ID']== families.iloc[i]['Husband ID'], 'Name'].iloc[0],
                          'Wife': individuals.loc[individuals['ID']== families.iloc[i]['Wife ID'], 'Name'].iloc[0],
                          'Marriage Anniversary': (families.iloc[i]['Married']).date().replace(year=today.year)}

                anniversaries = pd.concat([anniversaries, pd.DataFrame([family])], ignore_index=True)

    # Check for duplicate families based on Husband, Wife, and Marriage Anniversary columns
    duplicateFamilies = anniversaries.duplicated(subset=['Husband', 'Wife', 'Marriage Anniversary'], keep=False)

    if duplicateFamilies.any():
        print("Error: US24: Duplicate families with the same spouses and marriage date found.")
        return anniversaries[duplicateFamilies]
    else:
        print("US24: No more than one family with the same spouses and the same marriage date.")
        return True

In [37]:
UniqueFamilies(individuals, families)

US24: No more than one family with the same spouses and the same marriage date.


True

In [75]:
### US12: Parents not too old

def parentsAge(families, individuals):
    
    results= []
    father = True
    ### Checking for Father age difference
    for f in range(0,len(families)):
        
        if(individuals.loc[individuals['ID']== families.iloc[f]['Husband ID'], 'Age'].iloc[0] < 0):
            
            print('US12: Husband ID '+ families.iloc[f]['Husband ID'] +' has invalid age')
            
            results.append('US12: Husband ID '+ families.iloc[f]['Husband ID'] +' has invalid age')
            
        elif(families.iloc[f]['Children']!= 'NA'):
            
            for c in families.iloc[f]['Children']:
                if(0 < individuals.loc[individuals['ID']== families.iloc[f]['Husband ID'], 'Age'].iloc[0] - individuals.loc[individuals['ID']== c, 'Age'].iloc[0] < 80):
                    pass
                else:
                    father = False
                
            if father == False:
                print('US12: Father ' + families.iloc[f]['Husband ID'] + ' is 80 years older than his child or an invalid age')
                
                results.append('US12: Father ' + families.iloc[f]['Husband ID'] + ' is 80 years older than his child or an invalid age')
            else:
                print('US12: Father ' + families.iloc[f]['Husband ID'] + ' is 80 years younger than his childs')
                
                results.append('US12: Father ' + families.iloc[f]['Husband ID'] + ' is 80 years younger than his childs')
                
    
    mother = True    
    ### Checking for Mother age difference
    for f in range(0,len(families)):
        
        if(individuals.loc[individuals['ID']== families.iloc[f]['Wife ID'], 'Age'].iloc[0] < 0):
            
            print('US12: Wife ID '+ families.iloc[f]['Wife ID'] +' has invalid age')
            
            results.append('US12: Wife ID '+ families.iloc[f]['Wife ID'] +' has invalid age')
        
        elif(families.iloc[f]['Children']!= 'NA'):
            for c in families.iloc[f]['Children']:
                if(0 < individuals.loc[individuals['ID']== families.iloc[f]['Wife ID'], 'Age'].iloc[0] - individuals.loc[individuals['ID']== c, 'Age'].iloc[0] < 60):
                    pass
                else:
                    mother = True
                
            if mother == False:
                print('US12: Mother ' + families.iloc[f]['Wife ID'] + ' is 60 years older than her child or an invalid age')
                
                results.append('US12: Mother ' + families.iloc[f]['Wife ID'] + ' is 60 years older than her child or an invalid age')
            else:
                print('US12: Mother ' + families.iloc[f]['Wife ID'] + ' is 80 years younger than her childs')
                
                results.append('US12: Mother ' + families.iloc[f]['Wife ID'] + ' is 80 years younger than her childs')
                
    return results
            

In [77]:
with open("output_file.txt", 'a') as f:
    f.write("******US12: Parents and childs age difference\n")
    for r in parentsAge(families, individuals):
        f.write(r + '\n')


US12: Father I1 is 80 years younger than his childs
US12: Father I8 is 80 years younger than his childs
US12: Father I3 is 80 years younger than his childs
US12: Wife ID I2 has invalid age
US12: Wife ID I2 has invalid age
US12: Mother I5 is 80 years younger than her childs


In [99]:
### US16: All males of a family should have the same last name

def lastNameCheck(families, individuals):
    
    
    errors = []
    for f in range(0,len(families)):
        
        lastName = []
        husbandName = individuals.loc[individuals['ID'] == families.iloc[f]['Husband ID'], 'Name'].iloc[0]
        lastName.append(husbandName.split(' ')[1])
        if(families.iloc[f]['Children'] != 'NA'):
            
            for c in families.iloc[f]['Children']:
                if(individuals.loc[individuals['ID']== c, 'Gender'].iloc[0] == 'M'):
                    
                    lastName.append(individuals.loc[individuals['ID']== c, 'Name'].iloc[0].split(' ')[1])
                    
    
        if(len(set(lastName)) != 1):
            
            errors.append('US16: Error: Family ' + families.iloc[f]['ID'] +' does not have the same last name for males')
        else:
            errors.append('US16: Error: Family ' + families.iloc[f]['ID'] +' does have the same last name for males')
            
        print(lastName)
            
    return errors

In [100]:
with open("output_file.txt", 'a') as f:
    f.write("******US16: Families' male last name\n")
    for r in lastNameCheck(families, individuals):
        f.write(r + '\n')

['/Rubin/', '/Rubin/']
['/Hernandez/', '/Hernandez/']
['/Rubin/', '/Rubin/']
