In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import requests, time, csv, argparse, os, warnings, re

from datetime import datetime
from bs4 import BeautifulSoup, NavigableString, Tag
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from tqdm import tqdm

## 1. Collins dictionary (Characters in the Bible)

In [2]:
# Scrape the Collins "Characters in the Bible" word list with a Chrome driver and
# collect two parallel lists: the lower-cased name and lower-cased description of
# every entry for which both could be read.
name_lst, description_lst = [], []

with webdriver.Chrome(service=Service(ChromeDriverManager().install())) as driver:
    URL = 'https://www.collinsdictionary.com/word-lists/bible-characters-in-the-bible#google_vignette'
    driver.get(URL)

    character_lst = driver.find_elements(By.XPATH, '//*[@id="main_content"]/div[1]/div/div[2]/div/div/div[2]/span[1]/span')

    for entry in character_lst:
        try:
            # Read BOTH fields before appending either one: if the description
            # lookup fails after the name was already appended, the two lists
            # end up with different lengths and can no longer be zipped into a frame.
            name = entry.find_element(By.XPATH, 'span[1]').text.lower()
            description = entry.find_element(By.XPATH, 'span[2]').text.lower()
        except WebDriverException:
            # Best-effort scrape: skip entries whose sub-elements cannot be read,
            # but only for Selenium errors (a bare `except` would also hide
            # KeyboardInterrupt and genuine programming mistakes).
            continue
        name_lst.append(name)
        description_lst.append(description)

print(len(name_lst), len(description_lst))

175 175


In [3]:
# Column-oriented mapping of the Collins scrape results; kept as its own name
# because later cells compare its 'name' list against the other sources.
bible_character_dic1 = dict(name=name_lst, description=description_lst)

# Tabular view of the same data: print the shape, then preview the first rows.
df1 = pd.DataFrame(data=bible_character_dic1)
print(df1.shape)
df1.head()

(175, 2)


Unnamed: 0,name,description
0,aaron,"the first high priest of the israelites, broth..."
1,abednego,"one of daniel's three companions who, together..."
2,abel,"the second son of adam and eve, a shepherd, mu..."
3,abigail,the woman who brought provisions to david and ...
4,abraham,"the first of the patriarchs, the father of isa..."


In [4]:
# Show every Collins row whose name contains the substring 'john'.
df1.loc[df1['name'].str.contains('john')]

Unnamed: 0,name,description
87,john,"the apostle john, the son of zebedee, identifi..."
88,john the baptist,the forerunner and baptizer of jesus: he was k...


## 2. Wikidata (Lists/List of biblical characters)

In [5]:
# Fresh, independent accumulators, one per column collected from the Wikidata table.
name_lst = []
description_lst = []
work_lst = []
instance_lst = []
gender_lst = []

In [6]:
# Download the Wikidata "List of biblical characters" page and collect, for every
# table row whose name cell starts with an ASCII letter, the lower-cased text of
# columns 2-6 (name, description, work, instance, gender) into parallel lists.
URL = 'https://www.wikidata.org/wiki/Wikidata:Lists/List_of_biblical_characters'

# Reset the accumulators in this cell so that re-running it does not append a
# second copy of every row to lists left over from a previous run.
name_lst, description_lst, work_lst, instance_lst, gender_lst = [], [], [], [], []

# A timeout prevents the cell from hanging forever; raise_for_status() stops an
# HTTP error page from being silently parsed as if it were the list.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soups = BeautifulSoup(response.text, 'html.parser')

character_lst = soups.select('#mw-content-text > div.mw-parser-output > table > tbody > tr')
reg = re.compile(r'[a-zA-Z]')

# The first selected row is skipped, as in the original (presumably the header row).
for row in tqdm(character_lst[1:]):
    # Look each cell up once: td positions 2..6 of the row.
    cells = [row.select_one(f'td:nth-child({position})') for position in range(2, 7)]
    if any(cell is None for cell in cells):
        # Row has fewer cells than expected; skip it instead of raising AttributeError.
        continue
    if reg.match(cells[0].text) is None:
        # Name does not begin with an ASCII letter.
        continue

    # NOTE(review): the rendered output shows multi-valued cells run together
    # (e.g. "biblethe prince of egypt"); no separator is inserted between values here.
    name, description, work, instance, gender = (
        cell.text.lower().replace('\n', '') for cell in cells
    )
    name_lst.append(name)
    description_lst.append(description)
    work_lst.append(work)
    instance_lst.append(instance)
    gender_lst.append(gender)

print(len(name_lst), len(description_lst), len(work_lst), len(instance_lst), len(gender_lst))

100%|███████████████████████████████████████| 773/773 [00:00<00:00, 4731.65it/s]

760 760 760 760 760





In [7]:
# Column-oriented mapping of the Wikidata scrape results; kept as its own name
# because later cells compare its 'name' list against the other sources.
bible_character_dic2 = dict(
    name=name_lst,
    description=description_lst,
    work=work_lst,
    instance=instance_lst,
    gender=gender_lst,
)

# Tabular view of the same data: print the shape, then preview the first rows.
df2 = pd.DataFrame(data=bible_character_dic2)
print(df2.shape)
df2.head()

(760, 5)


Unnamed: 0,name,description,work,instance,gender
0,aaron,biblical and quranic character,biblethe prince of egypt,human biblical figure,male
1,abaddon,angel of destruction,,angel in judaismangel in christianityangel,male
2,abagtha,biblical character,bible,human biblical figure,female
3,abana,river described in the bible,bible,mythical river,
4,abda,biblical figure,bible,human biblical figure,male


In [8]:
# Show every Wikidata row whose name contains the substring 'john'.
df2.loc[df2['name'].str.contains('john')]

Unnamed: 0,name,description,work,instance,gender
165,child saint john,st john the baptist as a child,,certain aspects of a person's lifebiblical cha...,
388,john the apostle,"apostle of jesus; son of zebedee and salome, b...",,humanhuman biblical figure,male
389,john the baptist,major religious figure,bible,human,male
534,new testament people named john,from new testament,new testament,,


In [9]:
# Number of distinct names present in the Wikidata list but absent from the Collins list.
len(set(bible_character_dic2['name']).difference(bible_character_dic1['name']))

601

In [10]:
# Number of distinct names present in the Collins list but absent from the Wikidata list.
len(set(bible_character_dic1['name']).difference(bible_character_dic2['name']))

63

## 3. Bible Gateway

In [11]:
# Download the Bible Gateway "All the Men of the Bible" alphabetical index and
# collect the lower-cased text of every <li> found under the selected <ul> groups.
name_lst = []

URL = 'https://www.biblegateway.com/resources/all-men-bible/Alphabetical-Order-All-Men'

# A timeout prevents the cell from hanging forever; raise_for_status() stops an
# HTTP error page from being silently parsed into an empty name list.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soups = BeautifulSoup(response.text, 'html.parser')

character_lst = soups.select('body > div.nav-content > div > section > div:nth-child(5) > section > div > div > div.basic-content.container > div.basic-main.bg-content.row > div > div.exbib-content > div:nth-child(1) > ul > ul')

for group in tqdm(character_lst):
    # Each selected <ul> contributes one name per <li>, in document order.
    name_lst.extend(item.text.lower() for item in group.select('li'))

print(len(name_lst))

100%|█████████████████████████████████████████| 23/23 [00:00<00:00, 3672.91it/s]

1770





In [12]:
# Column-oriented mapping of the Bible Gateway scrape results (names only); kept
# as its own name because a later cell compares it against the Wikidata names.
bible_character_dic3 = dict(name=name_lst)

# Tabular view of the same data: print the shape, then preview the first rows.
df3 = pd.DataFrame(data=bible_character_dic3)
print(df3.shape)
df3.head()

(1770, 1)


Unnamed: 0,name
0,aaron
1,abagtha
2,abda
3,abdeel
4,abdi


In [13]:
# Show every Bible Gateway row whose name contains the substring 'jesus'.
df3.loc[df3['name'].str.contains('jesus')]

Unnamed: 0,name
266,bar-jesus
936,jesus
998,"joshua, jehoshua, jehoshuah, jeshua, jesus"


In [14]:
# Number of distinct names present in the Bible Gateway list but absent from the Wikidata list.
# NOTE(review): names are compared as whole strings, so comma-joined alias entries
# in the Bible Gateway list (e.g. "joshua, jehoshua, ...") count as unmatched.
len(set(bible_character_dic3['name']).difference(bible_character_dic2['name']))

1498