In [9]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd
from langconv import *
from collections import defaultdict

In [10]:
# Scheme + host of English Wikipedia; the site-relative '/wiki/...' hrefs
# scraped from list pages are appended to this to form absolute URLs.
base_url = 'https://en.wikipedia.org'

def Traditional2Simplified(sentence):
    """Convert `sentence` with langconv's 'zh-hans' converter and return it.

    Args:
        sentence: Text to convert (used here for Chinese names taken from
            Wikipedia table cells).

    Returns:
        Whatever ``Converter('zh-hans').convert`` yields for `sentence`.
    """
    # A fresh converter is built on every call, exactly as before.
    converter = Converter('zh-hans')
    return converter.convert(sentence)

In [11]:
# Parse page
# Walk every "List of people of the Three Kingdoms (...)" page linked from the
# index page and collect [English name, Simplified Chinese name, wiki path]
# for each person whose first-column link points at an article of their own.

# Patterns are compiled once instead of on every table row.
list_href_pattern = re.compile('^(/wiki/List_of_people_of_the_Three_Kingdoms_)')
wiki_href_pattern = re.compile('^(/wiki/)')
chinese_char_pattern = re.compile(r'[\u4e00-\u9fa5]')

# `with` closes the HTTP response once BeautifulSoup has consumed it.
with urlopen(base_url + '/wiki/Lists_of_people_of_the_Three_Kingdoms') as html:
    bs = BeautifulSoup(html, 'html.parser')
links = bs.find('td', {'class':"navbox-list navbox-odd hlist"}).find_all(
    href=list_href_pattern)

res = []

for link in links:
    with urlopen(base_url + link['href']) as table_html:
        table_bs = BeautifulSoup(table_html, 'html.parser')
    table = table_bs.find('div', {'class':"mw-parser-output"}).find(
        'table', {'class':"wikitable sortable"}).tbody
    # The first <tr> is the column header row, so skip it.
    for row in table.find_all('tr')[1:]:
        name = row.td
        if name is None:
            # Row made only of <th> cells (e.g. a repeated header): no name cell.
            continue
        name_link = name.find('a')
        if name_link is None:
            continue
        # .get() so an anchor lacking href/title is skipped instead of raising KeyError.
        href = name_link.get('href', '')
        if not wiki_href_pattern.search(href): # Not an internal wiki link
            continue
        s = name.text
        name_en = chinese_char_pattern.split(s)[0] # English name
        name_zh = Traditional2Simplified(s[len(name_en):]) # Chinese name (Simplified)
        if name_en in name_link.get('title', ''): # Has own entry
            res.append([name_en, name_zh, href])

In [12]:
# Clean duplicated names
# Build the people table, then print the rows of every English name that
# occurs more than once so the duplicates can be inspected.
header = ['name_en', 'name_zh', 'url']
df = pd.DataFrame(data=res, columns=header)

# Tally occurrences of each English name, in order of first appearance.
count = defaultdict(int)
for english_name in df['name_en']:
    count[english_name] = count[english_name] + 1

for english_name, occurrences in count.items():
    if occurrences <= 1:
        continue
    print(df[df['name_en'] == english_name])

        name_en name_zh                                     url
4  Empress Bian     卞皇后  /wiki/Empress_Bian_(Cao_Huan%27s_wife)
5  Empress Bian     卞皇后   /wiki/Empress_Bian_(Cao_Mao%27s_wife)
    name_en name_zh                     url
27  Cao Jie      曹节   /wiki/Empress_Cao_Jie
28  Cao Jie      曹节  /wiki/Cao_Jie_(eunuch)
   name_en name_zh                                 url
29  Cao Ju      曹矩    /wiki/Cao_Ju_(Prince_of_Fanyang)
30  Cao Ju      曹据  /wiki/Cao_Ju_(Prince_of_Pengcheng)
    name_en name_zh                                url
31  Cao Jun      曹均        /wiki/Cao_Jun_(Duke_of_Fan)
32  Cao Jun      曹峻  /wiki/Cao_Jun_(Prince_of_Chenliu)
    name_en name_zh                                url
34  Cao Lin      曹霖  /wiki/Cao_Lin_(Prince_of_Donghai)
35  Cao Lin      曹林      /wiki/Cao_Lin_(Prince_of_Pei)
    name_en name_zh                                url
36  Cao Mao      曹髦                      /wiki/Cao_Mao
37  Cao Mao      曹茂  /wiki/Cao_Mao_(Prince_of_Laoling)
    name_en name

In [13]:
# Row labels of the duplicate-name entries to discard.
# NOTE(review): these look hand-picked from the duplicate listing printed by the
# previous cell, so they are only valid for the scrape that produced that
# listing -- re-check them whenever the Wikipedia lists are scraped again.
to_delete = [4, 5, 28, 30, 32, 35, 37, 41, 73, 94, 96, 165, 277, 286, 291, 306, 392, 408, 493, 554, 584, 585, 586, 592, 602, 611, 614, 619, 620, 646, 649]
# Drop the rows and renumber 0..n-1 in one chained expression instead of two
# in-place mutations; `drop` still raises KeyError if a label is missing.
df = df.drop(index=to_delete).reset_index(drop=True)

In [16]:
# Save data
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='./data/people.csv', index=False)