In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import statsmodels.formula.api as smf
import seaborn as sns
from tqdm import tqdm
import zipfile
from collections import Counter


In [5]:
df = pd.read_parquet('links.parquet')
df

Unnamed: 0,raw_data
0,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
1,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
2,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
4,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
...,...
3674,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3675,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3676,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3677,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."


In [2]:
url = 'https://awoiaf.westeros.org/index.php/List_of_characters'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
r = requests.get(url, headers = headers)
r.status_code

200

In [3]:
soup = BeautifulSoup(r.text)
text = soup.find_all('div', class_='mw-parser-output')
links = []
for elem in text[0].find_all('ul')[1:]:
    for name in elem.find_all('li'):
        links.append(name.a.get('href'))
links        

['/index.php/A_certain_man',
 '/index.php/Abelar_Hightower',
 '/index.php/Abelon',
 '/index.php/Addam_of_Duskendale',
 '/index.php/Addam_Frey',
 '/index.php/Addam_Hightower',
 '/index.php/Addam_Marbrand',
 '/index.php/Addam_Osgrey',
 '/index.php/Addam_Rivers',
 '/index.php/Addam_Velaryon',
 '/index.php/Addam_Whitehead',
 '/index.php/Addison_Hill',
 '/index.php/Adrack_Humble',
 '/index.php/Adrian_Redfort',
 '/index.php/Adrian_Tarbeck',
 '/index.php/Adrian_Thorne',
 '/index.php/Aegon_Ambrose',
 '/index.php/Aegon_Blackfyre',
 '/index.php/Aegon_Frey_(son_of_Stevron)',
 '/index.php/Aegon_Frey_(son_of_Aenys)',
 '/index.php/Aegon_I_Targaryen',
 '/index.php/Aegon_II_Targaryen',
 '/index.php/Aegon_III_Targaryen',
 '/index.php/Aegon_IV_Targaryen',
 '/index.php/Aegon_V_Targaryen',
 '/index.php/Aegon_Targaryen_(son_of_Gaemon)',
 '/index.php/Aegon_Targaryen_(son_of_Aenys_I)',
 '/index.php/Aegon_Targaryen_(son_of_Jaehaerys_I)',
 '/index.php/Aegon_Targaryen_(son_of_Baelon)',
 '/index.php/Aegon_Targar

In [4]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
links_csv = []
for link in tqdm(links):
    url = 'https://awoiaf.westeros.org' + link
    r = requests.get(url, headers = headers)
    if r.status_code == 200:
        links_csv.append({'raw_data': r.text})
    else:
        print('error: ', r.status_code)

100%|█████████████████████████████████████████████████████████████████████████████| 3679/3679 [1:31:01<00:00,  1.48s/it]


In [5]:
df_links = pd.DataFrame(links_csv)
df_links

Unnamed: 0,raw_data
0,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
1,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
2,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
4,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
...,...
3674,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3675,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3676,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
3677,"<!DOCTYPE html>\n<html class=""client-nojs"" lan..."


In [6]:
df_links.to_parquet('links.parquet')

### Surname dataset

In [7]:
import os
import time
from tqdm import tqdm
surnames = []
for index, row in tqdm(df_links.iterrows()):
    html = row['raw_data']
    soup = BeautifulSoup(html)
    firstName = soup.find_all('h1', class_='firstHeading page-header')[0].span.text
    infobox = soup.find_all('table', class_ = 'infobox')
    names = [] 
    if infobox:
        list = infobox[0].find_all('tr')                   
        for elem in list:
            if elem.th:
                if elem.th.text == 'Aliases':
                    list = elem.find_all('li')
                    for name in list:
                        surname = re.findall(r'^(?:\d+,\d+)?[^[]+',name.text)
                        names.append(surname[0])
                        
                    break
                if elem.th.text == 'Alias': 
                    name = elem.td.text
                    surname = re.findall(r'^(?:\d+,\d+)?[^[]+',name)
                    names.append(surname[0])
                        
                    break
    surnames.append({'name': firstName, 'surname': names})
        
    



3679it [01:18, 46.80it/s]


In [8]:
df_surname = pd.DataFrame(surnames)
df_surname

Unnamed: 0,name,surname
0,A certain man,[]
1,Abelar Hightower,[]
2,Abelon,[]
3,Addam of Duskendale,[\nAddam of Duskendale]
4,Addam Frey,[]
...,...,...
3674,Zharaq zo Loraq,[\nThe Liberator]
3675,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]"
3676,Zhoe Blanetree,[]
3677,Zia Frey,[]


In [45]:
count = Counter(df_surname.name.values)
most_occurences = count.most_common()
repeat = []
for element, occurrences in most_occurences:
    if occurrences > 1:
        repeat.append(element)

In [46]:
repeat

['Beardless Dick',
 'Canker Jeyne',
 'Clubfoot Karl',
 'Corwyn Corbray',
 'Fearless Ithoke',
 'List of characters created for Game of Thrones',
 'High Septon (stonemason)',
 'Jack Bulwer',
 'Lord of Bones',
 'Paxter Redwyne',
 'Poxy Tym',
 'Red Rolfe',
 'Robyn Rhysling']

In [50]:
for elem in repeat:
    df_surname = df_surname.drop(index=df_surname[df_surname.name == elem].index[1])

In [52]:
df_surname.to_parquet('surnames.parquet')

In [53]:
test = pd.read_parquet('surnames.parquet')
test

Unnamed: 0,name,surname
0,A certain man,[]
1,Abelar Hightower,[]
2,Abelon,[]
3,Addam of Duskendale,[\nAddam of Duskendale]
4,Addam Frey,[]
...,...,...
3674,Zharaq zo Loraq,[\nThe Liberator]
3675,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]"
3676,Zhoe Blanetree,[]
3677,Zia Frey,[]


### Books dataset

In [13]:
import os
import time
from tqdm import tqdm
books = []

for index, row in tqdm(df_links.iterrows()):
    html = row['raw_data']
    soup = BeautifulSoup(html)
    firstName = soup.find_all('h1', class_='firstHeading page-header')[0].span.text
    infobox = soup.find_all('table', class_ = 'infobox')
    books_appearance = []
    books_mentioned = []
    books_POV = []
    if infobox:
        list = infobox[0].find_all('tr')                   
        for elem in list:
            if elem.th: # appears, mentioned, POV
                if elem.th.text == 'Books':
                    list = elem.find_all('li')
                    for book in list:
                        if 'mentioned' in book.text:
                            books_mentioned.append(book.a.text)

                        if 'appears' in book.text:
                            books_appearance.append(book.a.text)

                        if 'POV' in book.text:
                            books_POV.append(book.a.text)
                        
                    break
                if elem.th.text == 'Book': 
                    if 'mentioned' in elem.td.text:
                            books_mentioned.append(elem.a.text)

                    if 'appears' in elem.td.text:
                        books_appearance.append(elem.a.text)

                    if 'POV' in elem.td.text:
                        books_POV.append(elem.a.text)
                        
                    break
    books.append({'name': firstName, 'books_mentioned': books_mentioned, 'books_appearance': books_appearance, 'books_POV': books_POV})
        
    



3679it [01:26, 42.55it/s]


In [16]:
df_books = pd.DataFrame(books)
df_books

Unnamed: 0,name,books_mentioned,books_appearance,books_POV
0,A certain man,[A Clash of Kings],[],[]
1,Abelar Hightower,[],[The Hedge Knight],[]
2,Abelon,[Fire and Blood],[],[]
3,Addam of Duskendale,[The World of Ice & Fire],[],[]
4,Addam Frey,[],[The Mystery Knight],[]
...,...,...,...,...
3674,Zharaq zo Loraq,[A Dance with Dragons],[],[]
3675,Zhea,[The World of Ice & Fire],[],[]
3676,Zhoe Blanetree,[],[],[]
3677,Zia Frey,[],[],[]


In [54]:
count = Counter(df_books.name.values)
most_occurences = count.most_common()
repeat = []
for element, occurrences in most_occurences:
    if occurrences > 1:
        repeat.append(element)

In [57]:
for elem in repeat:
    df_books = df_books.drop(index=df_books[df_books.name == elem].index[1])

In [59]:
df_books.to_parquet('books.parquet')

### Text length, rank & infobox name dataset

In [18]:
infos = []

for index, row in tqdm(df_links.iterrows()):
    html = row['raw_data']
    soup = BeautifulSoup(html)
    firstName = soup.find_all('h1', class_='firstHeading page-header')[0].span.text
    infobox = soup.find_all('table', class_ = 'infobox')
    text_container = soup.find_all('div', class_= 'mw-parser-output')
    
    # find text length
    texts = text_container[0].find_all("p")
    text = "".join(i.text for i in texts)
    text = re.sub(r"\[\d+\]", "", text)
    text_len = len(text)


    #find rank
    page_rank = 0
    a_list = soup.find_all('a')
    for a in a_list:
        if a.get('href') in links:
           page_rank += 1 

    # find infobox name
    if infobox:
        table = infobox[0].find_all('table')
        if table:
            td = table[0].find_all('td')
            span = td[1].find("span")
            if span:
                tmp = span.text.strip()
                span.decompose()
                infobox_name = tmp + " " + td[1].text
            else:
                infobox_name = td[1].text
        else:
            infobox_name = infobox[0].find_all('th')[0].text
    else:
        infobox_name = ''
    # Create the dictionnary
    infos.append({'name': firstName, 'infobox_name': infobox_name, 'text_length': text_len, 'rank': page_rank})

3679it [01:42, 35.72it/s]


In [19]:
df_infos = pd.DataFrame(infos)
df_infos

Unnamed: 0,name,infobox_name,text_length,rank
0,A certain man,A certain man,1254,3
1,Abelar Hightower,Ser Abelar Hightower,508,3
2,Abelon,Archmaester Abelon,102,0
3,Addam of Duskendale,Addam of Duskendale,175,0
4,Addam Frey,Ser Addam Frey,586,14
...,...,...,...,...
3674,Zharaq zo Loraq,Zharaq zo Loraq,109,1
3675,Zhea,Zhea,1110,1
3676,Zhoe Blanetree,Zhoe Blanetree,269,21
3677,Zia Frey,Zia Frey,64,16


In [60]:
count = Counter(df_infos.name.values)
most_occurences = count.most_common()
repeat = []
for element, occurrences in most_occurences:
    if occurrences > 1:
        repeat.append(element)

In [61]:
for elem in repeat:
    df_infos = df_infos.drop(index=df_infos[df_infos.name == elem].index[1])

In [62]:
df_infos.to_parquet('infos.parquet')

### Join all datasets

In [63]:
surnames = pd.read_parquet('surnames.parquet')
books = pd.read_parquet('books.parquet')
infos = pd.read_parquet('infos.parquet')

In [64]:
surnames

Unnamed: 0,name,surname
0,A certain man,[]
1,Abelar Hightower,[]
2,Abelon,[]
3,Addam of Duskendale,[\nAddam of Duskendale]
4,Addam Frey,[]
...,...,...
3674,Zharaq zo Loraq,[\nThe Liberator]
3675,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]"
3676,Zhoe Blanetree,[]
3677,Zia Frey,[]


In [65]:
books

Unnamed: 0,name,books_mentioned,books_appearance,books_POV
0,A certain man,[A Clash of Kings],[],[]
1,Abelar Hightower,[],[The Hedge Knight],[]
2,Abelon,[Fire and Blood],[],[]
3,Addam of Duskendale,[The World of Ice & Fire],[],[]
4,Addam Frey,[],[The Mystery Knight],[]
...,...,...,...,...
3674,Zharaq zo Loraq,[A Dance with Dragons],[],[]
3675,Zhea,[The World of Ice & Fire],[],[]
3676,Zhoe Blanetree,[],[],[]
3677,Zia Frey,[],[],[]


In [66]:
infos

Unnamed: 0,name,infobox_name,text_length,rank
0,A certain man,A certain man,1254,3
1,Abelar Hightower,Ser Abelar Hightower,508,3
2,Abelon,Archmaester Abelon,102,0
3,Addam of Duskendale,Addam of Duskendale,175,0
4,Addam Frey,Ser Addam Frey,586,14
...,...,...,...,...
3674,Zharaq zo Loraq,Zharaq zo Loraq,109,1
3675,Zhea,Zhea,1110,1
3676,Zhoe Blanetree,Zhoe Blanetree,269,21
3677,Zia Frey,Zia Frey,64,16


In [70]:
df1 = pd.merge(surnames,infos)
df1

Unnamed: 0,name,surname,infobox_name,text_length,rank
0,A certain man,[],A certain man,1254,3
1,Abelar Hightower,[],Ser Abelar Hightower,508,3
2,Abelon,[],Archmaester Abelon,102,0
3,Addam of Duskendale,[\nAddam of Duskendale],Addam of Duskendale,175,0
4,Addam Frey,[],Ser Addam Frey,586,14
...,...,...,...,...,...
3661,Zharaq zo Loraq,[\nThe Liberator],Zharaq zo Loraq,109,1
3662,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]",Zhea,1110,1
3663,Zhoe Blanetree,[],Zhoe Blanetree,269,21
3664,Zia Frey,[],Zia Frey,64,16


In [72]:
df2 = pd.merge(df1,books)
df2

Unnamed: 0,name,surname,infobox_name,text_length,rank,books_mentioned,books_appearance,books_POV
0,A certain man,[],A certain man,1254,3,[A Clash of Kings],[],[]
1,Abelar Hightower,[],Ser Abelar Hightower,508,3,[],[The Hedge Knight],[]
2,Abelon,[],Archmaester Abelon,102,0,[Fire and Blood],[],[]
3,Addam of Duskendale,[\nAddam of Duskendale],Addam of Duskendale,175,0,[The World of Ice & Fire],[],[]
4,Addam Frey,[],Ser Addam Frey,586,14,[],[The Mystery Knight],[]
...,...,...,...,...,...,...,...,...
3661,Zharaq zo Loraq,[\nThe Liberator],Zharaq zo Loraq,109,1,[A Dance with Dragons],[],[]
3662,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]",Zhea,1110,1,[The World of Ice & Fire],[],[]
3663,Zhoe Blanetree,[],Zhoe Blanetree,269,21,[],[],[]
3664,Zia Frey,[],Zia Frey,64,16,[],[],[]


In [73]:
df2.to_parquet('pages.parquet')

In [74]:
pd.read_parquet('pages.parquet')

Unnamed: 0,name,surname,infobox_name,text_length,rank,books_mentioned,books_appearance,books_POV
0,A certain man,[],A certain man,1254,3,[A Clash of Kings],[],[]
1,Abelar Hightower,[],Ser Abelar Hightower,508,3,[],[The Hedge Knight],[]
2,Abelon,[],Archmaester Abelon,102,0,[Fire and Blood],[],[]
3,Addam of Duskendale,[\nAddam of Duskendale],Addam of Duskendale,175,0,[The World of Ice & Fire],[],[]
4,Addam Frey,[],Ser Addam Frey,586,14,[],[The Mystery Knight],[]
...,...,...,...,...,...,...,...,...
3661,Zharaq zo Loraq,[\nThe Liberator],Zharaq zo Loraq,109,1,[A Dance with Dragons],[],[]
3662,Zhea,"[Zhea the Barren, Zhea Zorseface, Zhea the Cruel]",Zhea,1110,1,[The World of Ice & Fire],[],[]
3663,Zhoe Blanetree,[],Zhoe Blanetree,269,21,[],[],[]
3664,Zia Frey,[],Zia Frey,64,16,[],[],[]
