In [540]:
import requests
import re
import os
from bs4 import BeautifulSoup
import pickle
import bz2file as bz2

In [541]:
# Root URL of the Game of Thrones Fanon wiki. It ends with "/" and is used as a
# plain string prefix when building listing and character page URLs.
baseUrl = "https://gameofthronesfanon.fandom.com/"

In [542]:
def save(title, data):
    """Pickle ``data`` into the bz2-compressed file ``<title>.pbz2``.

    The ".pbz2" suffix is appended to ``title``; the file is opened in
    write mode, so an existing file of that name is replaced.
    """
    target = f"{title}.pbz2"
    with bz2.BZ2File(target, "w") as archive:
        pickle.dump(data, archive)

In [543]:
def load(title):
    """Return the object stored in the bz2-compressed pickle ``<title>.pbz2``.

    NOTE: unpickling can execute code embedded in the file, so only use this
    on cache files that were written by ``save`` from a trusted run.
    """
    source = f"{title}.pbz2"
    with bz2.BZ2File(source, "r") as archive:
        restored = pickle.load(archive)
    return restored

In [544]:
def fetch_page(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status (raised by ``raise_for_status``).
        requests.exceptions.RequestException: For connection problems or
            when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

In [545]:
def save_to_file(content, filename):
    """Write the string ``content`` to ``filename`` as UTF-8 text.

    The file is opened in write mode, so previous contents are replaced.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        out.write(content)

In [546]:
# Build one character-category listing URL per starting letter (a-z);
# each URL asks the category page to start listing from that letter.
urls = [
    f"{baseUrl}wiki/Category:Characters?from={letter}"
    for letter in "abcdefghijklmnopqrstuvwxyz"
]

In [547]:
# Get all characters from the Game of Thrones Fanon Wiki
# The concatenated listing pages are cached on disk; delete the file to force
# a fresh download on the next run.
all_characters_path = '../data/all_characters.html'
if not os.path.exists(all_characters_path):
    # Create the target directory *before* downloading, so a missing ../data
    # folder cannot discard every successful request when saving at the end.
    os.makedirs(os.path.dirname(all_characters_path), exist_ok=True)

    pages = []
    for url in urls:
        # fetch_page raises on HTTP errors, which aborts the cell before a
        # partial listing can be cached; only an empty body reaches the else.
        content = fetch_page(url)
        if content:
            pages.append(content)
        else:
            print(f"Failed to fetch {url}")

    # Join once instead of growing a string with += inside the loop.
    all_content = "".join(pages)
    save_to_file(all_content, all_characters_path)

In [548]:
class BiographicalInfo:
    """Holds a character's name, birth year and birth place.

    The values are stored exactly as passed in; no validation or
    conversion is performed.
    """

    def __init__(self, name, birth_year, birth_place):
        self.name, self.birth_year, self.birth_place = name, birth_year, birth_place

    def __repr__(self):
        # Sentence-style summary: "<name> was born in <place> in <year>"
        who, where, when = self.name, self.birth_place, self.birth_year
        return f"{who} was born in {where} in {when}"

In [549]:
class PoliticalInfo:
    """Holds the houses and titles associated with a character.

    Both values are stored as given, without validation.
    """

    def __init__(self, houses, titles):
        self.houses, self.titles = houses, titles

    def __repr__(self):
        # Rendered as "<titles> of <houses>"
        titles, houses = self.titles, self.houses
        return f"{titles} of {houses}"

In [550]:
class PersonalInfo:
    """Personal and family details of a character.

    All values are stored exactly as passed in.

    The ``religon`` parameter and attribute keep their original (misspelled)
    name so existing callers and previously pickled instances keep working;
    the correctly spelled ``religion`` property is an alias for it.
    """

    def __init__(self, aka, culture, religon, father, mother, spouses, issues, siblings):
        self.aka = aka
        self.culture = culture
        self.religon = religon
        self.father = father
        self.mother = mother
        self.spouses = spouses
        self.issues = issues
        self.siblings = siblings

    @property
    def religion(self):
        """Correctly spelled alias for the ``religon`` attribute."""
        return self.religon

    @religion.setter
    def religion(self, value):
        # Write through to the stored attribute so both spellings stay in sync.
        self.religon = value

    def __repr__(self):
        return f"{self.aka} {self.culture} {self.religon} {self.father} {self.mother} {self.spouses} {self.issues} {self.siblings}"

In [551]:
class Character:
    """A character page: its name, URL, raw HTML and extracted text.

    ``references`` starts as an empty list and the three ``*_info``
    attributes start as ``None``; later notebook cells fill them in.
    """

    def __init__(self, name, url, html, text):
        # Identity and raw page content
        self.name = name
        self.url = url
        self.html = html
        self.text = text
        # Placeholders populated after construction
        self.references = list()
        self.biographical_info = self.personal_info = self.political_info = None

    def __repr__(self):
        name, url = self.name, self.url
        return f"Character(name={name}, url={url})"

In [552]:
def get_text(url):
    """Download ``url`` and return the text content of the page.

    The page is fetched through ``fetch_page`` so that an HTTP error status
    raises instead of the error page being parsed and returned as if it were
    real content.

    Args:
        url: Address of the page to download.

    Returns:
        The text extracted by BeautifulSoup's ``get_text``: text fragments are
        stripped of surrounding whitespace and joined with newlines.

    Raises:
        Whatever ``fetch_page`` raises for a failed request.
    """
    html = fetch_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(separator="\n", strip=True)

In [553]:
def get_html(url):
    """Return the raw HTML of ``url`` by delegating to ``fetch_page``."""
    return fetch_page(url)

In [554]:
def create_character(name, url):
    """Download a character page once and wrap it in a ``Character``.

    The page is requested a single time; the text is extracted from that same
    HTML (with the same BeautifulSoup settings ``get_text`` uses) instead of
    downloading the page a second time. This halves the number of requests and
    guarantees that ``html`` and ``text`` describe the same response.

    Args:
        name: Display name of the character.
        url: Address of the character's page.

    Returns:
        A ``Character`` holding the name, URL, raw HTML and extracted text.

    Raises:
        Whatever ``get_html`` raises for a failed request.
    """
    html = get_html(url)
    text = BeautifulSoup(html, 'html.parser').get_text(separator="\n", strip=True)
    return Character(name, url, html, text)

In [555]:
def get_characters_from_matches(matches):
    """Create a ``Character`` for every (href, title) pair that is a character link.

    Pairs whose title contains "Category:" or "User:" are skipped. Building a
    character is best effort: if it fails, the failure is printed and the
    remaining pairs are still processed, instead of the character vanishing
    without a trace.

    Args:
        matches: Iterable of ``(href, title)`` pairs; ``href`` is appended to
            ``baseUrl`` to form the page URL.

    Returns:
        List of the characters that could be created, in input order.
    """
    skip = ("Category:", "User:")
    characters = []
    for href, title in matches:
        if any(marker in title for marker in skip):
            continue
        url = baseUrl + href
        try:
            characters.append(create_character(title, url))
        except Exception as error:
            # Keep going, but make the dropped character visible to the reader.
            print(f"Failed to create character {title!r} from {url}: {error}")

    return characters

In [556]:
def get_all_character_matches_of_html():
    """Extract (href, title) pairs from the saved character listing.

    Reads '../data/all_characters.html' and returns one tuple for every
    ``<a href="..." title="...">`` opening found, in document order.
    """
    link_regex = r'<a\s+href="([^"]+)"\s+title="([^"]+)"'
    with open('../data/all_characters.html', 'r', encoding='utf-8') as listing:
        markup = listing.read()
    return re.findall(link_regex, markup)

In [557]:
# Load the characters (name, url, raw html and page text) from the compressed
# cache when it exists; otherwise build them from the saved listing and cache them.
path = '../data/characters'
characters = []
if not os.path.exists(f"{path}.pbz2"):
    matches = get_all_character_matches_of_html()
    characters = get_characters_from_matches(matches)
    save(path, characters)
else:
    characters = load(path)

In [558]:
# add references to the characters
# A character references another one when its page HTML contains a link whose
# URL (baseUrl + href) equals that other character's URL.
pattern = r'<a\s+href="([^"]+)"'
all_character_urls = [character.url for character in characters]

# Index the characters by URL once, instead of scanning the whole list for
# every link. setdefault keeps the FIRST character seen for a URL, matching the
# previous "first match in the list" lookup when a URL occurs more than once.
characters_by_url = {}
for character in characters:
    characters_by_url.setdefault(character.url, character)

for character in characters:
    # Start from an empty list so re-running this cell does not append the
    # same references a second time.
    character.references = []
    for href in re.findall(pattern, character.html):
        url = baseUrl + href
        reference = characters_by_url.get(url)
        # Ignore links to unknown pages and links back to the page itself.
        if reference is not None and url != character.url:
            character.references.append(reference)

In [559]:
class CharacterAttribute:
    """Texts, and optionally link targets, read from one ``data-source`` element.

    ``hrefs`` starts as an empty list; callers assign it afterwards when
    link targets are available.
    """

    def __init__(self, data_source_title, texts):
        self.data_source_title, self.texts = data_source_title, texts
        self.hrefs = list()

    def __repr__(self):
        title, texts, hrefs = self.data_source_title, self.texts, self.hrefs
        return f"{title} {texts} {hrefs}"

In [560]:
def get_by_data_source(character, data_source_title, tag):
    """Read one ``data-source`` element from a character's HTML.

    Args:
        character: Object whose ``html`` attribute holds the page markup.
        data_source_title: Value of the ``data-source`` attribute to look for;
            only the first matching element is used.
        tag: Name of the descendant tags to collect (e.g. "a" or "div"), or
            ``None`` to collect the element's direct children instead.

    Returns:
        A ``CharacterAttribute`` with the ``.text`` of every collected node.
        When ``tag`` is "a", ``hrefs`` additionally holds each anchor's
        ``href`` in the same order, with ``None`` for anchors that have no
        ``href`` attribute (these previously raised ``KeyError``).
        ``None`` if no element with that ``data-source`` exists.
    """
    soup = BeautifulSoup(character.html, 'html.parser')
    container = soup.find(attrs={"data-source": data_source_title})
    if container is None:
        return None

    # find_all is the current spelling of the legacy findAll alias.
    elements = list(container) if tag is None else container.find_all(tag)

    attribute = CharacterAttribute(data_source_title, [element.text for element in elements])
    if tag == "a":
        # .get keeps hrefs aligned with texts even when an anchor lacks href.
        attribute.hrefs = [element.get("href") for element in elements]
    return attribute

In [561]:
def get_name(character):
    """Return the direct children of the "Title" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Title", tag=None)

In [562]:
def get_birth_year(character):
    """Return the ``div`` texts of the "Birth" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Birth", tag="div")

In [563]:
def get_birth_place(character):
    """Return the links of the "Birth" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Birth", tag="a")

In [564]:
def get_titles(character):
    """Return the links of the "Titles" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Titles", tag="a")

In [565]:
def get_houses(character):
    """Return the links of the "House" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="House", tag="a")

In [566]:
def get_AKA(character):
    """Return the ``div`` texts of the "AKA" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="AKA", tag="div")

In [567]:
def get_culture(character):
    """Return the links of the "Culture" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Culture", tag="a")

In [568]:
def get_religion(character):
    """Return the links of the "Religion" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Religion", tag="a")

In [569]:
def get_father(character):
    """Return the links of the "Father" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Father", tag="a")

In [570]:
def get_mother(character):
    """Return the links of the "Mother" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Mother", tag="a")

In [571]:
def get_spouses(character):
    """Return the ``div`` texts of the "Spouse" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Spouse", tag="div")

In [572]:
def get_issues(character):
    """Return the links of the "Issue" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Issue", tag="a")

In [573]:
def get_siblings(character):
    """Return the links of the "Siblings" data-source element as a CharacterAttribute (None if absent)."""
    return get_by_data_source(character, data_source_title="Siblings", tag="a")

In [574]:
def get_character_bibliographical_info(character):
    """Collect name, birth place and birth year of ``character`` into a BiographicalInfo."""
    return BiographicalInfo(
        name=get_name(character),
        birth_place=get_birth_place(character),
        birth_year=get_birth_year(character),
    )

In [575]:
def get_political_info(character):
    """Collect the houses and titles of ``character`` into a PoliticalInfo."""
    return PoliticalInfo(
        houses=get_houses(character),
        titles=get_titles(character),
    )

In [576]:
def get_personal_info(character):
    """Collect the personal and family attributes of ``character`` into a PersonalInfo.

    Note that PersonalInfo spells its religion parameter ``religon``.
    """
    return PersonalInfo(
        aka=get_AKA(character),
        culture=get_culture(character),
        religon=get_religion(character),
        father=get_father(character),
        mother=get_mother(character),
        spouses=get_spouses(character),
        issues=get_issues(character),
        siblings=get_siblings(character),
    )

In [577]:
# Attach the biographical, political and personal attributes to every
# character, reusing the compressed cache when it already exists.
path = '../data/characters_with_data'
if not os.path.exists(f"{path}.pbz2"):
    for character in characters:
        character.biographical_info = get_character_bibliographical_info(character)
        character.political_info = get_political_info(character)
        character.personal_info = get_personal_info(character)
    save(path, characters)
else:
    characters = load(path)