In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
# Download the Elder Speech wiki page and parse its HTML.
url = 'https://witcher.fandom.com/wiki/Elder_Speech'
# Without a timeout, requests waits indefinitely if the server stalls.
page = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Collect the first and second <td> of every row in the wiki's `fandom-table`
# tables: column 1 is kept as the Elder Speech side, column 2 as the English side.
elder_speech_words, english_words = (
    soup.select('table.fandom-table td:nth-child(1)'),
    soup.select('table.fandom-table td:nth-child(2)'),
)

In [4]:
def parse(word):
    """Return the cleaned text of a scraped table cell.

    Removes every ``[...]`` and ``(...)`` span (e.g. footnote markers and
    parenthetical remarks) from ``word.text`` and trims surrounding
    whitespace.

    Parameters
    ----------
    word : object with a ``text`` attribute
        Presumably a BeautifulSoup tag returned by ``soup.select``.

    Returns
    -------
    str
        The cell text without bracketed/parenthesised parts and without
        leading or trailing whitespace.
    """
    # Strip AFTER the substitution: removing a trailing "(...)" or "[...]"
    # otherwise leaves dangling spaces at the end of the result.
    return re.sub(r'\[[^\]]*\]|\([^)]*\)', '', word.text).strip()

# Replace each scraped cell with its cleaned text.
# NOTE: this rebinds the names from tags to plain strings, so the cell cannot
# be re-run without first re-running the selection cell above.
elder_speech_words = list(map(parse, elder_speech_words))
english_words = list(map(parse, english_words))

In [5]:
# Preview the first ten word pairs. Slicing + zip (instead of indexing with
# range(10)) avoids an IndexError if fewer than ten rows were scraped.
for elder_word, english_word in zip(elder_speech_words[:10], english_words[:10]):
    print(f'{elder_word:<15} ->       {english_word}')

a'baeth         ->       kiss, to kiss
aark, aark      ->       phonetic presentation of the sound ravens/crows make
abb             ->       mouth  
addan, adan     ->       dance; dancer; dancing
adhart          ->       forward
aé              ->       first person singular subject personal pronoun I 
aecáemm         ->       to follow
aedd            ->       shard, piece
aef             ->       to have, has
aefder          ->       later, after


In [6]:
# Build a two-column dataframe: one row per scraped (Elder Speech, English) pair.
translation = pd.DataFrame(
    data={
        'elder_speech': elder_speech_words,
        'english': english_words,
    }
)
translation.head()

Unnamed: 0,elder_speech,english
0,a'baeth,"kiss, to kiss"
1,"aark, aark",phonetic presentation of the sound ravens/crow...
2,abb,mouth
3,"addan, adan",dance; dancer; dancing
4,adhart,forward


In [7]:
# The first 243 rows are single Elder Speech words; everything after them is
# names or phrases. Show the rows around that boundary.
translation.iloc[240:245]

Unnamed: 0,elder_speech,english
240,zuirseime,Chaos
241,zvaere,swear
242,'ere,here
243,Aedd Gynvael,Shard of Ice
244,Aen N'og Mab Taedh'morc,"""Practices for a Young Bard"""


In [8]:
# Keep only the first 243 rows (the single words) by dropping every label
# from position 243 onwards.
translation = translation.drop(index=translation.index[243:])

# The dataframe now ends at the last word, which is `'ere`
translation.tail()

Unnamed: 0,elder_speech,english
238,zael,to rise
239,zireael,swallow
240,zuirseime,Chaos
241,zvaere,swear
242,'ere,here


In [9]:
# Some Elder Speech words have several English meanings; they will be split
# into separate rows so that each row carries exactly one meaning.
# The meanings are separated by either `,` or `;`, so unify them to `,` first.
translation = translation.assign(
    english=translation['english'].str.replace(';', ',')
)

In [10]:
# Turn each comma-separated English string into a list of meanings.
translation = translation.assign(
    english=translation['english'].str.split(',')
)
translation.head(10)

Unnamed: 0,elder_speech,english
0,a'baeth,"[kiss, to kiss]"
1,"aark, aark",[phonetic presentation of the sound ravens/cro...
2,abb,[mouth ]
3,"addan, adan","[dance, dancer, dancing]"
4,adhart,[forward]
5,aé,[first person singular subject personal pronou...
6,aecáemm,[to follow]
7,aedd,"[shard, piece]"
8,aef,"[to have, has]"
9,aefder,"[later, after]"


In [11]:
# One row per English meaning; renumber the rows afterwards because explode
# repeats the original index label for every element of a list.
translation = translation.explode('english').reset_index(drop=True)
translation.head(10)

Unnamed: 0,elder_speech,english
0,a'baeth,kiss
1,a'baeth,to kiss
2,"aark, aark",phonetic presentation of the sound ravens/crow...
3,abb,mouth
4,"addan, adan",dance
5,"addan, adan",dancer
6,"addan, adan",dancing
7,adhart,forward
8,aé,first person singular subject personal pronoun I
9,aecáemm,to follow


In [12]:
# Do the same with the Elder Speech column: unify `;` and `,`, split the
# variants into a list and give every variant its own row.
translation = (
    translation
    .assign(
        elder_speech=translation['elder_speech'].str.replace(';', ',').str.split(',')
    )
    .explode('elder_speech')
    # explode repeats index labels; renumber so the index stays unique,
    # as is done after exploding the `english` column.
    .reset_index(drop=True)
)

In [13]:
# Trim leading/trailing whitespace left over from splitting on commas.
for column in ('english', 'elder_speech'):
    translation[column] = translation[column].str.strip()

In [None]:
# Remove duplicate (elder_speech, english) pairs.
# drop_duplicates returns a NEW dataframe, so the result must be assigned back;
# otherwise the duplicates are still present when the CSV is written.
translation = (
    translation
    .drop_duplicates(subset=['elder_speech', 'english'])
    .reset_index(drop=True)
)
translation.head()

In [14]:
# Write the translation table to disk, omitting the row index column.
translation.to_csv(path_or_buf='elder_speech.csv', index=False)