In [2]:
import json
import re

import requests
from bs4 import BeautifulSoup

### 1. Crawling the corpus index page

In [128]:
def get_html(link):
    """Collect the entries of every glossed text linked from an index page.

    Downloads ``link``, follows each ``<a target="_blank">`` anchor found on
    it (rewriting ``view_corpus_file_new`` to ``view_glossed_corpus`` in the
    href) and feeds every downloaded page to ``parse_page``.

    Args:
        link: URL of the page that lists the corpus texts.

    Returns:
        A flat list containing the entries ``parse_page`` returned for each
        linked text, in the order the anchors appear on the index page.
    """
    base_url = 'http://www.babel.gwi.uni-muenchen.de/'
    triplets = []
    # One session for the whole crawl, so the connection to the host can be
    # reused instead of being re-opened for every single text.
    with requests.Session() as session:
        page = session.get(url=link)
        page.encoding = 'utf-8'  # canonical codec name (was misspelled 'utf=8')
        # NOTE(review): no parser is named, so bs4 chooses one from those
        # installed; consider pinning it so results do not depend on the environment.
        parsed = BeautifulSoup(page.text)
        for anchor in parsed.find_all('a', {'target': '_blank'}):  # finding all texts
            href = anchor.get('href')
            if not href:
                # An anchor without a target address cannot be followed.
                continue
            # formulating text link: switch from the plain view to the glossed view
            text_link = base_url + href.replace('view_corpus_file_new', 'view_glossed_corpus')
            text_page = session.get(url=text_link)
            text_page.encoding = 'utf-8'
            triplets.extend(parse_page(text_page))
    return triplets

### 2. Parsing a single glossed text

In [127]:
def parse_page(html):
    """Extract lemma / word form / tags entries from one glossed-text page.

    Args:
        html: Response-like object whose ``.text`` attribute holds the HTML
            of a glossed text.

    Returns:
        A list of dicts with the keys ``lemma``, ``word_form`` and ``tags``,
        one per word table found inside the gloss tables. ``tags`` is a
        ``;``-separated string.
    """
    triplets = []
    # NOTE(review): no parser is named, so bs4 chooses one from those
    # installed; consider pinning it so results do not depend on the environment.
    parsed = BeautifulSoup(html.text)
    for table in parsed.find_all('table', {'class': 'table_arr2table_new_glossed_text'}):  # finding gloss tables
        for word in table.find_all('table'):  # finding word columns
            # Look the cells up once instead of re-scanning the table for every field.
            cells = word.find_all('td')
            if len(cells) < 5:
                # Fewer than five cells: the fields read below do not all
                # exist, so this table is skipped rather than raising IndexError.
                continue
            word_form = cells[0].text
            # Everything before the first '-' in cell 2 is taken as the stem.
            stem = cells[2].text.split('-')[0]
            # First tag: the leading '-'-separated piece of cell 4;
            # remaining tags: every piece of cell 3 after the first one.
            # presumably cell 4 is a POS line and cell 3 a gloss line -- TODO confirm
            tags = [cells[4].text.split('-')[0]]
            tags.extend(cells[3].text.split('-')[1:])
            triplets.append({'lemma': stem, 'word_form': word_form, 'tags': ';'.join(tags)})
    return triplets

In [129]:
# Crawl the corpus index page and gather the entries of every linked glossed text.
# Needs network access: one request for the index plus one per linked text.
triplets = get_html('http://www.babel.gwi.uni-muenchen.de/index.php?abfrage=KK_corpus&subnavi=corpus_pub')

In [130]:
# Number of entries collected by the crawl.
len(triplets)

83090

In [143]:
# Save the scraped entries to disk: tab-indented JSON, non-ASCII characters written as-is.
with open('website_parsed.json', 'w', encoding='utf-8') as f:
    serialized = json.dumps(triplets, indent='\t', ensure_ascii=False)
    f.write(serialized)

### 3. Converting data to our format

In [3]:
# Read the saved scrape back from disk so the conversion below does not need the network.
with open('website_parsed.json', 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [32]:
def conv(data):
    """Transliterate the lemma and word form of every entry.

    Each character sequence listed in the replacement table is rewritten to
    its target. All rules are applied in a single pass over the text, so the
    output of one rule is never rewritten by another. (With sequential
    ``str.replace`` calls, 'ɔː' -> 'o' and 'o' -> 'u' were both swallowed by
    the later 'u' -> 'ʉ' rule, collapsing three distinct vowels into 'ʉ'.)

    Args:
        data: Iterable of dicts with the keys ``lemma``, ``word_form`` and
            ``tags``.

    Returns:
        A new list of dicts with the same three keys; ``lemma`` and
        ``word_form`` are transliterated, ``tags`` is copied unchanged.
        The input entries are not modified.

    Raises:
        KeyError: If an entry lacks one of the three keys.
    """
    # source sequence -> replacement
    replacements = {
        'aː': 'ă', 'ɵː': 'ɵ', 'ɔː': 'o', 'sʲ': 'ś', 'eː': 'ɛ',
        'nʲ': 'ń', 'o': 'u', 'u': 'ʉ', 'ɬ': 'λ', 'β': 'w',
        'x': 'χ', 'ʃ': 'š', 'ʲ': 'ˊ',
    }
    # Longest sources first, so a longer sequence can never be shadowed by a
    # shorter key that happens to be its prefix.
    ordered_sources = sorted(replacements, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(source) for source in ordered_sources))

    def web2kaz(text):
        """Apply every replacement rule to ``text`` simultaneously."""
        return pattern.sub(lambda match: replacements[match.group(0)], text)

    return [
        {
            'lemma': web2kaz(entry['lemma']),
            'word_form': web2kaz(entry['word_form']),
            'tags': entry['tags'],
        }
        for entry in data
    ]

In [33]:
# Transliterate lemmas and word forms of the loaded entries; tags are carried over unchanged.
converted_data = conv(data)

In [34]:
# Number of converted entries (conv produces one output entry per input entry).
len(converted_data)

83090

In [35]:
# One tab-separated line (lemma, word form, tags) per entry; the set drops duplicate lines.
write_format = {
    f"{triplet['lemma']}\t{triplet['word_form']}\t{triplet['tags']}\n"
    for triplet in converted_data
}

In [36]:
# Write the unique lines in sorted order; every line already ends with a newline.
with open('website_unidraft.tsv', 'w', encoding='utf-8') as f:
    f.writelines(sorted(write_format))