In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib.parse
from tqdm.notebook import tqdm

In [2]:
def get_event_judges(title, date, place, online_table_link):
    """Collect the officials of every segment of one event into a DataFrame.

    The event's online results page is fetched, every link whose ``href``
    matches ``SEG<3 digits>OF`` is followed, and each table row of the linked
    page (except the first) contributes one output row.

    Args:
        title: Event title, copied verbatim into every output row.
        date: Event date, copied verbatim into every output row.
        place: Event place, copied verbatim into every output row.
        online_table_link: URL of the event's online results page, or None.

    Returns:
        None when ``online_table_link`` is None; otherwise a DataFrame with
        the columns ``title, date, place, online_link, category, segment,
        position, name`` (``online_link`` holds the URL of the officials page
        the row came from). The frame is empty when nothing could be parsed.
    """
    columns = ['title', 'date', 'place', 'online_link', 'category', 'segment', 'position', 'name']
    if online_table_link is None:
        return None
    page = requests.get(online_table_link)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Rows are collected in a plain list and turned into a frame once at the
    # end; growing a DataFrame with df.loc[len(df)] copies data on every row.
    records = []
    for link in soup.find_all('a', {'href': re.compile(r'SEG[0-9]{3}OF')}):
        off_link = urllib.parse.urljoin(online_table_link, link.get('href'))
        page = requests.get(off_link)
        if page.status_code != 200:
            continue
        off_soup = BeautifulSoup(page.text, 'html.parser')
        headers = off_soup.find_all('h2')
        if len(headers) < 2:
            continue
        # The second <h2> is expected to read "<category> - <segment> ...".
        parts = [x.strip() for x in headers[1].text.split('-')[:2]]
        if len(parts) < 2:
            # Heading without a separator: category/segment cannot be told apart.
            continue
        category, segment = parts
        for line in off_soup.find_all('tr')[1:]:
            cells = line.find_all('td')
            if len(cells) < 2:
                # Rows built from <th> cells or spacer rows carry no official.
                continue
            # A first cell holding only a non-breaking space marks a filler row.
            if cells[0].text != '\xa0':
                records.append(
                    [title, date, place, off_link, category, segment, cells[0].text, cells[1].text]
                )
    return pd.DataFrame(records, columns=columns)

In [3]:
def get_all_links(online_table_link):
    """Return absolute URLs of the segment pages linked from an event page.

    Args:
        online_table_link: URL of the event's online results page, or None.

    Returns:
        None when ``online_table_link`` is None; otherwise a list with one
        absolute URL per ``<a>`` whose ``href`` matches ``SEG[0-9]{3}.HTM``,
        resolved against ``online_table_link``, in document order.
    """
    if online_table_link is None:
        return None
    response = requests.get(online_table_link)
    document = BeautifulSoup(response.text, 'html.parser')
    segment_pattern = re.compile(r'SEG[0-9]{3}.HTM')
    return [
        urllib.parse.urljoin(online_table_link, anchor.get('href'))
        for anchor in document.find_all('a', {'href': segment_pattern})
    ]

In [20]:
def parse_officials(online_table_link, data, info):
    """Append the officials listed for an event to ``data``.

    The event's online results page is fetched, the first link whose ``href``
    matches ``SEG[0-9]{3}OF.HTM`` is followed, and every table row of that
    page with more than one ``<td>`` and a non-empty first cell is recorded.

    Args:
        online_table_link: URL of the event's online results page, or None.
        data: Dict of lists, mutated in place. Must contain the keys
            ``'function'`` and ``'name'`` plus every key of ``info``.
        info: Dict of per-event values appended alongside each official.

    Returns:
        None in every case. Nothing is appended when the link is None, the
        page has no officials link, or the officials page does not answer
        with status 200.
    """
    if online_table_link is None:
        return None
    page = requests.get(online_table_link)
    soup = BeautifulSoup(page.text, 'html.parser')
    # find() returns None when no anchor matches, so the anchor has to be
    # checked before its href is read.
    anchor = soup.find('a', {'href': re.compile(r'SEG[0-9]{3}OF.HTM')})
    if anchor is None:
        return None
    link = anchor.get('href')
    if link is None:
        return None
    link = urllib.parse.urljoin(online_table_link, link)
    page = requests.get(link)
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.text, 'html.parser')
    for line in soup.find_all('tr'):
        cells = line.find_all('td')
        if len(cells) > 1 and cells[0].text != "":
            data['function'].append(cells[0].text)
            data['name'].append(cells[1].text)
            # Repeat the event-level values so all lists stay the same length.
            for key in info:
                data[key].append(info[key])

In [4]:
def parse_result_line(line, data):
    """Parse one result-table row and append its fields to ``data``.

    A row is accepted only if it has at least ten ``<td>`` cells, its second
    cell contains a link whose text splits into two or three whitespace
    separated name parts, and cells 2, 3 and 5 hold numbers.

    Args:
        line: A table-row element exposing ``find_all('td')``.
        data: Dict of lists, mutated in place. Must contain the keys
            ``rank, firstname, middlename, lastname, club, tss, tes, pcs``.

    Returns:
        True when the row was appended, False when it was skipped. ``data``
        is left untouched for skipped rows.
    """
    cells = line.find_all('td')
    if len(cells) < 10:
        return False
    anchor = cells[1].a
    if anchor is None:
        # Name cell without a link: not a skater row.
        return False
    rank = cells[0].text
    full_name = anchor.text
    # Whatever follows the linked name inside the same cell is kept as the club.
    club = cells[1].text[len(full_name):]
    # Fold "ё" into "е" so spelling variants of the same name compare equal.
    full_name = full_name.replace("ё", "е").replace("Ё", "Е")
    club = club.replace("ё", "е").replace("Ё", "Е")
    name_parts = full_name.split()
    # NOTE(review): the first token is stored as 'firstname' and the last as
    # 'lastname'; if the site prints the surname first the columns are
    # swapped — confirm against the source pages.
    if len(name_parts) == 2:
        first_name, last_name = name_parts
        middle_name = ""
    elif len(name_parts) == 3:
        first_name, middle_name, last_name = name_parts
    else:
        return False
    try:
        tss = float(cells[2].text)
        tes = float(cells[3].text)
        pcs = float(cells[5].text)
    except ValueError:
        # Empty or non-numeric score cells: skip the row instead of aborting
        # the whole scrape.
        return False
    # Append only after everything parsed so the lists stay the same length.
    data['rank'].append(rank)
    data['firstname'].append(first_name)
    data['middlename'].append(middle_name)
    data['lastname'].append(last_name)
    data['club'].append(club)
    data['tss'].append(tss)
    data['tes'].append(tes)
    data['pcs'].append(pcs)
    return True

In [5]:
def get_category_and_segment(page):
    """Extract the category and segment names from a result page heading.

    The second ``<h2>`` of the page is split on ``' - '`` and the first two
    parts are returned stripped.

    Args:
        page: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A list with the first (at most two) parts of the heading, normally
        ``[category, segment]``, or None when the page has fewer than two
        ``<h2>`` elements. The list has a single element when the heading
        contains no ``' - '`` separator.
    """
    soup = BeautifulSoup(page.text, 'html.parser')
    headers = soup.find_all('h2')
    # Index 1 is read below, so at least two headings are required.
    if len(headers) < 2:
        return None
    return [x.strip() for x in headers[1].text.split(' - ')[:2]]

In [6]:
def parse_results(entries_link, data, info):
    """Append every result row of one segment page to ``data``.

    Args:
        entries_link: URL of the segment result page, or None.
        data: Dict of lists, mutated in place. Must contain ``'category'``,
            ``'segment'``, every key of ``info`` and the keys filled by
            ``parse_result_line``.
        info: Dict of per-event values appended alongside each result row.

    Returns:
        None in every case. Nothing is appended when the link is None, the
        page does not answer with status 200, or its heading does not yield
        both a category and a segment.
    """
    if entries_link is None:
        return None
    page = requests.get(entries_link)
    if page.status_code != 200:
        return None
    header = get_category_and_segment(page)
    # The helper yields None (or a short list) for pages without the expected
    # "<category> - <segment>" heading; unpacking that would raise.
    if header is None or len(header) != 2:
        return None
    category, segment = header
    soup = BeautifulSoup(page.text, 'html.parser')
    for line in soup.find_all('tr'):
        # parse_result_line appends the per-skater fields itself and reports
        # whether the row was accepted; the shared fields follow only then.
        if parse_result_line(line, data):
            data['category'].append(category)
            data['segment'].append(segment)
            for key in info:
                data[key].append(info[key])

In [22]:
def parse_event(event_link, data, judges):
    """Scrape one event page: its officials and the results of every segment.

    The date is the part of the ``competition-date`` block before the first
    ``-``, the place comes from the ``competition-place`` block, and the
    online results URL from the first link inside the ``competition-file``
    block. Events without such a link are skipped.

    Args:
        event_link: URL of the event page.
        data: Dict of lists receiving result rows (filled by ``parse_results``).
        judges: Dict of lists receiving officials (filled by ``parse_officials``).

    Returns:
        None.
    """
    soup = BeautifulSoup(requests.get(event_link).text, 'html.parser')
    date = soup.find("div", "competition-date").p.text.split('-')[0].strip()
    place = soup.find("div", "competition-place").p.span.text.strip()
    online = soup.find("div", "competition-file")
    # No file block, a block without a link, or a link without a target all
    # mean there are no online results to follow.
    if online is None or online.a is None:
        return
    online_link = online.a.get('href')
    if not online_link:
        return
    # Segment links are resolved relative to this URL, so it has to end with
    # a slash to be treated as a directory.
    if not online_link.endswith('/'):
        online_link += '/'
    info = {'date': date, 'place': place, 'online': online_link}
    parse_officials(online_link, judges, info)
    for segment in get_all_links(online_link):
        parse_results(segment, data, info)

In [13]:
def parse_season(season_link, data, judges):
    """Scrape every event linked from a season calendar page.

    Each table row whose second ``<td>`` contains a link is treated as an
    event; the link is handed to ``parse_event``. Progress is shown with
    ``tqdm``.

    Args:
        season_link: URL of the season calendar page.
        data: Dict of lists receiving result rows, passed to ``parse_event``.
        judges: Dict of lists receiving officials, passed to ``parse_event``.

    Returns:
        None.
    """
    page = requests.get(season_link)
    soup = BeautifulSoup(page.text, 'html.parser')

    for row in tqdm(soup.find_all('tr')):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Header rows built from <th> and spacer rows have no title cell.
            continue
        title = cells[1]
        if title.a is not None:
            parse_event(title.a.get('href'), data, judges)

In [24]:
for season in range(2024, 2025):
    # Two-digit labels of the season's start and end years, e.g. "24" and "25".
    start_tag, end_tag = str(season)[-2:], str(season + 1)[-2:]

    # Column-oriented accumulators; parse_season fills them in place.
    data = {column: [] for column in (
        'date', 'place', 'online', 'category', 'segment', 'rank',
        'firstname', 'middlename', 'lastname', 'club', 'tss', 'tes', 'pcs',
    )}
    judges = {column: [] for column in ('date', 'place', 'online', 'function', 'name')}

    print(f"season - {start_tag}-{end_tag}")
    parse_season(f"http://ffkkmo.ru/calendar/?season={season}", data, judges)

    # One CSV with the results and one with the officials per season.
    df = pd.DataFrame.from_dict(data)
    df.to_csv(f"ffkkmo_{start_tag}{end_tag}_dump.csv")
    df = pd.DataFrame.from_dict(judges)
    df.to_csv(f"ffkkmo_judges_{start_tag}{end_tag}_dump.csv")

season - 24-25


  0%|          | 0/59 [00:00<?, ?it/s]