In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [3]:
xml_data = open(f"121310-ivanteevka.xml", 'r', encoding="utf-8").read()

In [5]:
category_col_names = ['CAT_ID', 'CAT_NAME', 'CAT_LEVEL']
segment_col_names = ['SCP_ID', 'SCP_NAME', 'SCP_SNAM', 'CAT_ID']
club_col_names = ['PCT_ID', 'PCT_PLNAME']
participant_col_names = ['PCT_ID', 'PCT_PLNAME', 'PCT_CLBID', 'PCT_BDAY']
performance_col_names = ['PRF_ID', 'PAR_ID', 'SCP_ID', 'PRF_PLACE', 'PRF_POINTS', 'PRF_STAT', 'PRF_STRTIM']
component_col_names = ['PRF_ID', 'PRF_CMP_NUM', 'PRF_JDG_NUM', 'PRF_CMP_VALUE']
element_col_names = ['PRF_ID', 'PRF_INAE', 'PRF_ELM_NUM', 'PRF_JDG_NUM', 'PRF_GOE']
goes_convert = {
    '14': 5,
    '13': 4,
    '7': 3,
    '6': 2,
    '5': 1,
    '4': 0,
    '3': -1,
    '2': -2,
    '1': -3,
    '12': -4,
    '11': -5
}

In [7]:
def parse_event(xml_data):
    root = ET.XML(xml_data)
    for _, event in enumerate(root):
        categories_list = event.find('Categories_List')
        for _, category in enumerate(categories_list):
            parse_node(category, category_col_names, categories)
            segments_list = category.find('Segments_List')
            for _, segment in enumerate(segments_list):
                parse_node(segment, segment_col_names, segments)
                participant_list = segment.find('Participants_List')
                if participant_list:
                    parse_participant_list(participant_list)
                performance_list = segment.find('Performance_List')
                if performance_list:
                    parse_performance_list(performance_list)

In [9]:
def parse_node(node, node_cols, df):
    ret_val = {}
    for col in node_cols:
        ret_val[col] = node.get(col)
    df.loc[len(df.index)] = ret_val

In [11]:
def parse_participant_list(participant_list):
    for _, participant in enumerate(participant_list):
        for _, person_couple_team in enumerate(participant):
            if person_couple_team.tag != 'Person_Couple_Team':
                continue
            parse_node(person_couple_team, participant_col_names, participants)
            club = person_couple_team.find('Club')
            if club is not None:
                parse_node(club, club_col_names, clubs)

In [13]:
def parse_components(performance, df):
    ret_val = {}
    ret_val['PRF_ID'] = performance.get('PRF_ID')
    for c in ['01', '03', '05']:
        for j in range(1, 11):
            comp = performance.get(f'PRF_C{c}J{j:02d}')
            if comp:
                ret_val['PRF_CMP_NUM'] = c
                ret_val['PRF_JDG_NUM'] = j
                ret_val[f'PRF_CMP_VALUE'] = float(comp) / 100
                df.loc[len(df.index)] = ret_val

In [15]:
def parse_elements(performance, df):
    ret_val = {}
    ret_val['PRF_ID'] = performance.get('PRF_ID')
    for e in range(1, 20):
        element = performance.get(f'PRF_INAE{e:02d}')
        if element:
            ret_val['PRF_ELM_NUM'] = e
            ret_val['PRF_INAE'] = element
            
            for j in range(1, 11):
                judge_goe = performance.get(f'PRF_E{e:02d}J{j:02d}')
                if judge_goe and judge_goe != '9':
                    ret_val['PRF_JDG_NUM'] = j
                    ret_val['PRF_GOE'] = goes_convert[judge_goe]
                    df.loc[len(df.index)] = ret_val

In [17]:
def parse_performance_list(performance_List):
    for _, performance in enumerate(performance_List):
        parse_node(performance, performance_col_names, performances)
        parse_components(performance, components)
        parse_elements(performance, elements)
        # print(performance.get(f''))

In [19]:
categories = pd.DataFrame(columns=category_col_names)
segments = pd.DataFrame(columns=segment_col_names)
clubs = pd.DataFrame(columns=club_col_names)
participants = pd.DataFrame(columns=participant_col_names)
performances = pd.DataFrame(columns=performance_col_names)
components = pd.DataFrame(columns=component_col_names)
elements = pd.DataFrame(columns=element_col_names)
parse_event(xml_data)

  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if performance_list:
  if participant_list:
  if performance_list:
  if performance_list:
  if participant_list:
  if performance_list:


In [133]:
categories = categories.set_index('CAT_ID')
segments = segments.set_index('SCP_ID')
clubs = clubs.set_index('PCT_ID')
participants = participants.set_index('PCT_ID')
performances = performances.set_index('PRF_ID')
components = components.set_index('PRF_ID')
elements = elements.set_index('PRF_ID')

In [21]:
display(categories)
display(segments)
display(clubs)
display(participants)
display(performances)
display(components)
display(elements)

Unnamed: 0,CAT_ID,CAT_NAME,CAT_LEVEL
0,1,"3 юнoшecкий рaзряд, дeвoчки",6
1,2,"2 юнoшecкий рaзряд, мaльчики",5
2,3,"2 юнoшecкий рaзряд, дeвoчки",5
3,4,"1 юнoшecкий рaзряд, дeвoчки",4
4,5,"3 cпoртивный рaзряд, юнoши",3
5,6,"3 cпoртивный рaзряд, дeвушки",3
6,8,"2 cпoртивный рaзряд, дeвушки",2
7,9,"1 cпoртивный рaзряд, юнoши",1
8,10,"1 cпoртивный рaзряд, дeвушки",1
9,11,"юный фигуриcт, дeвoчки",u


Unnamed: 0,SCP_ID,SCP_NAME,SCP_SNAM,CAT_ID
0,1,Произвольная программа,ПП,1
1,2,Произвольная программа,ПП,2
2,3,Произвольная программа,ПП,3
3,4,Произвольная программа,ПП,4
4,5,Произвольная программа,ПП,5
5,6,Произвольная программа,ПП,6
6,9,Короткая программа,KП,8
7,10,Произвольная программа,ПП,8
8,11,Короткая программа,KП,9
9,12,Произвольная программа,ПП,9


Unnamed: 0,PCT_ID,PCT_PLNAME
0,1,"г.о. Дмитров, МУ ДО СШОР Альберта Демченко"
1,3,"г.о. Долгопрудный, АУ ""ФСК - ""Салют """
2,3,"г.о. Долгопрудный, АУ ""ФСК - ""Салют """
3,3,"г.о. Долгопрудный, АУ ""ФСК - ""Салют """
4,7,"г. Ивантеевка, МАУ ДО «СШ Лидер»"
...,...,...
158,27,"г.о. Щелково, МАУ ГОЩ «ФОК «ЛА» им. В.А. Треть..."
159,39,"г.о. Лыткарино, МАУ ДО «СШ Лыткарино»"
160,142,"г.о. Жуковский, МБУ ДО «СШ-ЦС «Метеор»"
161,15,"г.о. Клин, МБУ ДО СШОР «КЛИН СПОРТИВНЫЙ»"


Unnamed: 0,PCT_ID,PCT_PLNAME,PCT_CLBID,PCT_BDAY
0,2,Вера АНУЧКИНА,1,20160831
1,4,Нина ЯКУБОВИЧ,3,20170717
2,5,Дарья СОЛОДОВНИКОВА,3,20160107
3,6,Варвара БОТОВА,3,20170425
4,8,Ксения БОРМОТОВА,7,20150225
...,...,...,...,...
158,186,Ангелина МАТЫЦИНА,27,20080621
159,188,Милана ИЛЬИНА,39,20170130
160,189,Ксения МЕЙЕР,142,20180704
161,190,Дарья СЕВАСТЬЯНОВА,15,20161208


Unnamed: 0,PRF_ID,PAR_ID,SCP_ID,PRF_PLACE,PRF_POINTS,PRF_STAT,PRF_STRTIM
0,5,1,1,27,639,O,09:39:54
1,6,2,1,7,1690,O,10:41:50
2,7,3,1,16,1292,O,10:35:56
3,8,4,1,15,1299,O,10:50:22
4,9,5,1,11,1424,O,10:08:40
...,...,...,...,...,...,...,...
226,232,163,14,8,6562,O,18:59:37
227,1,165,15,2,1172,O,09:14:45
228,2,166,15,4,936,O,09:11:53
229,3,167,15,3,1145,O,09:09:29


Unnamed: 0,PRF_ID,PRF_CMP_NUM,PRF_JDG_NUM,PRF_CMP_VALUE
0,5,03,1,2.25
1,5,03,2,2.00
2,5,03,3,2.00
3,5,03,4,1.50
4,5,03,5,1.25
...,...,...,...,...
3200,4,05,1,3.25
3201,4,05,2,3.00
3202,4,05,3,2.75
3203,4,05,4,3.25


Unnamed: 0,PRF_ID,PRF_INAE,PRF_ELM_NUM,PRF_JDG_NUM,PRF_GOE
0,5,SSp,1,1,-5
1,5,SSp,1,2,-4
2,5,SSp,1,3,-4
3,5,SSp,1,4,-4
4,5,SSp,1,5,-4
...,...,...,...,...,...
8380,4,SSpB,5,1,0
8381,4,SSpB,5,2,0
8382,4,SSpB,5,3,0
8383,4,SSpB,5,4,0


In [23]:
clubs.drop_duplicates(inplace=True)

In [45]:
performances['PRF_STRTIM'] = pd.to_datetime(
    performances['PRF_STRTIM'], format="%H:%M:%S"
)
performances['PRF_POINTS'] = performances['PRF_POINTS'].astype('float64')
elements['PRF_GOE'] = elements['PRF_GOE'].astype('int32')

In [27]:
performances['PRF_POINTS'] = performances['PRF_POINTS'] / 100

In [29]:
performances[performances['PRF_POINTS'] == performances['PRF_POINTS'].max()]

Unnamed: 0,PRF_ID,PAR_ID,SCP_ID,PRF_PLACE,PRF_POINTS,PRF_STAT,PRF_STRTIM
172,205,173,12,1,106.94,O,1900-01-01 17:06:25


In [31]:
elements[elements['PRF_JDG_NUM'] == 1]['PRF_GOE'].mean()

-0.6929039952295766

In [33]:
elements[elements['PRF_JDG_NUM'] == 2]['PRF_GOE'].mean()

-0.7966607036374478

In [35]:
elements[elements['PRF_JDG_NUM'] == 3]['PRF_GOE'].mean()

-0.8491353607632678

In [37]:
elements[elements['PRF_JDG_NUM'] == 4]['PRF_GOE'].mean()

-0.8073941562313656

In [39]:
elements[elements['PRF_JDG_NUM'] == 5]['PRF_GOE'].mean()

-0.7996422182468694

In [41]:
ids = performances[performances['SCP_ID'] == '1']['PRF_ID']

In [51]:
elements.query('PRF_ID == "208"').pivot_table(values=['PRF_GOE'], columns=['PRF_JDG_NUM'], index=['PRF_ID', 'PRF_ELM_NUM', 'PRF_INAE'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PRF_GOE,PRF_GOE,PRF_GOE,PRF_GOE,PRF_GOE
Unnamed: 0_level_1,Unnamed: 1_level_1,PRF_JDG_NUM,1,2,3,4,5
PRF_ID,PRF_ELM_NUM,PRF_INAE,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
208,1,2A<<,-5.0,-5.0,-5.0,-5.0,-5.0
208,2,2Lze,-2.0,-3.0,-4.0,-4.0,-4.0
208,3,2A<<,-4.0,-4.0,-4.0,-4.0,-4.0
208,4,FSSp1,-1.0,-1.0,-1.0,-2.0,-2.0
208,5,2F+2T,0.0,0.0,0.0,0.0,1.0
208,6,LSp2,0.0,0.0,1.0,1.0,1.0
208,7,ChSq1,1.0,1.0,1.0,0.0,0.0
208,8,2F+2T,0.0,0.0,0.0,0.0,0.0
208,9,2Lo+1A+1A+SЕQ,-5.0,-5.0,-5.0,-5.0,-5.0
208,10,CCoSp2,-1.0,-1.0,-1.0,-1.0,-2.0
