# Script to Extract the Points from a PDF

In [65]:
class Event:
    """Parse an event label from the points table into structured fields.

    The label is decoded from its suffixes/prefixes (for example ``'sh'``,
    ``'mix'``, ``'SC'``, ``'H'``, ``'W'``, ``'4x'``, ``'rd'``, ``'km'``,
    ``'m'``, ``'Mile(s)'``). Whitespace between the components of a label is
    optional.

    Attributes:
        name: The label exactly as passed in.
        distance: Distance in kilometres (relays: four times the leg
            distance), or ``None`` for jumps, throws, combined events and
            labels whose distance cannot be recognised.
        category: One of 'Steeplechase', 'Hurdles', 'Racewalk', 'Jumps',
            'Throws', 'Combined Events', 'Relays', 'Sprints',
            'Middle Distance', 'Long Distance'; stays 'Runs' when a running
            event has no recognisable distance.
        surface: 'shorttrack', 'road' or 'outdoor'. Remains ``None`` for
            jumps, throws and combined events that are not marked 'sh',
            because no distance parsing happens for them.
        mixed: True when the label carries the 'mix' suffix.

    Raises:
        ValueError: If a distance suffix is present but the part in front of
            it is not an integer (e.g. ``'abc km'``).
    """

    def __init__(self, name):
        self.name = name
        self.distance = None
        self.category = None
        self.surface = None
        self.mixed = False
        base_name = self._get_category(self._is_indoor())
        # Field and combined events have no distance to parse.
        if self.category not in ('Jumps', 'Throws', 'Combined Events'):
            self._get_distance(base_name)
        self._run_categories()

    def _is_indoor(self):
        """Strip a trailing 'sh' marker, recording the short-track surface.

        Returns the stripped label without the marker.
        """
        name = self.name.strip()
        if name.endswith('sh'):
            self.surface = 'shorttrack'
            # Drop the two marker characters, then any separating space.
            return name[:-2].rstrip()
        return name

    def _get_category(self, name):
        """Set ``category``/``mixed`` from the label and return what is left.

        The returned string has the category marker removed for
        steeplechase, hurdles, racewalk and relays; for every other category
        it is returned unchanged (apart from the 'mix' suffix).
        """
        name = name.strip()
        if name.endswith('mix'):
            self.mixed = True
            # 'mix' is three characters long; removing only two would leave
            # a stray 'm' that later breaks the distance parsing.
            name = name[:-3].rstrip()
        if name.endswith('SC'):
            self.category = 'Steeplechase'
            name = name[:-2].rstrip()
        elif name.endswith('H'):
            self.category = 'Hurdles'
            name = name[:-1].rstrip()
        elif name.endswith('W'):
            self.category = 'Racewalk'
            name = name[:-1].rstrip()
        elif name.endswith(('J', 'V')):
            self.category = 'Jumps'
        elif name.endswith(('T', 'P')):
            self.category = 'Throws'
        elif name.endswith('.'):
            self.category = 'Combined Events'
        elif name.startswith('4x'):
            self.category = 'Relays'
            name = name[2:].lstrip()
        else:
            self.category = 'Runs'
        return name

    def _get_distance(self, name):
        """Set ``distance`` (km) and ``surface`` from the remaining label."""
        name = name.strip()
        if name.endswith('rd'):
            name = name[:-2].rstrip()
            self.surface = 'road'

        if name == 'Marathon':
            self.distance = 42.195
            self.surface = 'road'
        elif name == 'HM':
            self.distance = 21.0975
            self.surface = 'road'
        elif name.endswith('km'):
            # int() tolerates surrounding whitespace, so '10 km' works.
            self.distance = int(name[:-2])
            self.surface = 'road'
        elif name.endswith('m'):
            # Metre labels may contain thousands separators ('10,000m').
            self.distance = int(name[:-1].replace(',', '')) / 1000
        elif name.endswith('Miles'):
            self.distance = int(name[:-5]) * 1.60934
        elif name.endswith('Mile'):
            self.distance = 1.60934

        if self.surface is None:
            self.surface = 'outdoor'

        # A relay label gives the leg distance; store the total. Skip when
        # the leg distance could not be recognised.
        if self.category == 'Relays' and self.distance is not None:
            self.distance *= 4

    def _run_categories(self):
        """Refine the generic 'Runs' category by distance."""
        if self.category != 'Runs' or self.distance is None:
            return
        if self.distance <= 0.5:
            self.category = 'Sprints'
        elif self.distance < 3:
            self.category = 'Middle Distance'
        else:
            self.category = 'Long Distance'
    
# Quick manual check: parse one label and show every parsed field.
string = '10 km'

Ev = Event(string)
print(*(getattr(Ev, field)
        for field in ('name', 'distance', 'category', 'surface', 'mixed')))

10 km 10 Long Distance road False


In [66]:
import re
import json
import fitz

# Source PDF, opened once at import time and indexed by page number below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
points_table_pdf = fitz.open("C:\\Users\\tbwil\\Documents\\Projects\\IAAF_Points\\2025_points_table.pdf")

# Page index passed to the first extract_event_points() call.
page_start = 8
# Number of consecutive pages read per extract_event_points() call.
num_pages = 28
# Phrases (with their leading space) that get_rows() must keep attached to the
# preceding token instead of splitting on the space.
suffixes = [' sh', ' SC', ' km', ' Mile', ' Miles']

def get_rows(page, protected_phrases=None):
    """Tokenise page text on spaces/newlines while keeping some phrases whole.

    Args:
        page: Text to split.
        protected_phrases: Optional list of substrings (which may contain
            spaces) that must survive the split intact. Each one is masked
            with a placeholder before splitting and restored afterwards.

    Returns:
        List of non-empty, stripped tokens in their original order.
    """
    phrases = protected_phrases if protected_phrases is not None else []

    # One placeholder per phrase, numbered by the phrase's position.
    token_for = {}
    for index, phrase in enumerate(phrases):
        token_for[phrase] = f"__PLACEHOLDER_{index}__"

    # Mask the phrases so their spaces are not treated as separators.
    masked = page
    for phrase, token in token_for.items():
        masked = masked.replace(phrase, token)

    def unmask(piece):
        # Put the original phrases back into a single token.
        for phrase, token in token_for.items():
            piece = piece.replace(token, phrase)
        return piece

    rows = []
    for piece in re.split(r'[\n ]+', masked):
        piece = piece.strip()
        if piece:
            rows.append(unmask(piece))
    return rows

def get_start_loc(rows, page_num):
    """Return the index of the last row that qualifies as a page start marker.

    A row qualifies when, ignoring spaces, it is a whole number that is a
    multiple of 50 and not greater than ``1400 - 50 * page_num``.
    NOTE(review): presumably this is the first points value printed on the
    page -- confirm against the PDF layout.

    Args:
        rows: Tokens of one page, as produced by ``get_rows``.
        page_num: Zero-based page offset within the current table.

    Returns:
        Index of the last qualifying row, or 0 when none qualifies.
    """
    max_points = 1400 - 50 * page_num
    start = 0
    for i, row in enumerate(rows):
        # Validate and parse the same space-free text; parsing the raw row
        # would fail on values such as '1 400'. isdecimal() only accepts
        # characters that int() can convert.
        digits = row.replace(' ', '')
        if not digits.isdecimal():
            continue
        value = int(digits)
        if value % 50 == 0 and value <= max_points:
            start = i
    return start

def extract_event_points(page_start):
    """Collect the table columns that start at page ``page_start``.

    The first page is scanned for the rows '1400' and '1399'; the number of
    tokens between them gives the number of event columns, and the tokens
    immediately before '1400' are taken as the column headers. Then
    ``num_pages`` consecutive pages are read and their tokens distributed
    round-robin over ``num_events + 1`` columns, keeping at most 50 values
    per column per page.

    Args:
        page_start: Index of the first page in ``points_table_pdf``.

    Returns:
        Tuple ``(headers, event_list)``: ``headers`` is the list of header
        tokens; ``event_list`` holds ``num_events + 1`` lists, column 0
        being the one that receives the page start marker.
    """
    # Work out the table width from the first page.
    first_rows = get_rows(points_table_pdf[page_start].get_text(), suffixes)
    num_events = 0
    start = 0
    for idx, cell in enumerate(first_rows):
        if cell.isdigit():
            value = int(cell)
            if value == 1400:
                start = idx
            elif value == 1399:
                num_events = idx - start - 1
        if num_events != 0:
            break

    # Header tokens sit directly in front of the '1400' row.
    headers = first_rows[start - num_events:start]
    n_columns = num_events + 1
    event_list = [[] for _ in range(n_columns)]

    # Walk the pages and deal the tokens out column by column.
    for page_num in range(page_start, page_start + num_pages):
        rows = get_rows(points_table_pdf[page_num].get_text(), suffixes)
        page_odd = page_num % 2
        # On odd pages the data begins num_events tokens before the marker
        # and the column rotation is shifted by one.
        start_loc = get_start_loc(rows, page_num - page_start) - num_events * page_odd
        columns = [[] for _ in range(n_columns)]
        for i in range(start_loc, len(rows)):
            columns[(i - start_loc + page_odd) % n_columns].append(rows[i])
        # Keep only the first 50 values of each column for this page.
        for j, column in enumerate(columns):
            event_list[j].extend(column[:50])

    return headers, event_list

# Accumulators for the men's and women's tables.
M_headers, W_headers = [], []
M_event_scores, W_event_scores = [], []

# 26 tables spaced 30 pages apart: the first 13 go to the men's lists, the
# remaining 13 to the women's lists.
for i in range(26):
    header, event_list = extract_event_points(page_start + i * 30)
    if i == 0:
        # Column 0 of the first table is kept as the shared points column.
        points = event_list[0]
    if i < 13:
        M_event_scores.extend(event_list[1:])
        M_headers.extend(header)
    else:
        W_event_scores.extend(event_list[1:])
        W_headers.extend(header)

def _build_event_results(headers, event_scores):
    """Build the per-event result mapping for one gender.

    Labels that would otherwise be ambiguous are renamed in ``headers`` in
    place: the second and later plain 'Mile' columns become 'Mile rd', and
    '10 Miles' becomes '10 Miles rd'.

    Args:
        headers: Event labels, one per entry of ``event_scores``.
        event_scores: One list of table values per event.

    Returns:
        Dict mapping the (possibly renamed) label to a dict with the keys
        'Distance', 'Category', 'Surface', 'Mixed' and 'Scores'.
    """
    results = {}
    seen_mile = False
    for i, scores in enumerate(event_scores):
        if headers[i] == 'Mile':
            if seen_mile:
                headers[i] = 'Mile rd'
            seen_mile = True
        elif headers[i] == '10 Miles':
            headers[i] = '10 Miles rd'
        event = Event(headers[i])
        results[headers[i]] = {'Distance': event.distance,
                               'Category': event.category,
                               'Surface': event.surface,
                               'Mixed': event.mixed,
                               'Scores': scores}
    return results


# Both genders use the same entry layout; the per-event column is stored
# under 'Scores' for men and women alike.
M_dict_results = _build_event_results(M_headers, M_event_scores)
W_dict_results = _build_event_results(W_headers, W_event_scores)

# Bundle the shared points column with both per-gender mappings and write
# the result as indented JSON.
total_results = {
    'Points': points,
    'Men': M_dict_results,
    'Women': W_dict_results,
}

json_results = json.dumps(total_results, indent=4)
with open('2025_points_table.json', 'w') as out_file:
    out_file.write(json_results)