In [None]:
import os
import re
import sys
import sqlite3
import pandas as pd
sys.path.append('../src/')
from scraper.scraper import Scraper
from results.results import Results
from database.create_db import Database
from database.database import Event, Race, Results


In [None]:
# Database handle shared by the Event and Race records saved further down.
db_path = '../data/parsed_data.db'
db: Database = Database.create_database(path=db_path)


In [None]:
# One scraper instance is reused by every cell below.
scraper = Scraper()
# `events` is looked up by event code; `years` lists the years per event code.
events, years = scraper.getEvents(), scraper.getEventsYears()


In [None]:
# Only plausible edition years (1900-2099) are treated as years, so a purely
# numeric part of a name such as "6666" is kept.
_YEAR = r'(?:19|20)\d{2}'


def _clean_event_name(raw_name):
    """Strip edition years, French ordinals and HTML tags from an event name.

    Args:
        raw_name: Event name (str) as returned by ``scraper.getEvents()``.

    Returns:
        The cleaned name, with single spaces between words and no leading or
        trailing whitespace.
    """
    # Drop stand-alone year tokens; split()/join() also collapses whitespace.
    cleaned = ' '.join(word for word in raw_name.split() if not re.fullmatch(_YEAR, word))
    # Drop a year glued to the very start or end of the name (e.g. "Trail2019").
    cleaned = re.sub(rf'^{_YEAR}|{_YEAR}$', '', cleaned)
    # Drop French ordinals ("12e", "12ème", "12eme"). Longest suffix first and
    # word boundaries on both sides, so "12eme" does not leave a stray "me"
    # and digits inside a longer token are not clipped.
    cleaned = re.sub(r'\b\d{1,2}(?:ème|eme|e)\b', '', cleaned)
    # Drop HTML tags.
    cleaned = re.sub(r'<[^<]+?>', '', cleaned)
    # Removing text in the middle of a name can leave doubled spaces.
    return ' '.join(cleaned.split())


# Re-read the events from the scraper so this cell can be re-run on its own.
raw_events = dict(sorted(scraper.getEvents().items(), key=lambda item: item[1]))
events = {code: _clean_event_name(name) for code, name in raw_events.items()}
# Sort alphabetically by cleaned name (ties keep the raw-name order).
events = dict(sorted(events.items(), key=lambda item: item[1]))


In [None]:
# Store one Event row per (event code, year) pair known to the scraper.
for code, name in events.items():
    if code not in years:
        # No year information for this event: nothing to store.
        continue
    for year in years[code]:
        # The country is not known at this stage, hence None.
        Event(
            event_code=code,
            event_name=name,
            year=year,
            country=None,
            db=db,
        ).save_to_database()

In [None]:
# Used to resume from a partial download
def parse_done_txt_file(file_path):
    """Read the (event code, year) pairs that must be skipped from a text file.

    Each useful line holds exactly two whitespace-separated tokens: an event
    code followed by a year. Blank lines, lines with any other number of
    tokens and lines starting with ``#`` are ignored.

    Args:
        file_path: Path of the text file. The file is optional.

    Returns:
        dict mapping each event code to the list of its years, kept as
        strings in file order. Empty when the file does not exist.
    """
    parsed_data = {}
    # The skip file is optional: without it nothing is skipped.
    if not os.path.isfile(file_path):
        return parsed_data
    # utf-8-sig tolerates a BOM added by some editors; undecodable bytes in
    # free-text comment lines must not abort the parsing.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
        for line in file:
            parts = line.split()
            # A two-token comment such as "# note" must not be read as a race.
            if len(parts) != 2 or parts[0].startswith('#'):
                continue
            race_name, year = parts
            parsed_data.setdefault(race_name, []).append(year)
    return parsed_data



If you want to skip races, create a text file `output_parsing.txt` listing the races to ignore, one event code and year per line, for example:

```
saintelyon 2018
saintelyon 2017
saintelyon 2016
saintelyon 2015
saintelyon 2014
saintelyon 2013
penyagolosa 2024
penyagolosa 2023
penyagolosa 2022
penyagolosa 2021
penyagolosa 2019
# lut 2016 -> parcours.php is empty
lut 2016
# oxfamtrailwalkerhk 2021 -> Password protected
oxfamtrailwalkerhk 2021
```

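If you interrupt the execution, copy the output of the next cell into `output_parsing.txt`; the races already parsed will then be skipped on the next run. Each pair is printed before its scraping starts, so leave out the last printed line if that race was not finished.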

In [None]:
# Scrape every (event, year) pair that is not listed in output_parsing.txt and
# store one Race row for each of its races that has control points.
parsed_races = parse_done_txt_file('output_parsing.txt')
for event in events:
    if event not in years:
        continue
    for year in years[event]:
        # The skip file holds years as text, so compare on the string form;
        # otherwise a non-string year would never match and nothing is skipped.
        if str(year) in parsed_races.get(event, []):
            continue
        # This output can be pasted into output_parsing.txt to resume later.
        print(event, year)
        scraper.setEvents([event])
        scraper.setYears([year])
        cps = scraper.getControlPoints()
        if not cps:
            continue
        races = scraper.getRaces()
        rr = scraper.getRandomRunnerBib()
        scraper.downloadData()
        races_data = scraper.getRacesPhysicalDetails()
        if event not in races or year not in races[event]:
            # No race list available for this event/year: nothing to store.
            continue
        for race_code, race_name in races[event][year].items():
            if race_code not in cps:
                continue
            # 'maxirace' has two non-standard orientation races.
            # NOTE(review): this compares the race code, not the event code,
            # with 'maxirace' - confirm which one was intended.
            if race_code == 'maxirace' and 'Orientation' in race_name:
                continue
            if race_name.lower() == 'course des partenaires':
                continue
            event_id = Event.get_id_from_code_year(event, year, db)
            scraper.setRace(race_code)
            filepath = os.path.join(f'data/{event}', f'{event}_{race_code}_{year}.csv')
            # The path is stored only when the results CSV already exists.
            results_filepath = filepath if os.path.exists(os.path.join('../../', filepath)) else None
            bib = rr[year][race_code]
            race_info = scraper.getRaceInfo(bibN=bib) if bib is not None else {'date': None, 'hd': None}
            race_data = races_data[race_code]
            # Some races are empty but still have empty rows in the data
            # (e.g. 'templiers', 'Templi', 2019): race_info is then falsy.
            if race_info and race_info['date']:
                departure_datetime = ' '.join([race_info['date'], race_info['hd']])
            else:
                departure_datetime = None
            # The original passed elevation_pos for both elevations (copy-paste).
            # NOTE(review): assumes the key is named 'elevation_neg'; falls back
            # to the previous value when that key is absent - confirm.
            elevation_neg = race_data.get('elevation_neg', race_data['elevation_pos'])
            race = Race(
                race_id=race_code,
                event_id=event_id,
                race_name=race_name,
                distance=race_data['distance'],
                elevation_pos=race_data['elevation_pos'],
                elevation_neg=elevation_neg,
                departure_datetime=departure_datetime,
                results_filepath=results_filepath,
                db=db,
            )
            race.save_to_database()


# TO-DO in this notebook

- [ ] Load CSV results to DB