# Data Preparation

In [40]:
import os
import pandas as pd

def create_dataframe_from_races_csvs_multi_years(base_path, years):
    """Combine race-result CSVs from several seasons into one DataFrame.

    The expected layout on disk is ``<base_path>/<year>/<meeting>/Races/*.csv``.
    Every CSV that contains all required result columns is loaded and tagged
    with the season, meeting and race name it was found under (underscores in
    folder/file names are turned into spaces).

    Parameters
    ----------
    base_path : str
        Root folder that holds one sub-folder per season.
    years : iterable
        Seasons to load; each value is converted with ``str()`` to form the
        sub-folder name.

    Returns
    -------
    pandas.DataFrame
        Columns ``season``, ``meeting``, ``race_name`` followed by the required
        result columns, with a fresh 0..n-1 index. When nothing could be
        loaded the frame is empty but still carries those columns, so
        column-based filters on the result do not raise ``KeyError``.

    Notes
    -----
    Missing season folders, CSVs lacking required columns and unreadable files
    are reported with ``print`` and skipped rather than raised. Entries in a
    ``Races`` folder that are not ``.csv`` files are ignored silently.
    """
    required_columns = ['Pos', 'Car #', 'Class', 'Drivers', 'Team', 'Car', 'Time', 'Laps', 'Gap']
    output_columns = ["season", "meeting", "race_name"] + required_columns
    frames = []  # one DataFrame per successfully loaded race file

    for year in years:
        year_path = os.path.join(base_path, str(year))

        # isdir (not exists): a stray file with the year's name would make
        # os.listdir raise instead of being skipped.
        if not os.path.isdir(year_path):
            print(f"The directory for year {year} does not exist.")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order of the result reproducible across runs and machines.
        for meeting in sorted(os.listdir(year_path)):
            races_path = os.path.join(year_path, meeting, "Races")
            if not os.path.isdir(races_path):
                continue

            for race_file in sorted(os.listdir(races_path)):
                file_path = os.path.join(races_path, race_file)
                race_stem, extension = os.path.splitext(race_file)

                # Only real CSV files are race results; skip sub-folders and
                # unrelated files instead of feeding them to read_csv.
                if extension.lower() != ".csv" or not os.path.isfile(file_path):
                    continue

                try:
                    df = pd.read_csv(file_path)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the load.
                    print(f"Error reading {file_path}: {e}")
                    continue

                if not set(required_columns).issubset(df.columns):
                    print(f"Skipping {file_path} due to missing required columns.")
                    continue

                # assign() builds a new frame, so the freshly read one is not
                # mutated and any extra columns in the CSV are dropped first.
                tagged = df[required_columns].assign(
                    season=year,
                    meeting=meeting.replace("_", " "),
                    race_name=race_stem.replace("_", " "),
                )
                frames.append(tagged[output_columns])

    if not frames:
        # Keep the schema even when nothing was loaded.
        return pd.DataFrame(columns=output_columns)

    return pd.concat(frames, ignore_index=True)


## Load the Data

In [45]:
# Root folder holding one sub-folder per season (update to your path).
# Built with os.path.join instead of a literal backslash path: "\d" is an
# invalid escape sequence in a regular string literal, and a backslash
# separator only resolves on Windows.
base_path = os.path.join(".", "data_csv")
years = [2021, 2022, 2023]  # seasons to load
all_races = create_dataframe_from_races_csvs_multi_years(base_path, years)

In [56]:

# Select the rows for one specific classification: season 2023, the
# "CrowdStrike 24 Hours of Spa" meeting, race "Main Race after 13 hours".
is_selected_race = (
    (all_races['season'] == 2023)
    & (all_races['meeting'] == "CrowdStrike 24 Hours of Spa")
    & (all_races['race_name'] == 'Main Race after 13 hours')
)
result = all_races[is_selected_race].reset_index(drop=True)

# Export to the current user's Desktop. The home directory is resolved at run
# time rather than hard-coding one user's profile folder, so the cell is not
# tied to a single machine.
export_path = os.path.join(
    os.path.expanduser("~"), "Desktop", "Spa_2023_Main_Race_after_13_hours.csv"
)
result.to_csv(export_path)


## Diagnose the Data

In [44]:
# Preview the first rows of the combined frame (head() defaults to five) to
# sanity-check the season/meeting/race_name tags and the result columns.
all_races.head()

Unnamed: 0,season,meeting,race_name,Pos,Car #,Class,Drivers,Team,Car,Time,Laps,Gap
0,2021,Barcelona,Main Race,1,88,Pro Cup,"Raffaele Marciello, Felipe Fraga, Jules Gounon",AKKA ASP,Mercedes-AMG GT3,1:47.211,95.0,
1,2021,Barcelona,Main Race,2,54,Pro Cup,"Klaus Bachler, Christian Engelhart, Matteo Cai...",Dinamic Motorsport,Porsche 911 GT3-R (991.II),1:47.148,95.0,2.174
2,2021,Barcelona,Main Race,3,32,Pro Cup,"Dries Vanthoor, Robin Frijns, Charles Weerts",Team WRT,Audi R8 LMS GT3,1:47.612,95.0,4.036
3,2021,Barcelona,Main Race,4,63,Pro Cup,"Mirko Bortolotti, Marco Mapelli, Andrea Caldar...",Orange 1 FFF Racing Team,Lamborghini Huracan GT3 Evo,1:47.027,95.0,9.511
4,2021,Barcelona,Main Race,5,4,Pro Cup,"Maro Engel, Luca Stolz, Nico Bastian",HRT,Mercedes-AMG GT3,1:47.588,95.0,9.984


In [46]:
# Summarise each column's dtype and non-null count to spot missing values and
# columns that were read as generic objects rather than numbers.
all_races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7606 entries, 0 to 7605
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   season     7606 non-null   int64  
 1   meeting    7606 non-null   object 
 2   race_name  7606 non-null   object 
 3   Pos        7606 non-null   object 
 4   Car #      7606 non-null   int64  
 5   Class      7606 non-null   object 
 6   Drivers    7606 non-null   object 
 7   Team       7606 non-null   object 
 8   Car        7606 non-null   object 
 9   Time       7467 non-null   object 
 10  Laps       7533 non-null   float64
 11  Gap        7389 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 713.2+ KB
