# Data Preparation

In [25]:
import os
import pandas as pd

def create_dataframe_from_races_csvs_multi_years(base_path, years):
    required_columns = ['Pos', 'Car #', 'Class', 'Drivers', 'Team', 'Car', 'Time', 'Laps', 'Gap']
    all_data = []  # List to hold all data rows

    for year in years:  # Process each year in the list
        year_path = os.path.join(base_path, str(year))  # Path to the specific year

        if not os.path.exists(year_path):
            print(f"The directory for year {year} does not exist.")
            continue  # Skip to the next year if the directory doesn't exist

        for meeting in os.listdir(year_path):  # Iterate through all meetings in the year
            meeting_path = os.path.join(year_path, meeting)
            races_path = os.path.join(meeting_path, "Races")  # Path to the 'Races' folder

            if os.path.exists(races_path):
                for race_file in os.listdir(races_path):  # Iterate through all race files
                    file_path = os.path.join(races_path, race_file)
                    try:
                        df = pd.read_csv(file_path)
                        if set(required_columns).issubset(df.columns):
                            df['season'] = year
                            df['meeting'] = meeting.replace("_", " ")
                            df['race_name'] = race_file.replace(".csv", "").replace("_", " ")
                            all_data.append(df[["season", "meeting", "race_name"] + required_columns])
                        else:
                            print(f"Skipping {file_path} due to missing required columns.")
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

    # Concatenate all data into a single DataFrame
    final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
    return final_df


## Load the Data

In [26]:
base_path = ".\data_csv"  # Update to your path
years = [2021, 2022, 2023]
all_races = create_dataframe_from_races_csvs_multi_years(base_path, years)
print(all_races.head())

   season    meeting  race_name Pos  Car #    Class  \
0    2021  Barcelona  Main Race   1     88  Pro Cup   
1    2021  Barcelona  Main Race   2     54  Pro Cup   
2    2021  Barcelona  Main Race   3     32  Pro Cup   
3    2021  Barcelona  Main Race   4     63  Pro Cup   
4    2021  Barcelona  Main Race   5      4  Pro Cup   

                                             Drivers  \
0     Raffaele Marciello, Felipe Fraga, Jules Gounon   
1  Klaus Bachler, Christian Engelhart, Matteo Cai...   
2       Dries Vanthoor, Robin Frijns, Charles Weerts   
3  Mirko Bortolotti, Marco Mapelli, Andrea Caldar...   
4               Maro Engel, Luca Stolz, Nico Bastian   

                       Team                          Car      Time  Laps  \
0                  AKKA ASP             Mercedes-AMG GT3  1:47.211  95.0   
1        Dinamic Motorsport   Porsche 911 GT3-R (991.II)  1:47.148  95.0   
2                  Team WRT              Audi R8 LMS GT3  1:47.612  95.0   
3  Orange 1 FFF Racing Team 

In [39]:
result = all_races[(all_races['season'] == "2023") & 
                   (all_races['meeting'] == "CrowdStrike 24 Hours of Spa") & 
                   (all_races['race_name'] == 'Main Race after 8 hours')].reset_index(drop=True)

#print(all_races[(all_races['season'] == "2023") & (all_races['meeting'] == "CrowdStrike 24 Hours of Spa")]['race_name'].unique())

result1 = all_races[all_races['season'] == "2023"]
#result.to_csv("C:\\Users\\ireev\\Desktop\\Spa_2023_Main_Race_after_8_hours.csv")


Unnamed: 0,season,meeting,race_name,Pos,Car #,Class,Drivers,Team,Car,Time,Laps,Gap


## Diagnose the Data

In [20]:
races_2023.head()

Unnamed: 0,season,meeting,race_name,Pos,Car #,Class,Drivers,Team,Car,Time,Laps,Gap
0,2023,Barcelona,Main Race,1,51,Pro Cup,"Alessio Rovera, Robert Shwartzman, Nicklas Nie...",AF Corse - Francorchamps Motors,Ferrari 296 GT3,1:41.540,88.0,
1,2023,Barcelona,Main Race,2,71,Pro Cup,"Antonio Fuoco, Daniel Serra, Davide Rigon",AF Corse - Francorchamps Motors,Ferrari 296 GT3,1:41.665,88.0,0.422
2,2023,Barcelona,Main Race,3,96,Pro Cup,"Thomas Preining, Laurin Heinrich, Dennis Olsen",Rutronik Racing,Porsche 911 GT3 R (992),1:41.562,88.0,2.328
3,2023,Barcelona,Main Race,4,777,Pro Cup,"Maro Engel, Luca Stolz, Fabian Schiller",Mercedes-AMG Team AlManar,Mercedes-AMG GT3 EVO,1:41.467,88.0,2.77
4,2023,Barcelona,Main Race,5,88,Pro Cup,"Raffaele Marciello, Jules Gounon, Timur Bogusl...",AKKODIS ASP Team,Mercedes-AMG GT3 EVO,1:42.162,88.0,4.413


In [21]:
races_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2873 entries, 0 to 2872
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   season     2873 non-null   int64  
 1   meeting    2873 non-null   object 
 2   race_name  2873 non-null   object 
 3   Pos        2873 non-null   object 
 4   Car #      2873 non-null   int64  
 5   Class      2873 non-null   object 
 6   Drivers    2873 non-null   object 
 7   Team       2873 non-null   object 
 8   Car        2873 non-null   object 
 9   Time       2805 non-null   object 
 10  Laps       2851 non-null   float64
 11  Gap        2802 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 269.5+ KB
