# Data Preparation

In [3]:
import os
import pandas as pd

def create_dataframe_from_races_csvs(base_path, year):
    """Combine every race CSV of one season into a single DataFrame.

    The function walks ``<base_path>/<year>/<meeting>/Races/`` and reads each
    ``*.csv`` file found there. Files that cannot be parsed, or that lack any
    of the required result columns, are reported with ``print`` and skipped.

    Parameters
    ----------
    base_path : str
        Folder that contains one sub-folder per season.
    year : int or str
        Season to load; also written to the ``season`` column.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, with columns ``season``, ``meeting``,
        ``race_name`` followed by the required result columns. An empty
        DataFrame (no columns) is returned when the season folder is missing
        or no usable file is found.
    """
    required_columns = ['Pos', 'Car #', 'Class', 'Drivers', 'Team', 'Car', 'Time', 'Laps', 'Gap']
    all_data = []  # One DataFrame per successfully loaded race file

    year_path = os.path.join(base_path, str(year))  # Path to the specific year

    # isdir (not exists): a plain file with the year's name must not reach listdir.
    if not os.path.isdir(year_path):
        print(f"The directory for year {year} does not exist.")
        return pd.DataFrame()

    # Sorted listings make the row order reproducible; os.listdir order is arbitrary.
    for meeting in sorted(os.listdir(year_path)):
        races_path = os.path.join(year_path, meeting, "Races")  # Path to the 'Races' folder

        # Stray files at the season level and meetings without results are ignored.
        if not os.path.isdir(races_path):
            continue

        for race_file in sorted(os.listdir(races_path)):
            file_path = os.path.join(races_path, race_file)
            race_stem, extension = os.path.splitext(race_file)

            # Only real CSV files are race results; skip notes, sub-folders, etc.
            if extension.lower() != ".csv" or not os.path.isfile(file_path):
                continue

            try:
                df = pd.read_csv(file_path)
                if set(required_columns).issubset(df.columns):
                    df['season'] = year
                    df['meeting'] = meeting.replace("_", " ")
                    # Use the stem so only the trailing extension is removed.
                    df['race_name'] = race_stem.replace("_", " ")
                    all_data.append(df[["season", "meeting", "race_name"] + required_columns])
                else:
                    print(f"Skipping {file_path} due to missing required columns.")
            except Exception as e:
                # Deliberate best effort: one unreadable file must not abort the season.
                print(f"Error reading {file_path}: {e}")

    # Concatenate all data into a single DataFrame
    final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
    return final_df

In [6]:
# Root folder of the per-season CSV tree. Built with os.path.join so the path
# separator is portable and the literal contains no backslash escape ("\d").
base_path = os.path.join(".", "data_csv")  # Update to your path
year = 2023
df = create_dataframe_from_races_csvs(base_path, year)
print(df.head())

print(len(df))


   season    meeting  race_name  ...      Time  Laps    Gap
0    2023  Barcelona  Main Race  ...  1:41.540  88.0    NaN
1    2023  Barcelona  Main Race  ...  1:41.665  88.0  0.422
2    2023  Barcelona  Main Race  ...  1:41.562  88.0  2.328
3    2023  Barcelona  Main Race  ...  1:41.467  88.0  2.770
4    2023  Barcelona  Main Race  ...  1:42.162  88.0  4.413

[5 rows x 12 columns]
2873


In [7]:
# Per-column summary of the loaded frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2873 entries, 0 to 2872
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   season     2873 non-null   int64  
 1   meeting    2873 non-null   object 
 2   race_name  2873 non-null   object 
 3   Pos        2873 non-null   object 
 4   Car #      2873 non-null   int64  
 5   Class      2873 non-null   object 
 6   Drivers    2873 non-null   object 
 7   Team       2873 non-null   object 
 8   Car        2873 non-null   object 
 9   Time       2805 non-null   object 
 10  Laps       2851 non-null   float64
 11  Gap        2802 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 269.5+ KB
