In [1]:
# Import dependencies
import pandas as pd
from datetime import datetime, timedelta
import re
import os
import glob

# Lift pandas' display limits so DataFrames are shown with every column and row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Concatenating All DataFrames

In [2]:
# Race names; the position in this list is the numeric race prefix of the CSV
# file names (e.g. files starting with '00_' belong to raceNames[0]).
raceNames = [
    'Bahrain', 'Saudi Arabia', 'Australia', 'Emilia Romagna', 'Miami',
    'Spain', 'Monaco', 'Azerbaijan', 'Canada', 'Great Britain',
    'Austria', 'France', 'Hungary',
]

In [3]:
# master_df holds the race currently being assembled; race_df_list collects finished races
master_df, race_df_list = pd.DataFrame(), []

In [4]:
# Collect every CSV in ./data, ordered by path so each race's tables are
# visited consecutively (file names are '<race>_<table>_df.csv')
csv_pattern = os.path.join('./data', "*.csv")
csv_files = glob.glob(csv_pattern)
csv_files.sort()
csv_files

['./data/00_0_df.csv',
 './data/00_1_df.csv',
 './data/00_2_df.csv',
 './data/00_3_df.csv',
 './data/00_4_df.csv',
 './data/00_5_df.csv',
 './data/00_6_df.csv',
 './data/00_7_df.csv',
 './data/01_0_df.csv',
 './data/01_1_df.csv',
 './data/01_2_df.csv',
 './data/01_3_df.csv',
 './data/01_4_df.csv',
 './data/01_5_df.csv',
 './data/01_6_df.csv',
 './data/01_7_df.csv',
 './data/02_0_df.csv',
 './data/02_1_df.csv',
 './data/02_2_df.csv',
 './data/02_3_df.csv',
 './data/02_4_df.csv',
 './data/02_5_df.csv',
 './data/02_6_df.csv',
 './data/02_7_df.csv',
 './data/03_0_df.csv',
 './data/03_1_df.csv',
 './data/03_2_df.csv',
 './data/03_3_df.csv',
 './data/03_4_df.csv',
 './data/03_5_df.csv',
 './data/03_6_df.csv',
 './data/03_7_df.csv',
 './data/03_8_df.csv',
 './data/04_0_df.csv',
 './data/04_1_df.csv',
 './data/04_2_df.csv',
 './data/04_3_df.csv',
 './data/04_4_df.csv',
 './data/04_5_df.csv',
 './data/04_6_df.csv',
 './data/04_7_df.csv',
 './data/05_0_df.csv',
 './data/05_1_df.csv',
 './data/05

In [5]:
# Build one DataFrame per race by joining that race's CSV tables on the driver
# number ('No'), then collect the finished races in race_df_list.
#
# Every mapping below is {column in the CSV: column name in the race frame}.
# The "Postition" spelling is kept on purpose: later cells use these exact names.

# Races 3 and 10 have sprints, so they ship one extra table in a different order
SPRINT_RACES = {'03', '10'}

# Table 0: race classification; this table seeds the per-race frame
RACE_RESULT = {
    'Pos': 'Final Race Position',
    'No': 'No',
    'Driver': 'Driver Name',
    'Car': 'Car',
    'Laps': 'Race Laps',
    'Time/Retired': 'Race Time',
    'PTS': 'Race Points',
}
FASTEST_LAP = {
    'Lap': 'Fastest Lap No',
    'Time of day': 'Time of day',
    'Time': 'Fastest Lap Time',
    'Avg Speed': 'Avg Speed',
}
PIT_STOPS = {'Stops': 'Stops', 'Total': 'Total Pit Time'}
QUALIFYING = {
    'Pos': 'Qualifying Postition',
    'Q1': 'Q1',
    'Q2': 'Q2',
    'Q3': 'Q3',
    'Laps': 'Qualifying Laps',
}
SPRINT_RESULT = {
    'Pos': 'Final Sprint Postition',
    'Laps': 'Sprint Laps',
    'Time/Retired': 'Sprint Time/Retired',
    'PTS': 'Sprint PTS',
}
SPRINT_GRID = {'Pos': 'Starting Sprint Postition', 'Time': 'Sprint Grid Time'}


def _practice_columns(session):
    """Return the column mapping for practice session number `session`."""
    return {
        'Pos': f'P{session} Postition',
        'Time': f'P{session} Time',
        'Gap': f'P{session} Gap',
        'Laps': f'P{session} Laps',
    }


# Table index -> columns to join, for a normal weekend and for a sprint weekend
STANDARD_TABLES = {
    1: FASTEST_LAP,
    2: PIT_STOPS,
    3: {'Pos': 'Starting Grid Pos', 'Time': 'Starting Grid Quali Time'},
    4: QUALIFYING,
    5: _practice_columns(3),
    6: _practice_columns(2),
    7: _practice_columns(1),
}
SPRINT_TABLES = {
    1: FASTEST_LAP,
    2: PIT_STOPS,
    3: {'Pos': 'Starting Grid Pos'},
    4: SPRINT_RESULT,
    5: SPRINT_GRID,
    6: _practice_columns(2),
    7: QUALIFYING,
    8: _practice_columns(1),
}
PIT_STOP_TABLE = 2

# Start from a clean slate so re-running this cell does not append every race twice
master_df = pd.DataFrame()
race_df_list = []

for fileLocation in csv_files:
    # basename (rather than splitting on '/') also works with Windows separators
    fileName = os.path.basename(fileLocation)
    nameParts = re.fullmatch(r'(\d{2})_(\d+)_df\.csv', fileName)
    if nameParts is None:
        print(f'Skipping unexpected file: {fileLocation}')
        continue
    raceIndex = nameParts.group(1)
    tableIndex = int(nameParts.group(2))

    # read the csv file
    df = pd.read_csv(fileLocation)

    # print the location and filename
    print('Location:', fileLocation)
    print('File Name:', fileName)
    print(f'Race Number Index: {raceIndex}')
    print(f'Table Number Index: {tableIndex}')

    tables = SPRINT_TABLES if raceIndex in SPRINT_RACES else STANDARD_TABLES

    if tableIndex == 0:
        # The race classification starts a fresh frame indexed by driver number
        master_df = (
            df[list(RACE_RESULT)]
            .rename(columns=RACE_RESULT)
            .assign(Race=raceNames[int(raceIndex)])
            .set_index('No')
        )
    elif tableIndex in tables:
        columns = tables[tableIndex]
        df = df.set_index('No')
        if tableIndex == PIT_STOP_TABLE:
            # One row per stop: keep each driver's row with the highest stop
            # number. The sorted frame must be kept for keep='last' to mean that.
            df = (
                df.sort_values(by=['Driver', 'Stops'])
                .drop_duplicates(subset=['Driver'], keep='last')
            )
        # Rename before joining so the new columns never clash with existing ones
        master_df = master_df.join(df[list(columns)].rename(columns=columns))
    else:
        print("SOMETHING WENT WRONG")

    # A race is complete once its last table (7, or 8 for sprint races) is joined
    if tableIndex == max(tables):
        race_df_list.append(master_df.reset_index())
        print(f"{raceNames[int(raceIndex)]}_DF has been added successfully")
        print()
        master_df = pd.DataFrame()
        

Location: ./data/00_0_df.csv
File Name: 00_0_df.csv
Race Number Index: 00
Table Number Index: 0
Location: ./data/00_1_df.csv
File Name: 00_1_df.csv
Race Number Index: 00
Table Number Index: 1
Location: ./data/00_2_df.csv
File Name: 00_2_df.csv
Race Number Index: 00
Table Number Index: 2
Location: ./data/00_3_df.csv
File Name: 00_3_df.csv
Race Number Index: 00
Table Number Index: 3
Location: ./data/00_4_df.csv
File Name: 00_4_df.csv
Race Number Index: 00
Table Number Index: 4
Location: ./data/00_5_df.csv
File Name: 00_5_df.csv
Race Number Index: 00
Table Number Index: 5
Location: ./data/00_6_df.csv
File Name: 00_6_df.csv
Race Number Index: 00
Table Number Index: 6
Location: ./data/00_7_df.csv
File Name: 00_7_df.csv
Race Number Index: 00
Table Number Index: 7
Bahrain_DF has been added successfully

Location: ./data/01_0_df.csv
File Name: 01_0_df.csv
Race Number Index: 01
Table Number Index: 0
Location: ./data/01_1_df.csv
File Name: 01_1_df.csv
Race Number Index: 01
Table Number Index: 1


Location: ./data/10_5_df.csv
File Name: 10_5_df.csv
Race Number Index: 10
Table Number Index: 5
Location: ./data/10_6_df.csv
File Name: 10_6_df.csv
Race Number Index: 10
Table Number Index: 6
Location: ./data/10_7_df.csv
File Name: 10_7_df.csv
Race Number Index: 10
Table Number Index: 7
Location: ./data/10_8_df.csv
File Name: 10_8_df.csv
Race Number Index: 10
Table Number Index: 8
Austria_DF has been added successfully

Location: ./data/11_0_df.csv
File Name: 11_0_df.csv
Race Number Index: 11
Table Number Index: 0
Location: ./data/11_1_df.csv
File Name: 11_1_df.csv
Race Number Index: 11
Table Number Index: 1
Location: ./data/11_2_df.csv
File Name: 11_2_df.csv
Race Number Index: 11
Table Number Index: 2
Location: ./data/11_3_df.csv
File Name: 11_3_df.csv
Race Number Index: 11
Table Number Index: 3
Location: ./data/11_4_df.csv
File Name: 11_4_df.csv
Race Number Index: 11
Table Number Index: 4
Location: ./data/11_5_df.csv
File Name: 11_5_df.csv
Race Number Index: 11
Table Number Index: 5


In [6]:
# Stack the per-race frames row-wise into a single frame with a fresh 0..n-1 index
f1_df = pd.concat(race_df_list, ignore_index=True)

In [7]:
# Preview only: show the table indexed by (Race, No). The result is not assigned,
# so f1_df itself keeps 'Race' and 'No' as ordinary columns.
f1_df.set_index(['Race', 'No'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Final Race Position,Driver Name,Car,Race Laps,Race Time,Race Points,Fastest Lap No,Time of day,Fastest Lap Time,Avg Speed,Stops,Total Pit Time,Starting Grid Pos,Starting Grid Quali Time,Qualifying Postition,Q1,Q2,Q3,Qualifying Laps,P3 Postition,P3 Time,P3 Gap,P3 Laps,P2 Postition,P2 Time,P2 Gap,P2 Laps,P1 Postition,P1 Time,P1 Gap,P1 Laps,Final Sprint Postition,Sprint Laps,Sprint Time/Retired,Sprint PTS,Starting Sprint Postition,Sprint Grid Time
Race,No,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
Bahrain,16,1,Charles Leclerc LEC,Ferrari,57,1:37:33.584,26,51.0,19:31:35,1:34.570,206.018,3.0,1:14.152,1.0,1:30.558,1,1:31.471,1:30.932,1:30.558,15.0,2.0,1:32.640,+0.096s,16.0,2.0,1:32.023,+0.087s,20.0,2.0,1:34.557,+0.364s,22.0,,,,,,
Bahrain,55,2,Carlos Sainz SAI,Ferrari,57,+5.598s,18,52.0,19:33:13,1:35.740,203.501,3.0,1:13.391,3.0,1:30.687,3,1:31.567,1:30.787,1:30.687,15.0,5.0,1:33.053,+0.509s,20.0,3.0,1:32.520,+0.584s,22.0,3.0,1:34.611,+0.418s,23.0,,,,,,
Bahrain,44,3,Lewis Hamilton HAM,Mercedes,57,+9.675s,15,53.0,19:34:51,1:36.228,202.469,3.0,1:16.576,5.0,1:31.238,5,1:32.285,1:31.048,1:31.238,17.0,6.0,1:33.121,+0.577s,15.0,9.0,1:33.144,+1.208s,23.0,7.0,1:34.943,+0.750s,17.0,,,,,,
Bahrain,63,4,George Russell RUS,Mercedes,57,+11.211s,12,56.0,19:39:42,1:36.302,202.313,3.0,1:16.796,9.0,1:32.216,9,1:32.269,1:31.252,1:32.216,17.0,4.0,1:32.935,+0.391s,19.0,4.0,1:32.529,+0.593s,25.0,4.0,1:34.629,+0.436s,23.0,,,,,,
Bahrain,20,5,Kevin Magnussen MAG,Haas Ferrari,57,+14.754s,10,53.0,19:34:54,1:36.623,201.641,3.0,1:17.405,7.0,1:31.808,7,1:31.955,1:31.461,1:31.808,12.0,7.0,1:33.437,+0.893s,15.0,10.0,1:33.183,+1.247s,23.0,19.0,1:36.804,+2.611s,21.0,,,,,,
Bahrain,77,6,Valtteri Bottas BOT,Alfa Romeo Ferrari,57,+16.119s,8,53.0,19:34:55,1:36.599,201.691,3.0,1:16.237,6.0,1:31.560,6,1:31.919,1:31.717,1:31.560,15.0,8.0,1:33.733,+1.189s,21.0,6.0,1:32.951,+1.015s,30.0,20.0,,,2.0,,,,,,
Bahrain,31,7,Esteban Ocon OCO,Alpine Renault,57,+19.423s,6,53.0,19:34:56,1:37.110,200.63,3.0,1:20.110,11.0,1:31.782,11,1:32.041,1:31.782,,12.0,18.0,1:34.957,+2.413s,16.0,12.0,1:33.360,+1.424s,25.0,12.0,1:35.151,+0.958s,15.0,,,,,,
Bahrain,22,8,Yuki Tsunoda TSU,AlphaTauri RBPT,57,+20.386s,4,53.0,19:34:57,1:37.104,200.642,3.0,1:14.633,16.0,1:32.750,16,1:32.750,,,8.0,,,,,14.0,1:33.789,+1.853s,26.0,9.0,1:35.028,+0.835s,20.0,,,,,,
Bahrain,14,9,Fernando Alonso ALO,Alpine Renault,57,+22.390s,2,44.0,19:18:02,1:36.733,201.412,3.0,1:14.782,8.0,1:32.195,8,1:32.346,1:31.621,1:32.195,14.0,16.0,1:34.628,+2.084s,15.0,5.0,1:32.877,+0.941s,24.0,8.0,1:35.000,+0.807s,14.0,,,,,,
Bahrain,24,10,Zhou Guanyu ZHO,Alfa Romeo Ferrari,57,+23.064s,1,39.0,19:09:38,1:36.685,201.512,3.0,1:19.366,15.0,1:33.543,15,1:32.493,1:33.543,,12.0,9.0,1:33.880,+1.336s,18.0,15.0,1:33.953,+2.017s,27.0,11.0,1:35.053,+0.860s,20.0,,,,,,


In [8]:
# Write the combined table next to the notebook, without the positional index
f1_df.to_csv('./f1_df.csv', index=False)

# Data Cleaning

## Filling Null Values

In [9]:
# Clean a separate copy so f1_df keeps the raw concatenated values
master_df = f1_df.copy(deep=True)

In [10]:
# Placeholder written into each column wherever a value is missing
fill_values = {
    'Starting Grid Pos': 99,
    # fastest-lap table
    'Fastest Lap No': 0,
    'Time of day': '99:99:99',
    'Fastest Lap Time': '99:99',
    'Avg Speed': 0,
    # pit stops
    'Stops': 99,
    'Total Pit Time': '99:99',
    # practice sessions
    'P1 Postition': 99,
    'P1 Time': '99:99',
    'P1 Laps': 0,
    'P2 Postition': 99,
    'P2 Time': '99:99',
    'P2 Laps': 0,
    'P3 Postition': 99,
    'P3 Time': '99:99',
    'P3 Laps': 0,
    # qualifying
    'Starting Grid Quali Time': '99:99',
    'Q1': '99:99',
    'Q2': '99:99',
    'Q3': '99:99',
    # sprint
    'Sprint Laps': 0,
    'Sprint Time/Retired': '0',
    'Sprint Grid Time': '00:00',
    'Sprint PTS': 0,
    'Starting Sprint Postition': 0,
    'Final Sprint Postition': 0,
}

for column, placeholder in fill_values.items():
    master_df[column] = master_df[column].fillna(placeholder)

In [11]:
def formatPos(race):
    """Return a numeric copy of a position column, with 'NC' mapped to 99.

    Parameters
    ----------
    race : pandas.Series
        Positions as numbers, numeric strings, or the marker 'NC'.

    Returns
    -------
    pandas.Series
        New Series converted with ``pd.to_numeric``; index and name follow the input.

    Raises
    ------
    ValueError
        If a value other than 'NC' cannot be parsed as a number.

    The input Series is left untouched. Writing into it element by element
    modified the caller's DataFrame column through a slice and triggered
    pandas' SettingWithCopyWarning.
    """
    cleaned = race.map(lambda value: 99 if value == 'NC' else value)
    return pd.to_numeric(cleaned)

In [12]:
# Replace 'NC' with position 99 and make both finishing-position columns numeric
racePos = formatPos(master_df['Final Race Position'])
master_df['Final Race Position'] = racePos

sprintPos = formatPos(master_df['Final Sprint Postition'])
master_df['Final Sprint Postition'] = sprintPos

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Strip Times of '+' and 's'

In [13]:
# For the race time, the three practice gaps and the sprint time: cast to text,
# then drop any leading '+'/'-' characters and any trailing 's' characters
strip_markers = lambda text: text.lstrip('+-').rstrip('s')

for column in ('Race Time', 'P3 Gap', 'P2 Gap', 'P1 Gap', 'Sprint Time/Retired'):
    master_df[column] = master_df[column].astype('str').map(strip_markers)

## Convert All HH:MM:SS and MM:SS Times To Seconds

In [14]:
def raceTimeToSeconds(time, drivers_per_race=20):
    """Convert a race-time column (leader time followed by gaps) to seconds.

    The column is read in consecutive groups of ``drivers_per_race`` rows. The
    first row of each group is treated as the leader and parsed as 'H:MM:SS.sss'
    or 'MM:SS.sss'. The remaining rows of the group are converted relative to it:

    * text containing 'lap' (e.g. '1 lap')  -> leader + laps * leader
      (each lap down is charged one full leader race time)
    * a decimal number such as '5.598'      -> leader + gap
    * 'DNF'                                 -> 999999
    * 'DNS'                                 -> 0
    * anything else                         -> left unchanged

    Parameters
    ----------
    time : pandas.Series
        Times with the leading '+' and trailing 's' already stripped.
    drivers_per_race : int, default 20
        Number of consecutive rows that belong to one race; must be positive.

    Returns
    -------
    pandas.Series
        New object-dtype Series with the same index and name; the input is not modified.

    Raises
    ------
    ValueError
        If a leader time has colon-separated parts that are not numeric.

    When a group's leader time cannot be parsed (for example a placeholder),
    the lap and gap rows of that group are left unchanged instead of being
    added to an undefined leader time or to the previous group's.
    """
    gap_pattern = re.compile(r'^-?\d+(?:\.\d+)$')
    converted = []
    leader_seconds = None

    for position, raw in enumerate(time):
        text = str(raw)  # tolerate non-string cells such as NaN
        value = raw

        if position % drivers_per_race == 0:
            # First row of a race: forget the previous race's leader
            leader_seconds = None
            parts = text.split(':')
            if len(parts) == 3:
                leader_seconds = timedelta(
                    hours=int(parts[0]), minutes=int(parts[1]), seconds=float(parts[2])
                ).total_seconds()
            elif len(parts) == 2:
                leader_seconds = timedelta(
                    minutes=int(parts[0]), seconds=float(parts[1])
                ).total_seconds()
            if leader_seconds is not None:
                value = leader_seconds
        elif 'lap' in text:
            # Read the whole leading number, so '12 lap' counts 12 laps, not 1
            laps = re.match(r'\d+', text)
            if leader_seconds is not None and laps is not None:
                value = leader_seconds + int(laps.group()) * leader_seconds
        elif gap_pattern.match(text) is not None:
            if leader_seconds is not None:
                value = leader_seconds + float(text)
        elif text == "DNF":
            value = 999999
        elif text == "DNS":
            value = 0

        converted.append(value)

    return pd.Series(converted, index=time.index, name=time.name, dtype=object)

In [15]:
# Convert the race and sprint classification times to seconds
race_time, sprint_time = master_df['Race Time'], master_df['Sprint Time/Retired']
new_race_time, new_sprint_time = raceTimeToSeconds(race_time), raceTimeToSeconds(sprint_time)

master_df['Race Time'] = new_race_time
master_df['Sprint Time/Retired'] = new_sprint_time

In [16]:
def timeToSeconds(time):
    """Convert a column of clock-style times to seconds.

    Each value is handled independently:

    * 'DNF'                              -> 999999
    * a plain number (e.g. '94.57', 12)  -> taken as seconds already
    * 'H:MM:SS(.sss)' or 'MM:SS(.sss)'   -> total seconds
    * anything else (including NaN)      -> left unchanged

    Parameters
    ----------
    time : pandas.Series
        Times as strings and/or numbers.

    Returns
    -------
    pandas.Series
        New object-dtype Series with the same index and name; the input is not modified.

    Raises
    ------
    ValueError
        If a colon-separated value has parts that are not numeric.

    Values are compared as text, so numeric or missing cells no longer fail on
    ``.split``. This also lets the function run again on its own output.
    """
    seconds_pattern = re.compile(r'^-?\d+(?:\.\d+)?$')
    converted = []

    for raw in time:
        text = str(raw)
        value = raw

        if text == "DNF":
            value = 999999
        elif seconds_pattern.match(text) is not None:
            # Already in seconds; timedelta keeps the microsecond rounding
            value = timedelta(seconds=float(text)).total_seconds()
        else:
            parts = text.split(':')
            if len(parts) == 3:
                value = timedelta(
                    hours=int(parts[0]), minutes=int(parts[1]), seconds=float(parts[2])
                ).total_seconds()
            elif len(parts) == 2:
                value = timedelta(
                    minutes=int(parts[0]), seconds=float(parts[1])
                ).total_seconds()

        converted.append(value)

    return pd.Series(converted, index=time.index, name=time.name, dtype=object)

In [17]:
# Columns holding clock-style times ('HH:MM:SS' or 'MM:SS') that become seconds.
# One loop replaces eleven copy-pasted stanzas; the per-column temporaries
# (tod_time, new_tod_time, ...) are no longer created.
time_columns = [
    'Time of day',
    'Fastest Lap Time',
    'Total Pit Time',
    'Starting Grid Quali Time',
    'Q1',
    'Q2',
    'Q3',
    'P3 Time',
    'P2 Time',
    'P1 Time',
    'Sprint Grid Time',
]

for column in time_columns:
    master_df[column] = timeToSeconds(master_df[column])

In [18]:
def gapToNumeric(gap):
    """Return a numeric copy of a gap column, with the text 'nan' mapped to 999.

    Parameters
    ----------
    gap : pandas.Series
        Gaps as numbers or numeric strings; missing entries appear as the
        string 'nan' once the column has been cast to text.

    Returns
    -------
    pandas.Series
        New Series converted with ``pd.to_numeric``; index and name follow the input.

    Raises
    ------
    ValueError
        If a value other than 'nan' cannot be parsed as a number.

    The input Series is left untouched rather than being overwritten element
    by element, which altered the caller's DataFrame column through a slice.
    """
    cleaned = gap.map(lambda value: 999 if value == 'nan' else value)
    return pd.to_numeric(cleaned)

In [19]:
# Make the three practice-session gap columns numeric
p3Gap, p2Gap, p1Gap = master_df['P3 Gap'], master_df['P2 Gap'], master_df['P1 Gap']
new_p3Gap, new_p2Gap, new_p1Gap = map(gapToNumeric, (p3Gap, p2Gap, p1Gap))

master_df['P3 Gap'] = new_p3Gap
master_df['P2 Gap'] = new_p2Gap
master_df['P1 Gap'] = new_p1Gap

In [20]:
# Display the cleaned frame as the cell output
master_df

Unnamed: 0,No,Final Race Position,Driver Name,Car,Race Laps,Race Time,Race Points,Race,Fastest Lap No,Time of day,Fastest Lap Time,Avg Speed,Stops,Total Pit Time,Starting Grid Pos,Starting Grid Quali Time,Qualifying Postition,Q1,Q2,Q3,Qualifying Laps,P3 Postition,P3 Time,P3 Gap,P3 Laps,P2 Postition,P2 Time,P2 Gap,P2 Laps,P1 Postition,P1 Time,P1 Gap,P1 Laps,Final Sprint Postition,Sprint Laps,Sprint Time/Retired,Sprint PTS,Starting Sprint Postition,Sprint Grid Time
0,16,1,Charles Leclerc LEC,Ferrari,57,5853.584,26,Bahrain,51.0,70295.0,94.57,206.018,3.0,74.152,1.0,90.558,1,91.471,90.932,90.558,15.0,2.0,92.64,0.096,16.0,2.0,92.023,0.087,20.0,2.0,94.557,0.364,22.0,0,0.0,0,0.0,0.0,0.0
1,55,2,Carlos Sainz SAI,Ferrari,57,5859.182,18,Bahrain,52.0,70393.0,95.74,203.501,3.0,73.391,3.0,90.687,3,91.567,90.787,90.687,15.0,5.0,93.053,0.509,20.0,3.0,92.52,0.584,22.0,3.0,94.611,0.418,23.0,0,0.0,0,0.0,0.0,0.0
2,44,3,Lewis Hamilton HAM,Mercedes,57,5863.259,15,Bahrain,53.0,70491.0,96.228,202.469,3.0,76.576,5.0,91.238,5,92.285,91.048,91.238,17.0,6.0,93.121,0.577,15.0,9.0,93.144,1.208,23.0,7.0,94.943,0.75,17.0,0,0.0,0,0.0,0.0,0.0
3,63,4,George Russell RUS,Mercedes,57,5864.795,12,Bahrain,56.0,70782.0,96.302,202.313,3.0,76.796,9.0,92.216,9,92.269,91.252,92.216,17.0,4.0,92.935,0.391,19.0,4.0,92.529,0.593,25.0,4.0,94.629,0.436,23.0,0,0.0,0,0.0,0.0,0.0
4,20,5,Kevin Magnussen MAG,Haas Ferrari,57,5868.338,10,Bahrain,53.0,70494.0,96.623,201.641,3.0,77.405,7.0,91.808,7,91.955,91.461,91.808,12.0,7.0,93.437,0.893,15.0,10.0,93.183,1.247,23.0,19.0,96.804,2.611,21.0,0,0.0,0,0.0,0.0,0.0
5,77,6,Valtteri Bottas BOT,Alfa Romeo Ferrari,57,5869.703,8,Bahrain,53.0,70495.0,96.599,201.691,3.0,76.237,6.0,91.56,6,91.919,91.717,91.56,15.0,8.0,93.733,1.189,21.0,6.0,92.951,1.015,30.0,20.0,6039.0,999.0,2.0,0,0.0,0,0.0,0.0,0.0
6,31,7,Esteban Ocon OCO,Alpine Renault,57,5873.007,6,Bahrain,53.0,70496.0,97.11,200.63,3.0,80.11,11.0,91.782,11,92.041,91.782,6039.0,12.0,18.0,94.957,2.413,16.0,12.0,93.36,1.424,25.0,12.0,95.151,0.958,15.0,0,0.0,0,0.0,0.0,0.0
7,22,8,Yuki Tsunoda TSU,AlphaTauri RBPT,57,5873.97,4,Bahrain,53.0,70497.0,97.104,200.642,3.0,74.633,16.0,92.75,16,92.75,6039.0,6039.0,8.0,99.0,6039.0,999.0,0.0,14.0,93.789,1.853,26.0,9.0,95.028,0.835,20.0,0,0.0,0,0.0,0.0,0.0
8,14,9,Fernando Alonso ALO,Alpine Renault,57,5875.974,2,Bahrain,44.0,69482.0,96.733,201.412,3.0,74.782,8.0,92.195,8,92.346,91.621,92.195,14.0,16.0,94.628,2.084,15.0,5.0,92.877,0.941,24.0,8.0,95.0,0.807,14.0,0,0.0,0,0.0,0.0,0.0
9,24,10,Zhou Guanyu ZHO,Alfa Romeo Ferrari,57,5876.648,1,Bahrain,39.0,68978.0,96.685,201.512,3.0,79.366,15.0,93.543,15,92.493,93.543,6039.0,12.0,9.0,93.88,1.336,18.0,15.0,93.953,2.017,27.0,11.0,95.053,0.86,20.0,0,0.0,0,0.0,0.0,0.0


In [22]:
# Write the cleaned frame to master_1.csv in the working directory, without the index
master_df.to_csv("master_1.csv", index=False)