Every race result since 1995 gives us the labels plus a few key index fields, such as driver, raceId and circuit name. These index fields can then be used to join in other information.

The following information is collected for each driver:
* Previous year's race result on that track (if the circuit was not raced on last year, the value will be -1)
* Qualifying result for that race and driver
* Qualifying timing (in seconds) for that driver in that race
* Practice 1 and Practice 2 result for that race and driver
* Best timing for that driver in Practice Session 1 and 2
* (Optional, if model is not good enough) Telemetry information in qualifying and practice sessions

The following information is collected for each circuit:
* Winning driver's average laptime last year
* Number of turns
* Type of circuit (street or purpose built)
* Track temperature
* Air temperature

In [None]:
# All imports
from ast import literal_eval
import pandas as pd
import numpy as np

In [None]:
def remove_unnamed_column(df):
    """
    Strip the "Unnamed: 0" column from a pandas DataFrame when it is present.

    Parameters:
    - df: pandas.DataFrame
        The frame to clean.

    Returns:
    - pandas.DataFrame
        A new frame without "Unnamed: 0" when that column existed;
        otherwise the very same object that was passed in.
    """
    unwanted = "Unnamed: 0"
    # Guard clause: nothing to drop, so hand the caller's frame straight back.
    if unwanted not in df.columns:
        return df
    return df.drop(columns=[unwanted])

In [None]:
def safe_literal_eval(node):
    """
    Evaluate a Python-literal string, returning None when it cannot be parsed.

    Parameters:
    - node: str or any
        Text such as "{'a': 1}". Non-string values (for example the float
        NaN pandas uses for missing cells) are accepted and yield None.

    Returns:
    - object or None
        The evaluated literal, or None if `literal_eval` rejects the input.
    """
    try:
        return literal_eval(node)
    # literal_eval raises SyntaxError for empty/unparsable text and may raise
    # TypeError for some malformed input, in addition to ValueError.
    except (ValueError, TypeError, SyntaxError):
        return None

In [None]:
def expand_json_colums(df:pd.DataFrame,json_cols:list):
    """
    Expand json k:v pairs to be their own columns in the dataframe.

    Each cell of a json column is parsed with `safe_literal_eval` and the
    resulting keys become columns; the original json columns are dropped.

    Parameters:
    - df: pandas.DataFrame
        The DataFrame that contains the json columns to be expanded.
    - json_cols: list
        The names of the pandas columns that contain the json k:v pairs to be expanded.

    Returns:
    - pandas.DataFrame
        The DataFrame with the json columns expanded and the original dropped.

    """
    other_cols = df.drop(columns=json_cols)
    json_col_expanded = [
        df[col].apply(safe_literal_eval).apply(pd.Series)
        for col in json_cols
    ]

    # pd.concat needs a flat sequence of pandas objects, so the list of
    # expanded frames is unpacked rather than passed as a nested list.
    # NOTE(review): keys shared by several json columns would produce
    # duplicate column names -- confirm whether a per-column prefix is wanted.
    return pd.concat([other_cols, *json_col_expanded], axis=1)

In [None]:
# This dataframe will be used for building a model.
# It is created empty here so that feature columns can be attached to it.
final_data = pd.DataFrame()

In [None]:
# Load the race results table, discarding the "Unnamed: 0" column if the CSV has one
race_results = remove_unnamed_column(
    pd.read_csv("1995_data/Race_Results_1995_2023.csv")
)

# Load the qualification results table the same way
qualification_results = remove_unnamed_column(
    pd.read_csv("1995_data/Qualification_Results_1995_2023.csv")
)

In [None]:
# Display the race results table for inspection
race_results

Unnamed: 0,number,position,positionText,points,Driver,Constructor,grid,laps,status,Time,season,round,FastestLap
0,1,1,1,10.0,"{'driverId': 'michael_schumacher', 'code': 'MS...","{'constructorId': 'benetton', 'url': 'http://e...",2,71,Finished,"{'millis': '5914154', 'time': '1:38:34.154'}",1995,1,
1,6,2,2,6.0,"{'driverId': 'coulthard', 'code': 'COU', 'url'...","{'constructorId': 'williams', 'url': 'http://e...",3,71,Finished,"{'millis': '5925214', 'time': '+11.060'}",1995,1,
2,28,3,3,4.0,"{'driverId': 'berger', 'url': 'http://en.wikip...","{'constructorId': 'ferrari', 'url': 'http://en...",5,70,+1 Lap,,1995,1,
3,8,4,4,3.0,"{'driverId': 'hakkinen', 'url': 'http://en.wik...","{'constructorId': 'mclaren', 'url': 'http://en...",7,70,+1 Lap,,1995,1,
4,27,5,5,2.0,"{'driverId': 'alesi', 'url': 'http://en.wikipe...","{'constructorId': 'ferrari', 'url': 'http://en...",6,70,+1 Lap,,1995,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11430,2,16,16,0.0,"{'driverId': 'sargeant', 'permanentNumber': '2...","{'constructorId': 'williams', 'url': 'http://e...",20,58,Finished,"{'millis': '5310415', 'time': '+1:27.791'}",2023,22,"{'rank': '12', 'lap': '43', 'Time': {'time': '..."
11431,24,17,17,0.0,"{'driverId': 'zhou', 'permanentNumber': '24', ...","{'constructorId': 'alfa', 'url': 'http://en.wi...",19,58,Finished,"{'millis': '5312046', 'time': '+1:29.422'}",2023,22,"{'rank': '13', 'lap': '43', 'Time': {'time': '..."
11432,55,18,18,0.0,"{'driverId': 'sainz', 'permanentNumber': '55',...","{'constructorId': 'ferrari', 'url': 'http://en...",16,57,Retired,,2023,22,"{'rank': '17', 'lap': '42', 'Time': {'time': '..."
11433,77,19,19,0.0,"{'driverId': 'bottas', 'permanentNumber': '77'...","{'constructorId': 'alfa', 'url': 'http://en.wi...",18,57,+1 Lap,,2023,22,"{'rank': '18', 'lap': '42', 'Time': {'time': '..."


In [None]:
# Display the first rows of the qualification results for inspection
qualification_results.head()

Unnamed: 0,season,round,driverId,driver,constructorId,q1,q2,q3
0,1995,1,damon_hill,Hill,williams,1:20.081,,
1,1995,1,michael_schumacher,Schumacher,benetton,1:20.382,,
2,1995,1,coulthard,Coulthard,williams,1:20.422,,
3,1995,1,herbert,Herbert,benetton,1:20.888,,
4,1995,1,berger,Berger,ferrari,1:20.906,,


In [None]:
# Driver reference table, minus the "Unnamed: 0" column if the CSV has one
drivers_info = remove_unnamed_column(
    pd.read_csv("1995_data/drivers_information.csv")
)

# Global driver mapping: first name -> last name -> driver Id
# This might get a bit complicated, we can simplify as we go along
driver_mapping = {}

drivers_info

Unnamed: 0,driverId,url,givenName,familyName,dateOfBirth,nationality,permanentNumber,code
0,abate,http://en.wikipedia.org/wiki/Carlo_Mario_Abate,Carlo,Abate,1932-07-10,Italian,,
1,abecassis,http://en.wikipedia.org/wiki/George_Abecassis,George,Abecassis,1913-03-21,British,,
2,acheson,http://en.wikipedia.org/wiki/Kenny_Acheson,Kenny,Acheson,1957-11-27,British,,
3,adams,http://en.wikipedia.org/wiki/Philippe_Adams,Philippe,Adams,1969-11-19,Belgian,,
4,ader,http://en.wikipedia.org/wiki/Walt_Ader,Walt,Ader,1913-12-15,American,,
...,...,...,...,...,...,...,...,...
853,zapico,http://en.wikipedia.org/wiki/Emilio_Zapico,Emilio,Zapico,1944-05-27,Spanish,,
854,zhou,http://en.wikipedia.org/wiki/Zhou_Guanyu,Guanyu,Zhou,1999-05-30,Chinese,24.0,ZHO
855,zonta,http://en.wikipedia.org/wiki/Ricardo_Zonta,Ricardo,Zonta,1976-03-23,Brazilian,,ZON
856,zorzi,http://en.wikipedia.org/wiki/Renzo_Zorzi,Renzo,Zorzi,1946-12-12,Italian,,


In [None]:
# Create a function to get driverId given name

def get_driver_id(name):
    """
    Build a driverId-style key from a driver's display name.

    Parameters:
    - name: str
        Driver name, e.g. "Lewis Hamilton" or "Zhou".

    Returns:
    - str or None
        "lewis_hamilton" for a two-word name, the lower-cased word for a
        single-word name, and None for any other shape.
    """
    # Lower-case and split on any run of whitespace so stray spaces do not
    # change the word count.
    parts = name.lower().split()
    # If name is in format "Lewis Hamilton", return "lewis_hamilton"
    if len(parts) == 2:
        return "_".join(parts)
    if len(parts) == 1:
        # NOTE(review): this branch had no body; returning the lower-cased
        # word is an assumption -- confirm against the ids in drivers_info.
        return parts[0]
    # Names with no words or more than two words are not handled yet.
    return None


In [None]:
# Race identifiers: season and round together pin down the race and circuit
final_data["Season"] = race_results["season"]
final_data["Round"] = race_results["round"]

# Finishing position, plus a binary label derived from it
# (1 when the position is in the top 10, otherwise 0)
final_data["Position"] = race_results["position"]
final_data["Label"] = (final_data["Position"] <= 10).astype("int64")

# Join qualification performance
# TODO:

In [None]:
# List the column names available in the race results table
race_results.columns

Index(['number', 'position', 'positionText', 'points', 'Driver', 'Constructor',
       'grid', 'laps', 'status', 'Time', 'season', 'round', 'FastestLap'],
      dtype='object')