In [1]:
# Lectura de archivos y Funciones recurrentes
import os

import numpy as np
import pandas as pd

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

def parse_mmss(x):
    """Convert an ``"M:SS.sss"`` string into an ``"HH:MM:SS.mmm"`` string.

    Note that the result is a formatted *string*, not a ``Timedelta``; the
    timedelta is only used internally to split the value into components.

    Parameters
    ----------
    x : object
        Value to parse. It is converted with ``str()`` and must contain
        exactly one ``':'`` separating integer minutes from (possibly
        fractional) seconds.

    Returns
    -------
    str or pd.NA
        The zero-padded ``HH:MM:SS.mmm`` representation, or ``pd.NA`` when
        ``x`` is missing, does not have exactly two ``':'``-separated parts,
        cannot be converted to numbers, or yields a non-finite / negative /
        out-of-range duration.
    """
    # Missing values propagate as NA
    if pd.isna(x):
        return pd.NA
    parts = str(x).split(':')
    # Anything that is not exactly "<minutes>:<seconds>" is rejected
    if len(parts) != 2:
        return pd.NA
    mins, secs = parts
    try:
        total_secs = int(mins) * 60 + float(secs)
    except ValueError:
        return pd.NA
    # float() accepts "nan"/"inf", and a negative total would be rendered with
    # nonsensical components (e.g. "-1:59:30.000"), so treat both as invalid.
    if not np.isfinite(total_secs) or total_secs < 0:
        return pd.NA
    try:
        td = pd.to_timedelta(total_secs, unit='s')
    except (OverflowError, ValueError):
        # Finite but too large to be represented as a Timedelta
        return pd.NA
    comp = td.components
    # Fold whole days into the hour count so the output has no day field
    hh = comp.days * 24 + comp.hours
    mm = comp.minutes
    ss = comp.seconds
    ms = comp.milliseconds
    # Zero-padded fields, three digits for the milliseconds
    return f"{int(hh):02d}:{int(mm):02d}:{int(ss):02d}.{int(ms):03d}"

def seconds_to_hhmmss(x):
    """Format a duration given in seconds as an ``HH:MM:SS.mmm`` string.

    Parameters
    ----------
    x : object
        Number of seconds, as a number or anything ``float()`` can convert.

    Returns
    -------
    str or pd.NA
        The duration rounded to the nearest millisecond and zero-padded.
        Negative durations are rendered with a leading ``'-'`` followed by
        the formatted absolute value. ``pd.NA`` is returned when ``x`` is
        missing, cannot be converted to ``float``, or is NaN / infinite.
    """
    # Missing values propagate as NA
    if pd.isna(x):
        return pd.NA
    try:
        secs = float(x)
    except (TypeError, ValueError):
        return pd.NA
    # Strings such as "nan" or "inf" convert fine but cannot be rounded to int
    if not np.isfinite(secs):
        return pd.NA
    total_ms = int(round(secs * 1000))
    # Work on the magnitude: floor division on a negative value would borrow
    # from the higher fields and produce e.g. "-1:59:58.500" for -1.5 s.
    sign = '-' if total_ms < 0 else ''
    total_ms = abs(total_ms)
    hh, remainder = divmod(total_ms, 3600 * 1000)
    mm, remainder = divmod(remainder, 60 * 1000)
    ss, ms = divmod(remainder, 1000)
    return f"{sign}{hh:02d}:{mm:02d}:{ss:02d}.{ms:03d}"

In [2]:
# Load every raw CSV into its own DataFrame

# Reference tables
circuits = pd.read_csv('./00_data_raw/circuits.csv')

# Constructors
constructor_results = pd.read_csv('./00_data_raw/constructor_results.csv')
constructor_standings = pd.read_csv('./00_data_raw/constructor_standings.csv')
constructors = pd.read_csv('./00_data_raw/constructors.csv')

# Drivers
driver_standings = pd.read_csv('./00_data_raw/driver_standings.csv')
drivers = pd.read_csv('./00_data_raw/drivers.csv')

# Per-race event data
lap_times = pd.read_csv('./00_data_raw/lap_times.csv')
pit_stops = pd.read_csv('./00_data_raw/pit_stops.csv')
qualifying = pd.read_csv('./00_data_raw/qualifying.csv')
races = pd.read_csv('./00_data_raw/races.csv')
results = pd.read_csv('./00_data_raw/results.csv')
sprint_results = pd.read_csv('./00_data_raw/sprint_results.csv')
status = pd.read_csv('./00_data_raw/status.csv')

# The first column of this file is used as the DataFrame index
nationality = pd.read_csv('./00_data_raw/nationality_ISO.csv', index_col=0)

In [3]:
# Columns to remove, keyed by the name of the DataFrame they belong to
drop_cols = {
    'circuits': ['circuitRef', 'url'],
    'constructor_standings': ['positionText'],
    'constructors': ['constructorRef', 'url'],
    'driver_standings': ['positionText'],
    'drivers': ['code', 'url'],
    'lap_times': ['milliseconds'],
    'pit_stops': ['time', 'milliseconds'],
    'races': [
        # free-practice sessions
        'fp1_date', 'fp1_time',
        'fp2_date', 'fp2_time',
        'fp3_date', 'fp3_time',
        # race start time, qualifying and sprint schedule, link
        'time',
        'quali_time', 'quali_date',
        'sprint_time', 'sprint_date',
        'url',
    ],
    'results': ['positionText', 'time', 'milliseconds'],
    'sprint_results': ['time', 'milliseconds'],
}

In [4]:
# Column formatting

# Text columns: normalise casing, then trim surrounding whitespace
constructors['nationality'] = constructors['nationality'].str.title().str.strip()
status['status'] = status['status'].str.upper().str.strip()

# Date columns: 'YYYY-MM-DD' strings are rewritten as 'DD-MM-YYYY' strings;
# values that do not match the input format are coerced to missing.
# NOTE(review): this step is not idempotent. Running the cell a second time
# feeds the already reformatted strings back in; they no longer match
# '%Y-%m-%d', so every date would be coerced to missing.
for frame, column in ((races, 'date'), (drivers, 'dob')):
    parsed = pd.to_datetime(frame[column], format='%Y-%m-%d', errors='coerce')
    frame[column] = parsed.dt.strftime('%d-%m-%Y')

# String lap times: store the parse_mmss result in a new column next to the source
for frame, source, target in (
    (lap_times, 'time', 'lap_time'),
    (qualifying, 'q1', 'q1_time'),
    (qualifying, 'q2', 'q2_time'),
    (qualifying, 'q3', 'q3_time'),
    (results, 'fastestLapTime', 'fastestLapTime_td'),
    (sprint_results, 'fastestLapTime', 'fastestLapTime_td'),
):
    frame[target] = frame[source].apply(parse_mmss)

# Null handling: drop the configured columns, then turn the '\N' marker into NaN
frames_by_name = {
    'circuits': circuits,
    'constructor_results': constructor_results,
    'constructor_standings': constructor_standings,
    'constructors': constructors,
    'driver_standings': driver_standings,
    'drivers': drivers,
    'lap_times': lap_times,
    'pit_stops': pit_stops,
    'qualifying': qualifying,
    'races': races,
    'results': results,
    'sprint_results': sprint_results,
    'status': status,
}
for name, df in frames_by_name.items():
    if name in drop_cols:
        # errors='ignore' tolerates columns that are already absent
        df.drop(columns=drop_cols[name], inplace=True, errors='ignore')
    df.replace('\\N', np.nan, inplace=True)

In [5]:
# Post-cleaning transformations

# Replace the decimal point with a decimal comma in the circuit coordinates.
# Missing values are passed through untouched; str() would otherwise turn
# them into the literal text 'nan'.
circuits[['lat', 'lng']] = circuits[['lat', 'lng']].map(
    lambda x: x if pd.isna(x) else str(x).replace('.', ',')
)

# Build the driver's full name from forename and surname, then drop both
# source columns. The guard keeps the cell re-runnable: once the columns are
# gone, reading them again would raise a KeyError.
if {'forename', 'surname'}.issubset(drivers.columns):
    drivers['full_name'] = drivers['forename'] + ' ' + drivers['surname']
    drivers.drop(columns=['forename', 'surname'], inplace=True)

In [6]:
# Preview the first four circuits after the transformations
circuits.head(n=4)


Unnamed: 0,circuitId,name,location,country,lat,lng,alt
0,1,Albert Park Grand Prix Circuit,Melbourne,Australia,-378497,144968,10
1,2,Sepang International Circuit,Kuala Lumpur,Malaysia,276083,101738,18
2,3,Bahrain International Circuit,Sakhir,Bahrain,260325,505106,7
3,4,Circuit de Barcelona-Catalunya,Montmeló,Spain,4157,226111,109


In [7]:
# Write the cleaned DataFrames as CSV files (row index omitted)

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('./01_data_EDA', exist_ok=True)

# NOTE(review): constructor_standings, driver_standings and sprint_results are
# loaded and cleaned in earlier cells but are not exported here - confirm that
# this is intentional.
circuits.to_csv('./01_data_EDA/circuits.csv', index=False)
constructor_results.to_csv('./01_data_EDA/constructor_results.csv', index=False)
constructors.to_csv('./01_data_EDA/constructors.csv', index=False)
drivers.to_csv('./01_data_EDA/drivers.csv', index=False)
lap_times.to_csv('./01_data_EDA/lap_times.csv', index=False)
pit_stops.to_csv('./01_data_EDA/pit_stops.csv', index=False)
qualifying.to_csv('./01_data_EDA/qualifying.csv', index=False)
races.to_csv('./01_data_EDA/races.csv', index=False)
results.to_csv('./01_data_EDA/results.csv', index=False)
status.to_csv('./01_data_EDA/status.csv', index=False)
# NOTE(review): nationality was read with index_col=0, so index=False leaves
# that first column out of the exported file - confirm this is intended.
nationality.to_csv('./01_data_EDA/nationality.csv', index=False)