In [1]:
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# Input locations. Every file below lives under PROJECT_ROOT, so this is the
# only value that has to change when the project is checked out elsewhere.
PROJECT_ROOT = "/home/winter-storm/f1-data-project/erdos_ds_f1"
CLEANED_DATA_DIR = f"{PROJECT_ROOT}/cleaned_data"
RAW_DATA_DIR = f"{PROJECT_ROOT}/data_f1db"

# read the files
no_DNF_results_data_file = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-race-results-NO-DNF.csv")
quals_data_file = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-qualifying-results-CLEANED.csv")
# Smallest of the overall and per-segment qualifying times for each row.
# min(axis=1) skips missing values, so min_time is NaN only when all four
# columns are missing (no sentinel fill is applied to this frame).
quals_data_file['min_time'] = quals_data_file[['timeMillis', 'q1Millis', 'q2Millis', 'q3Millis']].min(axis=1)
free_practice_1_data = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-free-practice-1-results-CLEANED.csv")
free_practice_2_data = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-free-practice-2-results-CLEANED.csv")
free_practice_3_data = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-free-practice-3-results-CLEANED.csv")
free_practice_4_data = pd.read_csv(f"{CLEANED_DATA_DIR}/f1db-races-free-practice-4-results-CLEANED.csv")
start_position_data_file = pd.read_csv(f"{RAW_DATA_DIR}/f1db-races-starting-grid-positions.csv")
# Keep only starting-grid rows whose year is strictly between 1985 and 2025.
start_position_data_file = start_position_data_file[
    (start_position_data_file['year'] > 1985) & (start_position_data_file['year'] < 2025)
    ]

# Create list of unique race ids
race_id_list = no_DNF_results_data_file['raceId'].unique()

In [3]:
# Fastest free-practice time per (raceId, driverId) across FP1-FP3.
practice_time_columns = ['fp1Time', 'fp2Time', 'fp3Time']
practice_frames = [
    session[['raceId', 'driverId', 'timeMillis']].rename(columns={'timeMillis': column})
    for column, session in zip(
        practice_time_columns,
        (free_practice_1_data, free_practice_2_data, free_practice_3_data),
    )
]

# FP1 is the left-most table and every join is a left join, so every FP1 row
# is kept and a (raceId, driverId) pair that is missing from FP1 never appears.
# NOTE(review): confirm that dropping drivers who skipped FP1 is intended.
fpdf = reduce(
    lambda left, right: left.merge(right, on=['raceId', 'driverId'], how='left'),
    practice_frames,
)

# Rows with no practice time at all get the sentinel value 1000000.
fpdf['min_time'] = fpdf[practice_time_columns].min(axis=1).fillna(1000000)

In [4]:
# Fastest time per (raceId, driverId) across FP1-FP3 and every qualifying column.
pace_practice_columns = ['fp1Time', 'fp2Time', 'fp3Time']
pace_frames = [
    session[['raceId', 'driverId', 'timeMillis']].rename(columns={'timeMillis': column})
    for column, session in zip(
        pace_practice_columns,
        (free_practice_1_data, free_practice_2_data, free_practice_3_data),
    )
]
# Qualifying contributes its overall time (renamed to qMillis so it does not
# clash with the practice columns) plus the three per-segment times.
pace_frames.append(
    quals_data_file[['raceId', 'driverId', 'timeMillis', 'q1Millis', 'q2Millis', 'q3Millis']]
    .rename(columns={'timeMillis': 'qMillis'})
)

# FP1 is the left-most table and every join is a left join, so the result has
# one row for each FP1 row (more only if a right-hand table repeats a key).
pacedf = reduce(
    lambda left, right: left.merge(right, on=['raceId', 'driverId'], how='left'),
    pace_frames,
)

pace_time_columns = pace_practice_columns + ['qMillis', 'q1Millis', 'q2Millis', 'q3Millis']
# Rows with no time in any session get the sentinel value 1000000.
pacedf['min_time'] = pacedf[pace_time_columns].min(axis=1).fillna(1000000)

In [5]:
def new_time_open_ball(time, time_list, time_gap):
    """Return the open ball of radius ``time_gap`` centred on ``time``.

    Parameters
    ----------
    time : number
        Base point of the ball. It is always part of the result, even when
        ``time_gap`` is zero or negative.
    time_list : iterable of numbers
        Candidate times to test against the base point.
    time_gap : number
        Radius of the ball. The comparison is strict, so a candidate exactly
        ``time_gap`` away from ``time`` is left out.

    Returns
    -------
    set
        ``time`` together with every candidate whose absolute difference
        from ``time`` is smaller than ``time_gap``.
    """
    return {time}.union(
        candidate for candidate in time_list if abs(time - candidate) < time_gap
    )

def get_clusters(set_of_times):
    """Merge overlapping sets of times into disjoint clusters.

    Two input sets end up in the same cluster when they are connected by a
    chain of pairwise-overlapping input sets. Each cluster is grown until no
    remaining input set overlaps it, so the result does not depend on the
    order of the input. A single merge pass is not enough: a set listed
    before the one that bridges it to the growing cluster would be missed and
    its elements would disappear from the output.

    Parameters
    ----------
    set_of_times : iterable of set
        Sets to merge. The input sets are not modified.

    Returns
    -------
    list of set
        Pairwise-disjoint clusters, ordered by the position of the input set
        that seeded each of them. Their union equals the union of the input.
    """
    candidates = [set(times) for times in set_of_times]
    time_clusters = []
    covered = set()  # union of every cluster built so far

    for seed in candidates:
        # Anything overlapping an existing cluster was already absorbed by it.
        if seed & covered:
            continue

        cluster = set(seed)
        grew = True
        while grew:
            # Repeat the sweep until a whole pass adds nothing new.
            grew = False
            for other in candidates:
                if cluster & other and not other <= cluster:
                    cluster |= other
                    grew = True

        time_clusters.append(cluster)
        covered |= cluster

    return time_clusters

def sqmean_drivers_per_cluster(clusters_of_time, num_of_drivers):
    """Return the normalised sum of squared cluster sizes.

    The value is ``(sum(len(c)**2) - n) / (n**2 - n)`` with
    ``n = num_of_drivers``. When the cluster sizes add up to ``n`` this is the
    share of ordered pairs of distinct drivers that sit in the same cluster:
    0 when every driver is alone, 1 when all drivers share one cluster.

    Parameters
    ----------
    clusters_of_time : iterable of sized collections
        One collection per cluster; only its length is used.
    num_of_drivers : int
        Number of drivers used for the normalisation.

    Returns
    -------
    int or float
        The integer 0 when ``num_of_drivers`` is 1, otherwise the ratio above.

    Raises
    ------
    ZeroDivisionError
        If ``num_of_drivers`` is 0.
    """
    # A single driver has no pairs, so the ratio would be 0 / 0.
    if num_of_drivers == 1:
        return 0

    sum_of_squares = sum(len(cluster) ** 2 for cluster in clusters_of_time)
    same_cluster_pairs = sum_of_squares - num_of_drivers
    all_pairs = num_of_drivers ** 2 - num_of_drivers

    return same_cluster_pairs / all_pairs

def get_cluster_sqmean(input_race_id_list, position_df, gap):
    """Compute the time-cluster statistic for every requested race.

    For each race the ``min_time`` values are grouped into clusters of times
    linked by differences smaller than ``gap``; the clusters are then scored
    with ``sqmean_drivers_per_cluster``.

    Clusters are built from sets of time values, so rows sharing exactly the
    same ``min_time`` (tied times, or a shared fill value) are a single
    element of the set. Each cluster is therefore expanded back to one entry
    per row before scoring; otherwise tied rows would be counted once while
    the normalisation still uses the full row count.

    Parameters
    ----------
    input_race_id_list : iterable
        Race ids to evaluate, compared against ``position_df['raceId']``.
    position_df : pandas.DataFrame
        Must contain the columns ``raceId`` and ``min_time``.
    gap : number
        Cluster radius, in the same units as ``min_time``.

    Returns
    -------
    pandas.DataFrame
        One row per requested race with columns ``raceId`` and
        ``cluster_mean``. ``cluster_mean`` is NaN for races that have no rows
        in ``position_df``.
    """
    cluster_sqmean = []

    for race_id in input_race_id_list:
        time_data = position_df.loc[position_df['raceId'] == race_id, 'min_time'].to_list()
        if not time_data:
            cluster_sqmean.append({'raceId': race_id, 'cluster_mean': np.nan})
            continue

        time_sets = [new_time_open_ball(time, time_data, gap) for time in time_data]
        race_time_clusters = get_clusters(time_sets)

        # One list entry per row, so equal times are counted once per driver.
        driver_clusters = [
            [time for time in time_data if time in cluster]
            for cluster in race_time_clusters
        ]

        cluster_sqmean.append({
            'raceId': race_id,
            'cluster_mean': sqmean_drivers_per_cluster(driver_clusters, len(time_data)),
        })

    return pd.DataFrame(cluster_sqmean)

In [7]:
# One cluster feature per timing source: (source frame, cluster gap, output column).
# The gap is expressed in the units of each frame's min_time column.
cluster_specs = [
    (fpdf, 220, 'fpClusterMean'),
    (quals_data_file, 140, 'qualsClusterMean'),
    (pacedf, 380, 'paceClusterMean'),
]

dataframes = [
    get_cluster_sqmean(race_id_list, source, gap).rename(columns={'cluster_mean': feature})
    for source, gap, feature in cluster_specs
]
free_cluster, quals_cluster, pace_cluster = dataframes

# Outer joins keep every raceId that appears in any of the three tables.
clusters_df = reduce(
    lambda left, right: pd.merge(left, right, on='raceId', how='outer'),
    dataframes,
)
clusters_df.to_csv(
    '/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/clusters test.csv',
    index=False,
)

In [8]:
# Collapse the three cluster features into one weighted sum per race.
# NOTE(review): the weights are hard-coded; presumably they were fitted
# elsewhere and need re-fitting whenever the cluster features change.
columns = ['fpClusterMean', 'qualsClusterMean', 'paceClusterMean']
weights = np.array([-2.41387716, -0.90292397, -0.22131765])

# Row-wise dot product; a race with a missing feature gets a NaN sum.
feature_matrix = clusters_df[columns].to_numpy()
clusters_df['clusterMeanWeightedSum'] = np.dot(feature_matrix, weights)
clusters_df.to_csv(
    '/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Data Files/time cluster data.csv',
    index=False,
)