In [None]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.preprocessing import StandardScaler

In [2]:
# Root of the local project checkout. Every input path below is built from it,
# so the location only has to be changed in one place.
PROJECT_ROOT = "/home/winter-storm/f1-data-project/erdos_ds_f1"

# Race calendar (one row per race; read later for its "id" and "year" columns).
races_data_file = pd.read_csv(f"{PROJECT_ROOT}/data_f1db/f1db-races.csv")

# Per-driver race results.
# NOTE(review): the saved output of this cell echoes this statement, which looks
# like a pandas DtypeWarning about mixed-type columns. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of chunk by chunk.
results_data_file = pd.read_csv(
    f"{PROJECT_ROOT}/data_f1db/f1db-races-race-results.csv",
    low_memory=False,
)

# Ids of the races kept in the cleaned results file. Only the raceId column is
# needed, so the other columns are not parsed.
race_id_list = pd.read_csv(
    f"{PROJECT_ROOT}/cleaned_data/f1db-races-race-results-CLEANED.csv",
    usecols=['raceId'],
)['raceId'].unique()

  results_data_file = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/data_f1db/f1db-races-race-results.csv")


In [3]:
# Count, for every (race, driver) entry, how many earlier entries that driver has

# Year of each race, keyed by the same "raceId" name the results file uses
race_years = races_data_file.loc[:, ["id", "year"]].rename(columns={"id": "raceId"})

# One row per result entry: which driver appeared in which race
race_driver_pairs = results_data_file.loc[:, ["raceId", "driverId"]].copy()

# Attach the year to each entry, then order each driver's entries by year and race id
combined = (
    race_driver_pairs
    .merge(race_years, on="raceId", how="left")
    .sort_values(by=["driverId", "year", "raceId"])
)

# cumcount numbers the rows of each driver 0, 1, 2, ... in the sorted order,
# so the value is the number of that driver's rows that come before this one
combined["racesBefore"] = combined.groupby("driverId").cumcount()

# Keep only the columns the later cells read
racer_racecount = combined.loc[:, ["year", "raceId", "driverId", "racesBefore"]]
racer_racecount

Unnamed: 0,year,raceId,driverId,racesBefore
493,1952,21,adolf-brudes,0
581,1953,24,adolfo-schwelm-cruz,0
10955,1987,437,adrian-campos,0
10975,1987,438,adrian-campos,1
11007,1987,439,adrian-campos,2
...,...,...,...,...
18435,2004,727,zsolt-baumgartner,15
18455,2004,728,zsolt-baumgartner,16
18476,2004,729,zsolt-baumgartner,17
18497,2004,730,zsolt-baumgartner,18


In [4]:
# Generate the average driver race count

def generate_avg_driver_race_count(input_race_id_list, race_counts_df=None):
    """Average the ``racesBefore`` value over the drivers of each requested race.

    Parameters
    ----------
    input_race_id_list : iterable
        Race ids to report, in the order (and with any repeats) they should
        appear in the result.
    race_counts_df : pandas.DataFrame, optional
        Frame with ``raceId`` and ``racesBefore`` columns. Defaults to the
        notebook-level ``racer_racecount`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``raceId`` and ``avgDriverRaceCount``, one row per requested id.
        An id with no rows in ``race_counts_df`` gets ``NaN`` (the previous
        loop raised ``ZeroDivisionError`` there), and an empty id list gives an
        empty frame that still has both columns.
    """
    if race_counts_df is None:
        race_counts_df = racer_racecount

    race_ids = list(input_race_id_list)

    # One grouped pass over the frame instead of one full boolean scan per race id.
    mean_races_before = race_counts_df.groupby('raceId')['racesBefore'].mean()

    # reindex keeps the caller's ordering and yields NaN for ids that are absent.
    return pd.DataFrame({
        'raceId': race_ids,
        'avgDriverRaceCount': mean_races_before.reindex(race_ids).to_numpy(),
    })
# Show the per-race averages for race_id_list as this cell's output; the result is not stored here.
generate_avg_driver_race_count(race_id_list)

Unnamed: 0,raceId,avgDriverRaceCount
0,421,64.280000
1,422,65.280000
2,423,64.346154
3,424,65.346154
4,425,64.640000
...,...,...
700,1121,161.850000
701,1122,153.800000
702,1123,163.800000
703,1124,164.800000


In [5]:
# Career points each driver had already collected before every race they entered


def _points_before_each_race(points):
    """Running total of one driver's points that excludes the current row."""
    # shift() pushes every value down one row, so the running sum at a row only
    # contains earlier rows; the first row has nothing before it and becomes 0.
    return points.shift().cumsum().fillna(0)


# Take the needed result columns, count a missing points value as 0, and order
# each driver's rows by race id
racer_points_df = (
    results_data_file.loc[:, ['driverId', 'raceId', 'year', 'points']]
    .assign(points=lambda results: results['points'].fillna(0))
    .sort_values(by=["driverId", "raceId"])
)

# Per-driver running total of points scored in earlier rows
racer_points_df['cumulativePoints'] = (
    racer_points_df.groupby("driverId")["points"].transform(_points_before_each_race)
)

# NOTE(review): writes "asd.csv" into the current working directory; this looks
# like a leftover debugging export - confirm whether it is still needed.
racer_points_df.to_csv("asd.csv", index=False)

In [6]:
def get_avg_cum_racer_points(input_race_id_list, points_df=None):
    """Average the ``cumulativePoints`` value over the drivers of each requested race.

    Parameters
    ----------
    input_race_id_list : iterable
        Race ids to report, in the order (and with any repeats) they should
        appear in the result.
    points_df : pandas.DataFrame, optional
        Frame with ``raceId`` and ``cumulativePoints`` columns. Defaults to the
        notebook-level ``racer_points_df`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``raceId`` and ``averageCumRacerPoints``, one row per requested
        id. An id with no rows in ``points_df`` gets ``NaN`` (the previous loop
        raised ``ZeroDivisionError`` there), and an empty id list gives an empty
        frame that still has both columns.

    Notes
    -----
    The mean is computed by pandas rather than by ``sum(...) / len(...)``, so
    for non-integer points the last floating-point digits may differ slightly
    from the previous implementation.
    """
    if points_df is None:
        points_df = racer_points_df

    race_ids = list(input_race_id_list)

    # One grouped pass over the frame instead of one full boolean scan per race id.
    mean_points_before = points_df.groupby('raceId')['cumulativePoints'].mean()

    # reindex keeps the caller's ordering and yields NaN for ids that are absent.
    return pd.DataFrame({
        'raceId': race_ids,
        'averageCumRacerPoints': mean_points_before.reindex(race_ids).to_numpy(),
    })
# Show the per-race averages for race_id_list as this cell's output; the result is not stored here.
get_avg_cum_racer_points(race_id_list)

Unnamed: 0,raceId,averageCumRacerPoints
0,421,75.260000
1,422,76.260000
2,423,74.288462
3,424,75.250000
4,425,74.380000
...,...,...
700,1121,985.825000
701,1122,981.575000
702,1123,996.025000
703,1124,1001.125000


In [7]:
# Count, for every (race, driver) entry, how many earlier seasons that driver appears in

# One row per (driver, season) pair found in the results, ordered by driver and
# then by year. cumcount numbers each driver's seasons 0, 1, 2, ..., so the
# value is the number of distinct earlier years in which the driver has a result.
racer_years_df = (
    results_data_file.loc[:, ['driverId', 'year']]
    .drop_duplicates()
    .sort_values(by=["driverId", "year"])
    .assign(cumulativeYears=lambda seasons: seasons.groupby('driverId').cumcount())
)

# Spread the per-season counter back onto the individual race entries of that
# driver and season, then order the rows by race id
full_racer_years_df = (
    racer_years_df
    .merge(
        results_data_file.loc[:, ['raceId', 'driverId', 'year']],
        on=['driverId', 'year'],
        how='left',
    )
    .sort_values(by=['raceId'])
)
full_racer_years_df

Unnamed: 0,driverId,year,cumulativeYears,raceId
13018,johnny-claes,1950,0,1
21204,peter-walker,1950,0,1
26938,yves-giraud-cabantous,1950,0,1
22171,reg-parnell,1950,0,1
1993,birabongse-bhanudej,1950,0,1
...,...,...,...,...
7326,gabriel-bortoleto,2025,0,1133
15398,lewis-hamilton,2025,18,1133
15417,liam-lawson,2025,2,1133
3152,charles-leclerc,2025,7,1133


In [8]:
def get_avg_years_exp(input_race_id_list, years_df=None):
    """Average the ``cumulativeYears`` value over the drivers of each requested race.

    Parameters
    ----------
    input_race_id_list : iterable
        Race ids to report, in the order (and with any repeats) they should
        appear in the result. Earlier versions ignored this argument and always
        iterated the notebook-level ``race_id_list``; that is fixed here.
    years_df : pandas.DataFrame, optional
        Frame with ``raceId`` and ``cumulativeYears`` columns. Defaults to the
        notebook-level ``full_racer_years_df`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``raceId`` and ``averageDriverExpYears``, one row per requested
        id. An id with no rows in ``years_df`` gets ``NaN`` (the previous loop
        raised ``ZeroDivisionError`` there), and an empty id list gives an empty
        frame that still has both columns.
    """
    if years_df is None:
        years_df = full_racer_years_df

    # Use the ids the caller passed in, not the global race_id_list.
    race_ids = list(input_race_id_list)

    # One grouped pass over the frame instead of one full boolean scan per race id.
    mean_years_before = years_df.groupby('raceId')['cumulativeYears'].mean()

    # reindex keeps the caller's ordering and yields NaN for ids that are absent.
    return pd.DataFrame({
        'raceId': race_ids,
        'averageDriverExpYears': mean_years_before.reindex(race_ids).to_numpy(),
    })
# Show the per-race averages for race_id_list as this cell's output; the result is not stored here.
get_avg_years_exp(race_id_list)

Unnamed: 0,raceId,averageDriverExpYears
0,421,5.000000
1,422,5.000000
2,423,4.884615
3,424,4.884615
4,425,4.800000
...,...,...
700,1121,7.400000
701,1122,6.950000
702,1123,7.400000
703,1124,7.400000


In [9]:
# Build the three per-race experience features for the cleaned race ids
avg_race_count_df = generate_avg_driver_race_count(race_id_list)
avg_cum_racer_points_df = get_avg_cum_racer_points(race_id_list)
avg_years_exp_df = get_avg_years_exp(race_id_list)

# Fold the feature tables together left to right with outer joins on raceId,
# so a race present in any of the tables is kept
dataframes = [avg_race_count_df, avg_cum_racer_points_df, avg_years_exp_df]
first_df, *remaining_dfs = dataframes
ractrack_exp_df = first_df
for feature_df in remaining_dfs:
    ractrack_exp_df = ractrack_exp_df.merge(feature_df, on='raceId', how='outer')

# Save the combined feature table without the index column
ractrack_exp_df.to_csv(
    '/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/driver-exp-data.csv',
    index=False,
)

In [12]:
# Weight applied to each standardized feature column.
# NOTE(review): the values are hard-coded and the notebook does not show how
# they were obtained (execution counts jump from 9 to 12) - presumably they
# come from cells that are no longer present; TODO confirm and record the source.
feature_weights = {
    'avgDriverRaceCount': -0.87595933,
    'averageCumRacerPoints': 0.03330413,
    'averageDriverExpYears': 0.48123393,
}
columns = list(feature_weights)
weights = np.array(list(feature_weights.values()))

# Standardize the three feature columns with a scaler fitted on this same table
scaler = StandardScaler()
feature_matrix = ractrack_exp_df[columns]
X_standardized = scaler.fit_transform(feature_matrix)

# Collapse the standardized features into one score per race (row-wise dot product)
ractrack_exp_df['driverExpWeightedSum'] = X_standardized @ weights

# Save the feature table, now including the combined score, without the index column
ractrack_exp_df.to_csv(
    '/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Data Files/driver exp data.csv',
    index=False,
)