In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
from scipy.optimize import differential_evolution

In [15]:
# === Load your data ===
#track_df = pd.read_csv(r"../data_f1db/f1db-races.csv").rename(columns={'id': 'raceId'})
dnfs_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Data Files/DNFs.csv")
track_exp_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/driver-exp-data.csv")
pits_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/pits.csv")
pre_alpc_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/pre-race-alpc-data.csv")
clusters_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/Feature Testing Data Files/clusters test.csv")
weather_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Patrick/race-weather-data.csv")
gaps_df = pd.read_csv(r"/home/winter-storm/f1-data-project/erdos_ds_f1/Merlin preview/gaps and circuit type (cleaned).csv")

# === Merge data sources ===
merged = dnfs_df.merge(gaps_df, on='raceId')

# === Specify features and target ===
#feature_columns = ['avgDriverRaceCount', 'averageCumRacerPoints', 'averageDriverExpYears']
#feature_columns = ['avgDriverPSTime', 'avgDriverLapStop', 'avgDriverStops']
#feature_columns = ['FRALPC', 'freeAvgPosDiff', 'qualifyingALPC', 'qualAvgPosDiff', 'paceALPC', 'pacePosDiff']
#feature_columns = ['fpClusterMean', 'qualsClusterMean', 'paceClusterMean']
#feature_columns = ['temperature_max', 'temperature_min', 'precipitation_mm']
feature_columns = ['Bgap_mean', 'Bgap_median', 'Bgap_std', 'Bgap_var', 'First_ten_gap']
target_column = 'totalDNFs'

# === Drop missing values ===
merged_clean = merged[feature_columns + [target_column]].dropna()
X_raw = merged_clean[feature_columns].values
y_target = merged_clean[target_column].values

# === Standardize features ===
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)

# === Print individual feature correlations ===
print("=== Individual Feature Correlations ===")
for i, col in enumerate(feature_columns):
    corr = np.corrcoef(X[:, i], y_target)[0, 1]
    print(f"{col}: {corr:.4f}")

# === Objective: Negative Pearson correlation ===
def objective(weights):
    weighted_sum = X @ weights
    corr = np.corrcoef(weighted_sum, y_target)[0, 1]
    return -corr  # Maximize correlation

def normalized_objective(weights):
    normed = weights / np.linalg.norm(weights)
    weighted_sum = X @ normed
    corr = np.corrcoef(weighted_sum, y_target)[0, 1]
    return -corr

# === Optimization Settings ===
n_features = X.shape[1]
initial_weights = np.random.randn(n_features)  # randomized start
bounds = [(-2, 2)] * n_features

# === Option 1: Unconstrained BFGS ===
result1 = minimize(objective, initial_weights, method='BFGS')
best_weights1 = result1.x
best_correlation1 = -result1.fun

# === Option 2: Normalize after BFGS ===
best_weights2 = best_weights1 / np.linalg.norm(best_weights1)
best_correlation2 = best_correlation1  # same values, normalized weights

# === Option 3: Differential Evolution with normalization ===
result3 = differential_evolution(normalized_objective, bounds)
best_weights3 = result3.x / np.linalg.norm(result3.x)
best_correlation3 = -result3.fun

# === Print results ===
print("\n=== Optimization Results ===")
print("OPTION 1: Unconstrained BFGS")
print("Best weights:", best_weights1)
print("Max correlation:", best_correlation1)

print("\nOPTION 2: Normalize after BFGS")
print("Best weights (normalized):", best_weights2)
print("Max correlation:", best_correlation2)

print("\nOPTION 3: Differential evolution (normalized)")
print("Best weights (normalized):", best_weights3)
print("Max correlation:", best_correlation3)


=== Individual Feature Correlations ===
Bgap_mean: 0.0422
Bgap_median: 0.1459
Bgap_std: 0.0350
Bgap_var: 0.0382
First_ten_gap: 0.4025

=== Optimization Results ===
OPTION 1: Unconstrained BFGS
Best weights: [  385.18899698  6849.79681993    51.85513741   266.2644622
 21683.28764325]
Max correlation: 0.3654492156429184

OPTION 2: Normalize after BFGS
Best weights (normalized): [0.01693557 0.30116442 0.00227991 0.01170683 0.95334722]
Max correlation: 0.3654492156429184

OPTION 3: Differential evolution (normalized)
Best weights (normalized): [-0.67235815 -0.00190674  0.72911026 -0.05451739  0.11557236]
Max correlation: 0.4425446645634803


In [16]:
# Load your data (same as before)
merged = dnfs_df.merge(gaps_df, on='raceId')

# Define features and target
#feature_columns = ['avgDriverRaceCount', 'averageCumRacerPoints', 'averageDriverExpYears']
#feature_columns = ['avgDriverPSTime', 'avgDriverLapStop', 'avgDriverStops']
#feature_columns = ['FRALPC', 'freeAvgPosDiff', 'qualifyingALPC', 'qualAvgPosDiff', 'paceALPC', 'pacePosDiff']
#feature_columns = ['fpClusterMean', 'qualsClusterMean', 'paceClusterMean']
#feature_columns = ['temperature_max', 'temperature_min', 'precipitation_mm']
feature_columns =  ['Bgap_mean', 'Bgap_median', 'Bgap_std', 'Bgap_var', 'First_ten_gap']
target_column = 'totalDNFs'

# Clean the data
merged_clean = merged[feature_columns + [target_column]].dropna()
X = merged_clean[feature_columns].values
y_target = merged_clean[target_column].values

# Individual feature correlations
print("=== Individual Feature Correlations (Unstandardized) ===")
for i, col in enumerate(feature_columns):
    corr = np.corrcoef(X[:, i], y_target)[0, 1]
    print(f"{col}: {corr:.4f}")

# Define objective
def objective(weights):
    weighted_sum = X @ weights
    corr = np.corrcoef(weighted_sum, y_target)[0, 1]
    return -corr

def normalized_objective(weights):
    normed = weights / np.linalg.norm(weights)
    weighted_sum = X @ normed
    corr = np.corrcoef(weighted_sum, y_target)[0, 1]
    return -corr

# Setup
n_features = X.shape[1]
initial_weights = np.random.randn(n_features)
bounds = [(-2, 2)] * n_features

# Optimization Option 1
result1 = minimize(objective, initial_weights, method='BFGS')
best_weights1 = result1.x
best_correlation1 = -result1.fun

# Option 2: normalize after
best_weights2 = best_weights1 / np.linalg.norm(best_weights1)
best_correlation2 = best_correlation1

# Option 3: Differential Evolution with normalized weights
result3 = differential_evolution(normalized_objective, bounds)
best_weights3 = result3.x / np.linalg.norm(result3.x)
best_correlation3 = -result3.fun

# Print
print("\n=== Optimization Results (Unstandardized) ===")
print("OPTION 1: Unconstrained BFGS")
print("Best weights:", best_weights1)
print("Max correlation:", best_correlation1)

print("\nOPTION 2: Normalize after BFGS")
print("Best weights (normalized):", best_weights2)
print("Max correlation:", best_correlation2)

print("\nOPTION 3: Differential evolution (normalized)")
print("Best weights (normalized):", best_weights3)
print("Max correlation:", best_correlation3)


=== Individual Feature Correlations (Unstandardized) ===
Bgap_mean: 0.0422
Bgap_median: 0.1459
Bgap_std: 0.0350
Bgap_var: 0.0382
First_ten_gap: 0.4025

=== Optimization Results (Unstandardized) ===
OPTION 1: Unconstrained BFGS
Best weights: [ 1.0845022  -0.28131927  0.33337034 -0.28768327 -0.24838806]
Max correlation: -0.03817369518927207

OPTION 2: Normalize after BFGS
Best weights (normalized): [ 0.88229843 -0.22886772  0.27121395 -0.23404517 -0.20207648]
Max correlation: -0.03817369518927207

OPTION 3: Differential evolution (normalized)
Best weights (normalized): [-9.26077228e-01 -1.59401888e-01  2.19014537e-01 -9.81002030e-08
  2.62687339e-01]
Max correlation: 0.4423268569479902
