In [None]:
import kagglehub

# Download the latest version of the dataset. The value returned by
# kagglehub is used further down as the directory the CSV files are read from.
F1_DATASET_HANDLE = "rohanrao/formula-1-world-championship-1950-2020"
folder_path = kagglehub.dataset_download(F1_DATASET_HANDLE)

In [None]:
# final project execution starts here
from pathlib import Path

import pandas as pd

# ---------------------------------------------------------------------------
# Load the relevant CSVs from the dataset directory (`folder_path` is set by
# the kagglehub download cell).
# ---------------------------------------------------------------------------
data_dir = Path(folder_path)


def load_table(table_name):
    """Read ``<table_name>.csv`` from the downloaded dataset directory."""
    return pd.read_csv(data_dir / f"{table_name}.csv")


results = load_table("results")
races = load_table("races")
qualifying = load_table("qualifying")
pit_stops = load_table("pit_stops")
constructor_standings = load_table("constructor_standings")
constructors = load_table("constructors")
drivers = load_table("drivers")
status = load_table("status")

# ---------------------------------------------------------------------------
# Constructor standings
# ---------------------------------------------------------------------------
# One standings row per (raceId, constructorId); if the table contains repeats
# the row with the most points is kept. The points column is renamed BEFORE it
# is merged into `results`, because `results` already has its own per-driver
# `points` column and a clash would silently produce `points_x` / `points_y`.
# NOTE(review): the standings for a raceId presumably already include the
# points scored in that same race, which would leak the target into this
# feature -- confirm against the dataset documentation.
constructor_standings_latest = (
    constructor_standings
    .sort_values(by=['raceId', 'points'], ascending=[True, False])
    .drop_duplicates(subset=['raceId', 'constructorId'])
    .merge(constructors[['constructorId', 'name']], on='constructorId', how='left')
    .rename(columns={'name': 'constructor_name',
                     'points': 'constructor_standing_points'})
)

# Attach the constructor name and the standings points to every result row.
results = results.merge(
    constructors[['constructorId', 'name']].rename(columns={'name': 'constructor_name'}),
    on='constructorId', how='left'
)
results = results.merge(
    constructor_standings_latest[['raceId', 'constructorId', 'constructor_standing_points']],
    on=['raceId', 'constructorId'], how='left',
    validate='many_to_one'  # the right side was de-duplicated above
)

# Race details (year/round are needed to order each driver's history).
results = results.merge(
    races[['raceId', 'year', 'round', 'name', 'date', 'circuitId']],
    on='raceId', how='left'
)

# ---------------------------------------------------------------------------
# 1. Pit stop time (total pit stop duration per driver per race)
# ---------------------------------------------------------------------------
# `duration` is read as text; values that cannot be parsed as a plain number
# become NaN and are skipped by the sum below.
pit_stops['duration'] = pd.to_numeric(pit_stops['duration'], errors='coerce')
pit_stop_time = (
    pit_stops.groupby(['raceId', 'driverId'])['duration'].sum()
    .reset_index()
    .rename(columns={'duration': 'total_pit_stop_time'})
)

# 2. Constructor standing -> `constructor_standing_points` (merged above).
# 3. Grid = current skill  -> `grid` column that `results` already carries.

# ---------------------------------------------------------------------------
# 4. Absolute skill = average points per driver BEFORE the current race
# ---------------------------------------------------------------------------
# Sort chronologically per driver so the cumulative statistics make sense.
results = results.sort_values(by=['driverId', 'year', 'round'])
# Subtracting the current row's points leaves only the points from earlier races.
results['cumulative_points'] = results.groupby('driverId')['points'].cumsum() - results['points']
results['races_so_far'] = results.groupby('driverId').cumcount()
# A driver's first race has races_so_far == 0; dividing by 1 instead yields 0.
results['avg_driver_points'] = results['cumulative_points'] / results['races_so_far'].replace(0, 1)

# 5. Weather proxy: added later from an external file.

# ---------------------------------------------------------------------------
# 6. Pit stop rate per circuit (track difficulty proxy)
# ---------------------------------------------------------------------------
# Per race this counts the distinct values of `stop`, then averages that count
# over all races held at the circuit.
# NOTE(review): if `stop` is each driver's running stop number, this is the
# largest stop count reached in a race rather than a per-driver average.
avg_pit_rate = pit_stops.groupby('raceId')['stop'].nunique().reset_index()
avg_pit_rate = avg_pit_rate.merge(races[['raceId', 'circuitId']], on='raceId', how='left')
track_pit_rate = (
    avg_pit_rate.groupby('circuitId')['stop'].mean()
    .reset_index()
    .rename(columns={'stop': 'avg_pit_stops_per_race'})
)

# ---------------------------------------------------------------------------
# 7. DNF rate = share of entries at a circuit that did not finish
# ---------------------------------------------------------------------------
results = results.merge(status, on='statusId', how='left')  # adds 'status' column
# An entry counts as having finished when its status is 'Finished' or has the
# form '+<n> Lap(s)'. Everything else, including a missing status, is a DNF.
# NOTE(review): '+1 Lap', '+2 Laps', ... are presumably lapped but classified
# finishers -- confirm against status.csv.
finished_mask = (
    results['status'].eq('Finished')
    | results['status'].str.match(r'^\+\d+ Laps?$', na=False)
)
dnf_rate = (
    results.assign(dnf=(~finished_mask).astype(int))
    .groupby('circuitId')['dnf'].mean()
    .reset_index()
    .rename(columns={'dnf': 'avg_dnf_rate'})
)
# NOTE(review): items 6 and 7 are averaged over every season in the data, so
# rows from earlier years see statistics computed from later years.

# ---------------------------------------------------------------------------
# Final merge to assemble features
# ---------------------------------------------------------------------------
# `grid` is taken straight from `results`; merging it in a second time would
# only create `grid_x` / `grid_y` and duplicate rows for any repeated
# (raceId, driverId) pair.
features = (
    results
    .merge(pit_stop_time, on=['raceId', 'driverId'], how='left')
    .merge(track_pit_rate, on='circuitId', how='left')
    .merge(dnf_rate, on='circuitId', how='left')
)

# Select final columns and keep only rows where every one of them is present.
final_features = (
    features[[
        'raceId', 'driverId', 'constructorId', 'points',  # `points` is the target
        'total_pit_stop_time',
        'constructor_standing_points',
        'grid',
        'avg_driver_points',
        'avg_pit_stops_per_race',
        'avg_dnf_rate'
    ]]
    .dropna()
    .rename(columns={'points': 'target_points'})
)

# Export the final features to a CSV file
final_features.to_csv("f1_final_features.csv", index=False)

print("✅ Final features exported successfully to 'f1_final_features.csv'")



✅ Final features exported successfully to 'f1_final_features.csv'


In [None]:
# qualifying.columns

In [None]:
# Check the dtype of the pit-stop `duration` column.
duration_dtype = pit_stops['duration'].dtype
print(duration_dtype)


float64


In [None]:
# No year restriction is applied here: `races_filtered` is the full races
# table, so the raceId filter below only removes feature rows whose raceId
# does not appear in `races`.
races_filtered = races
valid_race_ids = races_filtered['raceId'].unique()
race_years = races_filtered[['raceId', 'year']]

final_features = (
    final_features
    # keep rows for known races and renumber the index from 0
    .loc[lambda frame: frame['raceId'].isin(valid_race_ids)]
    .reset_index(drop=True)
    # drop year columns left by an earlier run so the merge adds no suffixes
    .drop(columns=['year', 'year_x', 'year_y'], errors='ignore')
    # attach the season of each race
    .merge(race_years, on='raceId', how='left')
)
print(final_features)

      raceId  driverId  constructorId  target_points  total_pit_stop_time  \
0        841         1              1           18.0               46.426   
1        842         1              1            4.0               93.011   
2        843         1              1           25.0               61.978   
3        844         1              1           12.0               99.637   
4        845         1              1           18.0               81.457   
...      ...       ...            ...            ...                  ...   
5386    1081       854            210            0.0               49.204   
5387    1083       854            210            4.0               59.979   
5388    1084       854            210            8.0               45.000   
5389    1085       854            210            0.0               73.707   
5390    1086       854            210            0.0               45.519   

      constructor_standing_points  grid  avg_driver_points  \
0            

In [None]:
# Per-race weather categories are read once, from a single relative path.
# (In an environment whose working directory is /content this resolves to the
# same file as the former absolute path; elsewhere it no longer hard-fails on
# a Colab-only location.)
WEATHER_CSV = "wiki_with_weather_categorized.csv"

# Value assigned to races that have no weather record.
# NOTE(review): 1.5 presumably sits midway between category codes 1 and 2 --
# confirm against how `weather_category` was encoded.
UNKNOWN_WEATHER_CATEGORY = 1.5

weather_data = (
    pd.read_csv(WEATHER_CSV)[['raceId', 'weather_category']]
    # one row per race, so the left merge below cannot multiply feature rows
    .drop_duplicates(subset='raceId')
)
print(weather_data.columns)

# Drop weather columns left over from a previous run so this cell can be
# re-executed without producing `_x` / `_y` suffixes.
final_features = final_features.drop(columns=[
    'weather_category', 'weather_category_x', 'weather_category_y'
], errors='ignore')

# Merge with final_features and fill races without a weather record.
final_features = final_features.merge(
    weather_data, on='raceId', how='left', validate='many_to_one'
)
final_features['weather_category'] = (
    final_features['weather_category'].fillna(UNKNOWN_WEATHER_CATEGORY)
)

#checks
print(final_features[['raceId', 'weather_category']].head())
print("Missing values after fill:", final_features['weather_category'].isna().sum())




FileNotFoundError: [Errno 2] No such file or directory: '/content/wiki_with_weather_categorized.csv'

In [None]:
# Print the current feature table as plain text for a quick visual check.
print(final_features)

In [None]:
# Write the current state of the feature table to disk, without the index.
FEATURES_CSV = "f1_final_features.csv"
final_features.to_csv(FEATURES_CSV, index=False)

In [None]:
# Preview the first five rows of the feature table.
final_features.head(n=5)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Independent variables fed to the model. Identifier columns (raceId,
# driverId, constructorId) are deliberately not listed: they are arbitrary
# keys, and a linear model would treat them as ordinal quantities.
FEATURE_COLUMNS = [
    'total_pit_stop_time',
    'constructor_standing_points',
    'grid',
    'avg_driver_points',
    'avg_pit_stops_per_race',
    'avg_dnf_rate',
    'weather_category'
]
TARGET_COLUMN = 'target_points'

# Chronological split: train on 2011-2017, test on 2018-2020 (both inclusive).
train_data = final_features[final_features['year'].between(2011, 2017)]
test_data = final_features[final_features['year'].between(2018, 2020)]
print("train rows:", len(train_data), "| test rows:", len(test_data))

# The same column list is used for every design matrix so they always agree.
X_train = train_data[FEATURE_COLUMNS]
y_train = train_data[TARGET_COLUMN]

X_test = test_data[FEATURE_COLUMNS]
y_test = test_data[TARGET_COLUMN]

# Full-sample target and features (all years).
y = final_features[TARGET_COLUMN]
X = final_features[FEATURE_COLUMNS]



In [None]:
# Columns that must not be used as regressors: the target itself, the split
# key, and the identifier columns (arbitrary keys with no numeric meaning).
NON_FEATURE_COLUMNS = ['target_points', 'year', 'raceId', 'driverId', 'constructorId']
FIRST_TRAIN_YEAR = 2011


def evaluate_split(data, split_year, test_end_year, first_train_year=FIRST_TRAIN_YEAR):
    """Fit a linear regression on one chronological split and score it.

    Rows with ``first_train_year <= year <= split_year`` are used for training
    and rows with ``split_year < year <= test_end_year`` for testing.

    Returns a dict with the split labels and the R², MAE and RMSE on the test
    rows, or ``None`` when either side of the split has no rows.
    """
    train = data[data['year'].between(first_train_year, split_year)]
    test = data[(data['year'] > split_year) & (data['year'] <= test_end_year)]
    if train.empty or test.empty:
        return None

    fitted = LinearRegression()
    fitted.fit(train.drop(columns=NON_FEATURE_COLUMNS), train['target_points'])
    predicted = fitted.predict(test.drop(columns=NON_FEATURE_COLUMNS))
    actual = test['target_points']

    return {
        'train_years': f"{first_train_year}–{split_year}",
        'test_years': f"{split_year+1}–{test_end_year}",
        'r2': r2_score(actual, predicted),
        'MAE': mean_absolute_error(actual, predicted),
        'RMSE': np.sqrt(mean_squared_error(actual, predicted))
    }


# Sweep the last training year and the last test year. Scores are collected
# in their own list so the `results` table built earlier is not overwritten,
# and the helper keeps per-split temporaries out of the notebook namespace.
split_scores = []
for split_year in range(2014, 2021):  # training up to this year
    for test_end_year in range(2020, 2023):  # testing ends here
        score = evaluate_split(final_features, split_year, test_end_year)
        if score is not None:
            split_scores.append(score)

results_df = pd.DataFrame(split_scores).sort_values(by='r2', ascending=False)
print(results_df)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=42
# )


In [None]:
# Chosen split: train on 2011..split_year, test on the years after it up to
# test_end_year (inclusive).
split_year = 2017
test_end_year = 2020

# Columns excluded from the regressors: the target, the split key, and the
# identifier columns (arbitrary keys that a linear model would otherwise
# treat as ordinal quantities).
NON_FEATURE_COLUMNS = ['target_points', 'year', 'raceId', 'driverId', 'constructorId']

train_data = final_features[(final_features['year'] >= 2011) & (final_features['year'] <= split_year)]
test_data = final_features[(final_features['year'] > split_year) & (final_features['year'] <= test_end_year)]

X_train = train_data.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_data['target_points']

X_test = test_data.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_data['target_points']

# `model`, `y_pred`, `X_test` and `y_test` are what the cells below read.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Kept in its own list so the `results` table built earlier is not replaced.
split_metrics = [{
    'train_years': f"2011–{split_year}",
    'test_years': f"{split_year+1}–{test_end_year}",
    'r2': r2,
    'MAE': mae,
    'RMSE': rmse
}]

results_df = pd.DataFrame(split_metrics)
print(results_df)

In [None]:
# Parameters of the fitted linear model.
for label, value in (("Coefficients:", model.coef_), ("Intercept:", model.intercept_)):
    print(label, value)

# Scores on the test split, one line per metric.
metric_report = (
    ("Mean Squared Error:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R-squared Score:", r2_score),
)
for label, metric in metric_report:
    print(label, metric(y_test, y_pred))


In [None]:
# `plt` must refer to the pyplot module; binding it to the top-level
# `matplotlib` package makes calls such as plt.figure(...) fail.
import matplotlib.pyplot as plt

# Side-by-side table of the actual and predicted points for the test split.
# Note: this rebinds `results_df`, which until now held the metrics table.
results_df = pd.DataFrame({
    'Actual Points': y_test.values,
    'Predicted Points': y_pred
})
print(results_df)  # Show


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Residuals: positive values mean the model predicted fewer points than were
# actually scored.
residuals = y_test - y_pred

# Histogram of the residuals with a reference line at zero error.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, bins=50, color='skyblue', ax=ax)
ax.axvline(0, color='red', linestyle='--')
ax.set_title("Histogram of Residuals")
ax.set_xlabel("Residual (Actual - Predicted)")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# The two lengths printed here should match: one prediction per test row.
print(len(y_test))
print(len(y_pred))

# Overlay actual and predicted points for every test entry, in row order.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y_test.values, label='Actual Points', color='green', marker='o')
ax.plot(y_pred, label='Predicted Points', color='red', marker='x')

ax.set_xlabel("Entry Index (Race/Driver Sample)")
ax.set_ylabel("Points")
ax.set_title("Actual vs Predicted Points")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Zoom in on the first `numberOfEntries` rows of the test split.
numberOfEntries = 200

plt.figure(figsize=(10, 4))
plt.plot(y_test.values[:numberOfEntries], label='Actual', marker='o')
plt.plot(y_pred[:numberOfEntries], label='Predicted', marker='x')
# The test split spans several seasons and its rows are driver/race entries,
# so the axis is labelled as a sample index rather than "races in 2018".
plt.xlabel("Entry Index (Race/Driver Sample)")
plt.ylabel("Points")
plt.title(f"Actual vs Predicted Points (First {numberOfEntries} Samples)")
plt.legend()
plt.grid(True)
plt.show()

# Show only a few values instead of dumping both full vectors.
print(y_test.head())
print(y_pred[:5])


In [None]:
import matplotlib.pyplot as plt

# Step 1: Look up the season of every test row WITHOUT modifying X_test.
# X_test is the feature matrix the model was evaluated on; adding a column to
# it would change what any later predict()/scenario code sees.
if 'year' in X_test.columns:
    test_years = X_test['year']
elif 'year_x' in final_features.columns:
    test_years = final_features.loc[X_test.index, 'year_x']
elif 'year' in final_features.columns:
    test_years = final_features.loc[X_test.index, 'year']
else:
    raise KeyError("No 'year' column found in final_features")

# Step 2: One figure per season of the test period
for selected_year in range(2018, 2021):
    # Plain boolean array so it indexes the pandas Series (y_test) and the
    # NumPy array (y_pred) the same, positional, way.
    year_mask = (test_years == selected_year).to_numpy()

    # Step 3: Filter predictions and actual values
    y_test_year = y_test[year_mask]
    y_pred_year = y_pred[year_mask]

    # Step 4: Plot
    plt.figure(figsize=(14, 6))
    plt.plot(y_test_year.values, label='Actual Points', color='green', marker='o')
    plt.plot(y_pred_year, label='Predicted Points', color='red', marker='x')

    plt.xlabel("Entry Index (Races in Selected Year)")
    plt.ylabel("Points")
    plt.title(f"Actual vs Predicted Points for Year {selected_year}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Re-import necessary libraries after code environment reset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# This cell reports on the model fitted earlier in the notebook; it reads
# `model`, `X_train`, `X_test`, `y_test`, `y_pred`, `final_features`, `races`.

# 1. Model Performance Summary
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

performance_summary = pd.DataFrame({
    'Metric': ['Mean Squared Error', 'Mean Absolute Error', 'R² Score'],
    'Value': [mse, mae, r2]
})

# 2. Feature Importance
# Pair each coefficient with the training column it belongs to.
# NOTE(review): the features are not standardised, so coefficient magnitudes
# depend on each feature's units and are not directly comparable.
feature_names = list(X_train.columns)
coefficients = model.coef_

feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Impact Direction': ['Positive' if coef > 0 else 'Negative' for coef in coefficients]
}).sort_values(by='Coefficient', key=abs, ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x='Coefficient', y='Feature', data=feature_importance, palette='coolwarm')
plt.title("Feature Importance (Coefficients)")
plt.tight_layout()
plt.show()

# 3. Scenario-Based Analysis
# Take the first test row, shift its grid position, and ask the fitted model
# for the resulting prediction.
baseline = X_test.iloc[0].copy()
scenarios = []

for change in [-3, 0, 3]:
    scenario = baseline.copy()
    scenario['grid'] = max(1, scenario['grid'] + change)
    # Restrict to the training columns, in training order, before predicting.
    scenario_frame = pd.DataFrame([scenario])[feature_names]
    prediction = model.predict(scenario_frame)[0]
    scenarios.append({'Scenario': f'Grid {change:+}', 'Predicted Points': prediction})

scenario_df = pd.DataFrame(scenarios)

# 4. Driver-Specific Insights
X_test_with_preds = X_test.copy()
X_test_with_preds['Actual Points'] = y_test.values
X_test_with_preds['Predicted Points'] = y_pred
X_test_with_preds['driverId'] = final_features.loc[X_test.index, 'driverId']

driver_analysis = X_test_with_preds.groupby('driverId')[['Actual Points', 'Predicted Points']].mean()
# A driver who scored no points has no meaningful percentage error; use NaN
# for those rows instead of dividing by zero.
actual_mean = driver_analysis['Actual Points'].replace(0, np.nan)
driver_analysis['Error (%)'] = (
    (driver_analysis['Predicted Points'] - driver_analysis['Actual Points']) / actual_mean
) * 100
driver_analysis = driver_analysis.reset_index()

# 5. Track Difficulty Analysis
# Use circuitId from final_features when present, otherwise look it up in
# `races` via raceId.
track_columns = ['avg_dnf_rate', 'avg_pit_stops_per_race', 'target_points']
if 'circuitId' in final_features.columns:
    circuit_info = final_features[['circuitId'] + track_columns]
else:
    circuit_info = (
        final_features[['raceId'] + track_columns]
        .merge(races[['raceId', 'circuitId']], on='raceId', how='left')
        .drop(columns='raceId')
    )
track_analysis = circuit_info.groupby('circuitId').mean().reset_index()

# 6. Weather vs Performance
weather_analysis = final_features.groupby('weather_category')['target_points'].mean().reset_index()
weather_analysis.columns = ['Weather Category', 'Avg Points']

plt.figure(figsize=(8, 4))
sns.barplot(data=weather_analysis, x='Weather Category', y='Avg Points', palette='Blues')
plt.title("Average Points by Weather Category")
plt.tight_layout()
plt.show()

# 7. Grid Position vs Points
plt.figure(figsize=(8, 5))
sns.scatterplot(x=final_features['grid'], y=final_features['target_points'], alpha=0.5)
sns.regplot(x=final_features['grid'], y=final_features['target_points'], scatter=False, color='red')
plt.title("Grid Position vs Points")
plt.xlabel("Grid Position")
plt.ylabel("Points")
plt.tight_layout()
plt.show()

# Return all key tables for review
performance_summary, feature_importance, scenario_df, driver_analysis.head(), track_analysis.head(), weather_analysis.head()


In [None]:
from scipy.stats import norm
# Residuals of the test predictions and their spread.
residuals = y_test - y_pred
std_residual = np.std(residuals)

# --- Interval around each prediction ---
# Two-sided normal quantile for the chosen confidence level.
confidence = 0.95
z_score = norm.ppf((1 + confidence) / 2)

# Simplified standard error: residual variance inflated by (1 + 1/n).
n = len(y_test)
se_pred = np.sqrt(np.var(residuals) * (1 + 1/n))

# The same half-width is applied to every prediction.
half_width = z_score * se_pred
lower_bound = y_pred - half_width
upper_bound = y_pred + half_width

# --- Standardised residual for each prediction ---
z_scores = residuals / std_residual

# --- Output DataFrame ---
diagnostic_columns = {
    'Actual': y_test.values,
    'Predicted': y_pred,
    'Residual': residuals,
    'Z-Score': z_scores,
    'Lower CI': lower_bound,
    'Upper CI': upper_bound
}
diagnostics_df = pd.DataFrame(diagnostic_columns)

# Flag rows whose standardised residual exceeds the quantile in absolute value.
diagnostics_df['Outlier (95%)'] = diagnostics_df['Z-Score'].abs().gt(z_score)

print(diagnostics_df.head(10))