In [None]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Use seaborn's dark grid theme for every figure drawn in this notebook.
sns.set_style("darkgrid")


# Directory holding the cleaned CSV exports, relative to the notebook's working directory.
CLEANED_DATA_PATH = "../data/cleaned/"


# os.listdir returns entries in arbitrary order; sort them so the printed inventory is
# identical from one run to the next.
available_files = sorted(os.listdir(CLEANED_DATA_PATH))
print("Available Cleaned Data Files:", available_files)


# CSV files this notebook expects to find inside CLEANED_DATA_PATH.
file_names = [
    "circuits_cleaned.csv", "constructor_results_cleaned.csv",
    "constructor_standings_cleaned.csv", "constructors_cleaned.csv",
    "driver_standings_cleaned.csv", "drivers_cleaned.csv",
    "lap_times_cleaned.csv", "pit_stops_cleaned.csv",
    "qualifying_cleaned.csv", "races_cleaned.csv",
    "results_cleaned.csv", "seasons_cleaned.csv",
    "sprint_results_cleaned.csv", "status_cleaned.csv"
]


# Load every expected file that is present. Each DataFrame is stored under the file name with
# the "_cleaned.csv" suffix removed (e.g. "races_cleaned.csv" -> dataframes["races"]).
# A missing file is reported and skipped rather than aborting the whole load.
dataframes = {}
for file in file_names:
    file_path = os.path.join(CLEANED_DATA_PATH, file)
    if not os.path.exists(file_path):
        print(f" Warning: {file} not found!")
        continue
    df_name = file.replace("_cleaned.csv", "")
    dataframes[df_name] = pd.read_csv(file_path)
    print(f" Loaded {df_name} dataset with shape {dataframes[df_name].shape}")

# Preview the first rows of the main tables. Each frame is rendered separately with the
# notebook's built-in `display` so it appears as a rich table; ending the cell with a tuple of
# frames would print one plain-text tuple repr instead. Tables that failed to load above are
# skipped here so the preview does not raise KeyError on top of the warning already printed.
for preview_name in ("races", "drivers", "results"):
    if preview_name in dataframes:
        display(dataframes[preview_name].head())


In [None]:

# Descriptive statistics for the race results table, using pandas' default `describe()` output.
results_summary = dataframes["results"].describe()
results_summary


In [None]:

# Total count of null cells in each loaded dataset (summed over all of its columns).
missing_values = {
    name: df.isna().sum().sum()
    for name, df in dataframes.items()
}

# Two-column table (dataset name, null count), shown with the datasets that have the most
# missing values first.
missing_values_df = (
    pd.Series(missing_values, name="Missing Values")
    .rename_axis("Dataset")
    .reset_index()
)
missing_values_df.sort_values("Missing Values", ascending=False)


In [None]:

# Race wins per constructor, counted as first-place finishes in the race results table.
#
# The standings table is deliberately not summed here: in the Ergast-style schema these files
# appear to follow, `constructor_standings.wins` is a season-to-date running total written after
# every race, so adding it up across rows counts each win once for every later race of that
# season and inflates the totals (NOTE(review): confirm against the cleaned CSVs).
results_df = dataframes['results']
race_winners = results_df.loc[results_df["positionOrder"] == 1]
constructor_wins = (
    race_winners.groupby("constructorId")
    .size()
    .rename("wins")
    .sort_values(ascending=False)
    .head(10)
)

# Plot the ten totals against their rank (0..9) and label each tick with the constructor id,
# so the x axis follows the ranking rather than the numeric value of the ids.
rank_positions = np.arange(len(constructor_wins))

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=rank_positions, y=constructor_wins.values, marker="o", color="blue", ax=ax)
ax.set_xticks(rank_positions)
ax.set_xticklabels(constructor_wins.index, rotation=45)
ax.set_title("Top 10 Constructors by Total Wins")
ax.set_xlabel("Constructor ID")
ax.set_ylabel("Total Wins")
plt.show()


In [None]:
# Race wins per driver, counted as first-place finishes in the race results table.
#
# The standings table is deliberately not summed here: in the Ergast-style schema these files
# appear to follow, `driver_standings.wins` is a season-to-date running total written after
# every race, so adding it up across rows counts each win once for every later race of that
# season and inflates the totals (NOTE(review): confirm against the cleaned CSVs).
results_df = dataframes['results']
race_winners = results_df.loc[results_df["positionOrder"] == 1]
top_drivers = (
    race_winners.groupby("driverId")
    .size()
    .rename("wins")
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(10, 5))
# `order` pins the bars to the ranking computed above. Without it seaborn sorts numeric
# category levels ascending, which would lay the bars out by driverId instead of by wins.
sns.barplot(
    x=top_drivers.index,
    y=top_drivers.values,
    order=top_drivers.index.tolist(),
    palette="magma",
)
plt.title("Top 10 Drivers by Wins")
plt.xlabel("Driver ID")
plt.ylabel("Total Wins")
plt.show()


In [None]:

# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(8, 6))
# heatmap_data = dataframes['results'][['grid', 'positionOrder']].corr()
# sns.heatmap(heatmap_data, annot=True, cmap="coolwarm", fmt=".2f")
# plt.title("Correlation Between Starting Grid & Final Position")
# plt.show()


In [None]:


# dataframes['lap_times']['milliseconds'] = pd.to_numeric(dataframes['lap_times']['milliseconds'], errors='coerce')

# sampled_lap_times = dataframes['lap_times'].sample(500)

# plt.figure(figsize=(12, 6))
# sns.swarmplot(x=sampled_lap_times["driverId"], y=sampled_lap_times["milliseconds"], palette="viridis", alpha=0.5)
# plt.xticks(rotation=90)
# plt.title("Lap Time Distribution by Driver")
# plt.xlabel("Driver ID")
# plt.ylabel("Lap Time (ms)")
# plt.show()


Pit Stop Strategy Impact on Race Results

In [None]:

# NOTE(review): disabled cell, kept for reference. As written, `x` is read from the pit_stops
# table and `y` from the results table, and nothing in this cell matches the two up by race or
# driver. Presumably the tables need to be joined (and stops counted per driver per race) before
# this plot is revived — TODO confirm the join keys.
# plt.figure(figsize=(12, 6))
# sns.boxplot(x=dataframes['pit_stops']['stop'], y=dataframes['results']['positionOrder'], palette="coolwarm")
# plt.title("Effect of Pit Stops on Finishing Position")
# plt.xlabel("Number of Pit Stops")
# plt.ylabel("Finishing Position")
# plt.show()


In [None]:

# top_drivers = dataframes['driver_standings'].groupby("driverId")["points"].sum().sort_values(ascending=False).head(10)

# plt.figure(figsize=(10, 5))
# sns.barplot(x=top_drivers.index, y=top_drivers.values, palette="coolwarm")
# plt.title("Top 10 Drivers by Total Points")
# plt.xlabel("Driver ID")
# plt.ylabel("Total Points Earned")
# plt.show()


In [None]:

# plt.figure(figsize=(12, 6))
# wins_by_constructor = dataframes['constructor_standings'].groupby("constructorId")["wins"].sum().sort_values(ascending=False)
# sns.barplot(x=wins_by_constructor.index, y=wins_by_constructor.values, palette="Blues_r")
# plt.title("Total Race Wins by Constructor")
# plt.xlabel("Constructor ID")
# plt.ylabel("Wins")
# plt.xticks(rotation=90)
# plt.show()


In [None]:

# plt.figure(figsize=(8, 6))
# sns.scatterplot(x=dataframes['results']['grid'], y=dataframes['results']['positionOrder'], alpha=0.5)
# plt.title("Finishing Position vs. Starting Grid Position")
# plt.xlabel("Starting Grid Position")
# plt.ylabel("Finishing Position")
# plt.show()


In [None]:



# dataframes['lap_times']['milliseconds'] = pd.to_numeric(dataframes['lap_times']['milliseconds'], errors='coerce')


# avg_lap_times = dataframes['lap_times'].groupby("driverId")["milliseconds"].mean().sort_values().head(10)

# plt.figure(figsize=(12, 6))
# sns.barplot(x=avg_lap_times.index, y=avg_lap_times.values, palette="coolwarm")
# plt.title("Top 10 Fastest Drivers by Average Lap Time")
# plt.xlabel("Driver ID")
# plt.ylabel("Average Lap Time (ms)")
# plt.show()


In [None]:

# pit_stops_per_driver = dataframes['pit_stops'].groupby("driverId")["stop"].count().sort_values(ascending=False).head(10)

# plt.figure(figsize=(12, 6))
# sns.barplot(x=pit_stops_per_driver.index, y=pit_stops_per_driver.values, palette="coolwarm")
# plt.title("Top 10 Drivers with Most Pit Stops")
# plt.xlabel("Driver ID")
# plt.ylabel("Total Pit Stops")
# plt.xticks(rotation=45)
# plt.show()
