In [None]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

In [None]:
# load dataset
# The path is relative to the notebook's working directory. Forward slashes are used
# because they are accepted on Windows, macOS and Linux alike, whereas a backslash is
# only a path separator on Windows.
f1Stats = pd.read_csv('../Dataset/F1Stats.csv')
#convert the fastest lap time to seconds
def convert_to_seconds(time_str):
    """Convert a lap time to seconds.

    Accepts ``"M:SS.sss"`` strings, plain second strings such as ``"83.5"``
    and values that are already numeric (for example when pandas parsed the
    CSV column as floats).

    Parameters
    ----------
    time_str : str or number
        The raw lap time. Surrounding whitespace is ignored.

    Returns
    -------
    float or None
        The lap time in seconds, or ``None`` when the value is missing or is
        not in one of the supported formats.
    """
    try:
        if pd.isna(time_str):
            return None
        # str() lets already-numeric cells through; calling .strip() on a float
        # used to raise and silently turn the value into None.
        parts = str(time_str).strip().split(':')
        if len(parts) == 2:
            return int(parts[0]) * 60 + float(parts[1])
        if len(parts) == 1:
            return float(parts[0])  # Already in seconds
    except (TypeError, ValueError):
        # Unparseable text, or a non-scalar value that pd.isna cannot reduce to one bool
        return None
    # More than one ':' (e.g. H:MM:SS) is not a supported lap-time format
    return None

# Replace the raw fastest-lap values with their equivalent in seconds
f1Stats['fastest_lap_time_sec'] = f1Stats['fastest_lap_time_sec'].apply(convert_to_seconds)

# columns treated as numerical features by the plotting cells
numerical = [
    'grid_position',
    'final_position',
    'fastest_lap_time_sec',
    'avg_speed',
    'pitstop_time_sec',
    'total_laps',
]
# force each of them to a numeric dtype: trim whitespace from the text form, then
# parse it, turning anything that cannot be parsed into NaN
for col in numerical:
    f1Stats[col] = pd.to_numeric(f1Stats[col].astype(str).str.strip(), errors='coerce')



# Exploratory Data Analysis


In [None]:
# basic info
print("Columns:", f1Stats.columns)  # displays the column names
print("Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would add a stray "None" line after the report.
f1Stats.info()
print("Summary Statistics (Numerical):")
print(f1Stats.describe().T)  # displays stats for numeric columns, one row per column
print("Missing Values:", f1Stats.isnull().sum().sum())  # total number of empty cells in the frame

In [None]:
# numerical features: one histogram per column, laid out on a grid
n_cols = 6
n_rows_num = -(-len(numerical) // n_cols)  # ceiling division: rows needed for n_cols per row
fig, axes = plt.subplots(n_rows_num, n_cols, figsize=(15, 5 * n_rows_num))
axes = axes.flatten()

# draw a 20-bin histogram of every numerical feature on its own axes
for ax, feature in zip(axes, numerical):
    sns.histplot(f1Stats[feature], bins=20, ax=ax)
    ax.set(title=f'Histogram of {feature}', xlabel=feature, ylabel='Count')

# drop the grid cells that did not receive a plot
for unused_ax in axes[len(numerical):]:
    fig.delaxes(unused_ax)
plt.tight_layout()
# plt.savefig('numerical_histograms.png', dpi=300, bbox_inches='tight') # saves pic used for report
plt.show()
plt.close()

In [None]:
# Grid size: two boxplots per row, as many rows as the feature list needs
n = len(numerical)
cols = 2
rows = math.ceil(n / cols)

fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
axes = axes.flatten()

# one vertical boxplot per numerical feature
for ax, feature in zip(axes, numerical):
    sns.boxplot(y=f1Stats[feature], ax=ax, color='skyblue')
    ax.set(title=f'Boxplot of {feature}', xlabel='', ylabel=feature)

# discard the axes left over when the grid has more cells than features
for unused_ax in axes[len(numerical):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# plt.savefig('numerical_boxplots.png', dpi=300, bbox_inches='tight') # saves pic used for report
plt.show()
plt.close()

In [None]:
# correlation heatmap over every numeric column currently in f1Stats
corr = f1Stats.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
# vmin/vmax pin the diverging palette to the full correlation range so the neutral
# colour always means "no correlation", whatever values happen to be present.
sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title('Correlation Between Numeric Columns')
plt.tight_layout()
plt.show()

## Focus on drivers and their fastest laps

In [None]:
# Fastest-lap spread per circuit; the x tick labels are rotated 90 degrees
ax = sns.boxplot(data=f1Stats, x='circuit', y='fastest_lap_time_sec')
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Lap Time Distribution by Circuit')
plt.show()

In [None]:
# 30-bin histogram of all fastest lap times with a KDE curve overlaid
ax = sns.histplot(data=f1Stats, x='fastest_lap_time_sec', bins=30, kde=True)
ax.set(title='Distribution of Fastest Lap Times', xlabel='Fastest Lap Time (sec)')
plt.show()

In [None]:

# Mean fastest lap time for every (driver, circuit) pair: drivers as rows, circuits as columns
pivot = pd.pivot_table(
    f1Stats,
    values='fastest_lap_time_sec',
    index='driver',
    columns='circuit',
    aggfunc='mean',
)
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(pivot, annot=True, fmt=".1f", cmap='coolwarm', ax=ax)
ax.set_title('Average Fastest Lap Time by Driver and Circuit')
plt.show()

In [None]:
# Average speed against fastest lap time, one colour per driver
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=f1Stats,
    x='avg_speed',
    y='fastest_lap_time_sec',
    hue='driver',
    s=40,
    alpha=0.7,
    legend='brief',
    ax=ax,
)
ax.set(
    title='Avg Speed vs Fastest Lap Time',
    xlabel='Average Speed (km/h)',
    ylabel='Fastest Lap Time (sec)',
)
ax.grid(True)
# anchor the legend just outside the right edge of the axes
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Convert year to string to ensure categorical coloring: the plotly scatter cells pass
# color='year' / symbol='year' together with a discrete color map keyed by str(year).
# This overwrites the 'year' column of f1Stats in place, so every later cell sees strings.
# NOTE(review): astype(str) presumably turns missing years into the text 'nan' — confirm
# the column has no gaps.
f1Stats['year'] = f1Stats['year'].astype(str)

In [None]:


# Palette cycled over the years present in the data (readable on light and dark
# backgrounds). The colours are NOT tied to specific years: the mapping below assigns
# them in sorted-year order.
custom_colors = [
    '#1f77b4',  # Blue
    '#ff7f0e',  # Orange
    '#2ca02c',  # Green
    '#d62728',  # Red
    '#9467bd',  # Purple
    '#e377c2',  # Pink
]  # Add more colors if the data has more unique years than entries here

# Build ONE year -> colour mapping from the whole dataset, in sorted order, so a given
# year keeps the same colour in every driver's figure. Building it per driver from the
# order years happen to appear in made the same year change colour between figures.
unique_years = sorted(f1Stats['year'].unique())
color_map = {str(year): custom_colors[i % len(custom_colors)] for i, year in enumerate(unique_years)}

# Get unique drivers; a missing driver name can never match the equality filter below,
# so it is dropped instead of producing an empty figure.
drivers = f1Stats['driver'].dropna().unique()

# Create a separate scatter plot for each driver
for driver in drivers:
    # Filter data for the current driver
    driver_df = f1Stats[f1Stats['driver'] == driver]

    # Create scatter plot with discrete colors
    fig = px.scatter(
        driver_df,
        x='circuit',
        y='fastest_lap_time_sec',
        color='year',  # Differentiate years with discrete colors
        symbol='year',  # Use different symbols for years
        hover_data=['year', 'race', 'country'],  # Show additional info on hover
        title=f'Fastest Lap Times for {driver}',
        labels={
            'fastest_lap_time_sec': 'Fastest Lap Time (seconds)',
            'circuit': 'Circuit',
            'year': 'Year'
        },
        color_discrete_map=color_map  # Shared mapping: same year, same colour everywhere
    )

    # Update layout for better readability and legend visibility
    fig.update_layout(
        xaxis_title="Circuit",
        yaxis_title="Fastest Lap Time (seconds)",
        showlegend=True,
        height=600,
        xaxis={'tickangle': 45},  # Rotate x-axis labels for readability
        plot_bgcolor='#ffffff',  # White plotting area
        paper_bgcolor='#ffffff',  # White area around the plot
        font=dict(color='#000000'),  # Black text for readability
        legend=dict(
            title_font_color="#000000",
            font=dict(color="#000000", size=12),
            bgcolor='rgba(255, 255, 255, 0.8)',  # Semi-transparent white background for legend
            bordercolor="#000000",
            borderwidth=1
        )
    )

    # Adjust y-axis to have fastest times at the top
    fig.update_yaxes(autorange='reversed')

    # Ensure points are visible
    fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))

    # Show the plot
    fig.show()

In [None]:

# Box plot of the fastest lap time for every circuit (plotly express)
lap_axis_labels = dict(
    fastest_lap_time_sec='Fastest Lap Time (seconds)',
    circuit='Circuit',
)
fig = px.box(
    data_frame=f1Stats,
    x='circuit',
    y='fastest_lap_time_sec',
    labels=lap_axis_labels,
    title='Fastest Lap Time Distribution per Circuit',
)

# Layout: axis titles, no legend, fixed height, slanted x labels, white theme, black text
lap_box_layout = dict(
    xaxis_title="Circuit",
    yaxis_title="Fastest Lap Time (seconds)",
    showlegend=False,
    height=600,
    xaxis=dict(tickangle=45),
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
    font=dict(color='#000000'),
)
fig.update_layout(**lap_box_layout)

# Marker size and colour for the points drawn by the box traces
fig.update_traces(marker=dict(size=5, color='DarkSlateGrey'))

fig.show()

In [None]:
# TODO: scaling (not implemented yet)

# TODO: removing outliers (not implemented yet)

## Focus on teams and fastest pitstops

In [None]:


# Palette cycled over the years present in the data (readable on light and dark
# backgrounds). The colours are NOT tied to specific years: the mapping below assigns
# them in sorted-year order.
custom_colors = [
    '#1f77b4',  # Blue
    '#ff7f0e',  # Orange
    '#2ca02c',  # Green
    '#d62728',  # Red
    '#9467bd',  # Purple
    '#8c564b',  # Brown
    '#e377c2',  # Pink
    '#17becf'   # Cyan
]  # Add more colors if the data has more unique years than entries here

# Build ONE year -> colour mapping from the whole dataset, in sorted order, so a given
# year keeps the same colour in every team's figure. Building it per team from the
# order years happen to appear in made the same year change colour between figures.
unique_years = sorted(f1Stats['year'].unique())
color_map = {str(year): custom_colors[i % len(custom_colors)] for i, year in enumerate(unique_years)}

# Get unique teams; a missing team name can never match the equality filter below,
# so it is dropped instead of producing an empty figure.
teams = f1Stats['team'].dropna().unique()

# Create a separate scatter plot for each team
for team in teams:
    # Filter data for the current team
    team_df = f1Stats[f1Stats['team'] == team]

    # Create scatter plot with discrete colors
    fig = px.scatter(
        team_df,
        x='circuit',
        y='pitstop_time_sec',
        color='year',  # Differentiate years with discrete colors
        symbol='year',  # Use different symbols for years
        hover_data=['year', 'race', 'country', 'driver'],  # Show additional info on hover
        title=f'Pitstop Times for {team}',
        labels={
            'pitstop_time_sec': 'Pitstop Time (seconds)',
            'circuit': 'Circuit',
            'year': 'Year'
        },
        color_discrete_map=color_map  # Shared mapping: same year, same colour everywhere
    )

    # Update layout for better readability and legend visibility
    fig.update_layout(
        xaxis_title="Circuit",
        yaxis_title="Pitstop Time (seconds)",
        showlegend=True,
        height=600,
        xaxis={'tickangle': 45},  # Rotate x-axis labels for readability
        plot_bgcolor='#ffffff',  # White plotting area
        paper_bgcolor='#ffffff',  # White area around the plot
        font=dict(color='#000000'),  # Black text for readability
        legend=dict(
            title_font_color="#000000",
            font=dict(color="#000000", size=12),
            bgcolor='rgba(255, 255, 255, 0.8)',  # Semi-transparent white background for legend
            bordercolor="#000000",
            borderwidth=1
        )
    )

    # Adjust y-axis to have shorter pitstop times at the top
    fig.update_yaxes(autorange='reversed')

    # Ensure points are visible
    fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))

    # Show the plot
    fig.show()

In [None]:
# Box plot of pitstop time for every circuit (plotly express)
pitstop_axis_labels = dict(
    pitstop_time_sec='Pitstop Time (seconds)',
    circuit='Circuit',
)
fig = px.box(
    data_frame=f1Stats,
    x='circuit',
    y='pitstop_time_sec',
    labels=pitstop_axis_labels,
    title='Pitstop Time Distribution per Circuit',
)

# Layout: axis titles, no legend, fixed height, slanted x labels, white theme, black text
pitstop_box_layout = dict(
    xaxis_title="Circuit",
    yaxis_title="Pitstop Time (seconds)",
    showlegend=False,
    height=600,
    xaxis=dict(tickangle=45),
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
    font=dict(color='#000000'),
)
fig.update_layout(**pitstop_box_layout)

# Marker size and colour for the points drawn by the box traces
fig.update_traces(marker=dict(size=5, color='DarkSlateGrey'))

fig.show()

In [None]:
# TODO: scaling (not implemented yet)

# TODO: removing outliers (not implemented yet)

## Focus on driver wins

In [None]:
# Count the rows with final_position == 1 for each driver and show them as a bar chart
win_counts = f1Stats.loc[f1Stats['final_position'] == 1, 'driver'].value_counts()
ax = win_counts.plot.bar(figsize=(12, 6), title='Number of Wins per Driver')
ax.set_ylabel('Wins')
plt.show()

In [None]:
# 1 where the row is a first-place finish, 0 otherwise (stored as a new 'win' column)
f1Stats['win'] = f1Stats['final_position'].eq(1).astype(int)
# share of each team's rows that are wins, best team first
team_win_rate = f1Stats.groupby('team')['win'].mean().sort_values(ascending=False)

ax = team_win_rate.plot.bar(figsize=(12, 6), title='Team Win Rate')
ax.set_ylabel('Win Rate')
plt.show()

In [None]:
# One box of final positions for every starting grid slot
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=f1Stats, x='grid_position', y='final_position', ax=ax)
ax.set(
    title='Distribution of Final Positions per Grid Start',
    xlabel='Grid Start Position',
    ylabel='Final Position',
)
plt.show()

In [None]:
# Mean final position per driver, sorted ascending (lowest average first)
avg_position = f1Stats.groupby('driver')['final_position'].mean().sort_values()
fig, ax = plt.subplots(figsize=(12, 6))
avg_position.plot.barh(color='skyblue', ax=ax)
ax.set_title('Average Final Position per Driver')
ax.set_xlabel('Average Final Position')
# flip the y axis so the first (lowest-average) driver is drawn at the top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Palette cycled over the sorted years of the dataset (readable on light and dark backgrounds)
custom_colors = [
    '#1f77b4',  # Blue
    '#ff7f0e',  # Orange
    '#2ca02c',  # Green
    '#d62728',  # Red
    '#9467bd',  # Purple
    '#e377c2',  # Pink
]
# one colour per year, shared by all figures; the palette wraps around if it runs out
unique_years = sorted(f1Stats['year'].unique())
color_map = {
    str(year): custom_colors[pos % len(custom_colors)]
    for pos, year in enumerate(unique_years)
}

# Every distinct driver gets a figure of their own
drivers = f1Stats['driver'].unique()

# Settings that are identical for all figures, built once outside the loop
position_labels = {
    'final_position': 'Final Position',
    'circuit': 'Circuit',
    'year': 'Year'
}
position_layout = dict(
    xaxis_title="Circuit",
    yaxis_title="Final Position",
    showlegend=True,
    height=600,
    xaxis=dict(tickangle=45),
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
    font=dict(color='#000000'),
    legend=dict(
        title_font_color="#000000",
        font=dict(color="#000000", size=12),
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor="#000000",
        borderwidth=1,
    ),
)
point_style = dict(size=10, line=dict(width=1, color='DarkSlateGrey'))

for driver in drivers:
    # rows belonging to this driver only
    driver_df = f1Stats.loc[f1Stats['driver'] == driver]

    fig = px.scatter(
        data_frame=driver_df,
        x='circuit',
        y='final_position',
        color='year',
        symbol='year',
        hover_data=['year', 'race', 'country'],
        title=f'Final Race Positions for {driver}',
        labels=position_labels,
        color_discrete_map=color_map,
    )

    # P1 at the top: reverse the y axis
    fig.update_yaxes(autorange='reversed')
    fig.update_layout(**position_layout)
    fig.update_traces(marker=point_style)
    fig.show()

In [None]:
# NOTE(review): fillna(0) writes a placeholder position 0 for missing results back into
# f1Stats, so any cell that aggregates 'final_position' gives different numbers when it
# is re-run after this one. It is kept because later cells may rely on the column having
# no NaN; the win flag below does not need it (NaN == 1 is already False).
f1Stats['final_position'] = pd.to_numeric(f1Stats['final_position'], errors='coerce').fillna(0)

# Create win column: 1 for a first-place finish, 0 otherwise
f1Stats['win'] = (f1Stats['final_position'] == 1).astype(int)

# Win rate per driver (wins / rows for that driver), best first
win_proportion = f1Stats.groupby('driver')['win'].mean().sort_values(ascending=False)

# A pie normalises its input to 100 %, so the slices must be parts of ONE total.
# Win rates are not: each is relative to a different number of races. Use the number
# of wins instead, so every slice is that driver's share of all wins, and leave out
# drivers without a win (they would only add 0.0 % labels and legend entries).
wins_per_driver = f1Stats.groupby('driver')['win'].sum().sort_values(ascending=False)
wins_per_driver = wins_per_driver[wins_per_driver > 0]

# Generate pastel colors using Set3. Set3 is a 12-colour qualitative map; indexing it
# with an integer picks one listed colour, whereas a float in [0, 1) is binned and gives
# neighbouring slices the same colour once there are more than 12 of them.
num_drivers = len(wins_per_driver)
pastel_cmap = plt.get_cmap('Set3')
colors = [pastel_cmap(i % pastel_cmap.N) for i in range(num_drivers)]

# Create pie chart without labels (names go in the legend instead)
plt.figure(figsize=(10, 8))
patches, texts, autotexts = plt.pie(
    wins_per_driver,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors
)

# Add legend with driver names
plt.legend(patches, wins_per_driver.index, title="Driver", loc="center left", bbox_to_anchor=(1, 0.5))

plt.title('Proportion of Wins per Driver')
plt.axis('equal')
plt.tight_layout()
plt.show()

# Feature Engineering

# Modeling