# Boston Marathon Performance Analysis

This notebook analyzes marathon performance using the Boston Marathon dataset.

# Import Libraries and Setup

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
from scipy import stats
from scipy.stats import jarque_bera, shapiro, normaltest
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Silence every Python warning for the rest of the session. Note that this also
# hides deprecation notices and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'seaborn-v0_8' style sheet to all figures drawn afterwards.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(42)

# Data Loading and Initial Exploration

In [3]:
# Read the athlete results into `df`, the frame the following cells work on.
url = "/content/Athletes.csv"
df = pd.read_csv(url)

# Headline size figures: (rows, columns) and the deep memory footprint in MB.
total_bytes = df.memory_usage(deep=True).sum()
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

# One titled section per summary: column dtypes, per-column null counts and
# describe() statistics for the numeric columns.
for heading, summary in (
    ("\nColumn types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
    ("\nBasic statistics:", df.describe()),
):
    print(heading)
    print(summary)

Dataset shape: (17220, 10)
Memory usage: 3.04 MB

Column types:
Bib                 int64
Zip                 int64
Age                 int64
Age Group          object
Gender             object
First Half          int64
Second Half         int64
Finish              int64
Positive Split      int64
Percent Change    float64
dtype: object

Missing values:
Bib               0
Zip               0
Age               0
Age Group         0
Gender            0
First Half        0
Second Half       0
Finish            0
Positive Split    0
Percent Change    0
dtype: int64

Basic statistics:
                Bib           Zip           Age    First Half   Second Half  \
count  17220.000000  17220.000000  17220.000000  17220.000000  17220.000000   
mean   15688.916783  38541.031127     42.217828   6526.240999   7587.674158   
std     8827.226766  34262.257821     12.540766   1270.014789   1779.232288   
min        9.000000    627.000000     18.000000   3795.000000   3961.000000   
25%     7997.75000

# Data Cleaning and Preprocessing

In [4]:
# --- Finish time -------------------------------------------------------------
# 'time' is the sum of the two half splits; it is shown next to the splits as a
# quick sanity check.
df['time'] = df['First Half'] + df['Second Half']
display(df[['First Half', 'Second Half', 'time']].head())

# The recorded 'Finish' column is the finish time used from here on. The code
# below treats it as seconds (it divides by 3600 to obtain hours).
df['Time_seconds'] = df['Finish']
# .copy() gives an independent frame, so the column assignments below never
# write into a view of the previous frame.
df = df.dropna(subset=['Time_seconds']).copy()

# --- Age groups --------------------------------------------------------------
# Left-closed bins ([0, 30), [30, 40), ..., [60, inf)) so every label matches
# its range: a 30-year-old is '30-40' and a 60-year-old is '60+'. With pandas'
# default right-closed bins, age 30 was labelled '<30' and age 60 '50-60'.
# The open upper edge keeps ages of 100 and above from becoming NaN.
df['Age_group'] = pd.cut(
    df['Age'],
    bins=[0, 30, 40, 50, 60, np.inf],
    right=False,
    labels=['<30', '30-40', '40-50', '50-60', '60+'],
)

# --- Pace and speed ----------------------------------------------------------
MARATHON_MILES = 26.2
df['Pace_per_mile'] = df['Time_seconds'] / MARATHON_MILES           # seconds per mile
df['Speed_mph'] = MARATHON_MILES / (df['Time_seconds'] / 3600)      # miles per hour

# --- Performance category ----------------------------------------------------
# Bin edges in seconds: 9000 = 2:30, 10800 = 3:00, 12600 = 3:30, 14400 = 4:00.
# Previously the edges started at 3:00, which shifted every label by one bin
# (finishers between 3:00 and 3:30 were labelled 'Sub3', and so on). The edges
# now line up with the labels, and left-closed bins keep an exact 3:00:00
# finish out of 'Sub3'.
# NOTE(review): the 2:30:00 'Elite' cut-off is a chosen value; adjust it if a
# different definition of elite is intended.
df['Performance_category'] = pd.cut(
    df['Time_seconds'],
    bins=[0, 9000, 10800, 12600, 14400, np.inf],
    right=False,
    labels=['Elite', 'Sub3', 'Sub3.5', 'Sub4', 'Recreational'],
)

# --- Outlier removal (Tukey fences on finish time) ---------------------------
# Keep rows whose finish time lies within 1.5 * IQR of the quartiles.
Q1 = df['Time_seconds'].quantile(0.25)
Q3 = df['Time_seconds'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
within_fences = (df['Time_seconds'] >= lower_bound) & (df['Time_seconds'] <= upper_bound)
df = df[within_fences].copy()

print(f"Cleaned dataset shape: {df.shape}")

Unnamed: 0,First Half,Second Half,time
0,3832,3961,7793
1,3845,4059,7904
2,3795,4157,7952
3,3997,4065,8062
4,3979,4198,8177


Cleaned dataset shape: (17156, 16)


# Advanced Feature Engineering

In [5]:
# One LabelEncoder per encoded column, kept as separate objects so each can be
# reused independently.
le_gender = LabelEncoder()
le_country = LabelEncoder()
le_city = LabelEncoder()

# Build the encoded and age-derived columns on a copy, leaving `df` untouched.
# NOTE(review): 'Country_encoded' and 'City_encoded' are both fitted on 'Zip',
# so the two columns hold identical values - confirm whether this is intended.
df_encoded = df.copy().assign(
    Gender_encoded=lambda frame: le_gender.fit_transform(frame['Gender']),
    Country_encoded=lambda frame: le_country.fit_transform(frame['Zip']),
    City_encoded=lambda frame: le_city.fit_transform(frame['Zip']),
    Age_Gender_interaction=lambda frame: frame['Age'] * frame['Gender_encoded'],
    Age_squared=lambda frame: frame['Age'] ** 2,
    Age_cubed=lambda frame: frame['Age'] ** 3,
    # Ten equal-width age bins, stored as integer bin indices.
    Age_bin=lambda frame: pd.cut(frame['Age'], bins=10, labels=False),
)

# Features expressed relative to the finish-time distribution.
# NOTE(review): both are computed from 'Time_seconds'; if that column is the
# prediction target they leak it - TODO confirm against the modelling cells.
median_time = df_encoded['Time_seconds'].median()
fast_cutoff = df_encoded['Time_seconds'].quantile(0.3)
df_encoded = df_encoded.assign(
    Time_ratio_to_median=lambda frame: frame['Time_seconds'] / median_time,
    # 1 when the finish time is below the 30th percentile, otherwise 0.
    Fast_runner=lambda frame: (frame['Time_seconds'] < fast_cutoff).astype(int),
)

print("Advanced features created successfully")
print(f"Final feature count: {df_encoded.shape[1]}")

Advanced features created successfully
Final feature count: 25
