In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Import Quarterback performance dataset

Collect data

In [2]:
career_stats_passing = pd.read_csv("Career_Stats_Passing.csv")

In [3]:
career_stats_passing

Unnamed: 0,Player Id,Name,Position,Year,Team,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,...,TD Passes,Percentage of TDs per Attempts,Ints,Int Rate,Longest Pass,Passes Longer than 20 Yards,Passes Longer than 40 Yards,Sacks,Sacked Yards Lost,Passer Rating
0,tomfarris/2513861,"Farris, Tom",,1948,Chicago Rockets,0,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
1,tomfarris/2513861,"Farris, Tom",,1947,Chicago Bears,9,2,0,0.0,0.2,...,0,0.0,0,0.0,--,0,0,0,0,39.6
2,tomfarris/2513861,"Farris, Tom",,1946,Chicago Bears,11,21,8,38.1,1.9,...,1,4.8,2,9.5,--,0,0,0,0,31.5
3,billdemory/2512778,"Demory, Bill",,1974,New York Jets,1,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
4,billdemory/2512778,"Demory, Bill",,1973,New York Jets,6,39,12,30.8,6.5,...,2,5.1,8,20.5,--,0,0,8,73,22.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8520,billanderson/2508534,"Anderson, Bill",,1962,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8521,billanderson/2508534,"Anderson, Bill",,1961,Washington Redskins,14,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8522,billanderson/2508534,"Anderson, Bill",,1960,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8523,billanderson/2508534,"Anderson, Bill",,1959,Washington Redskins,11,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0


Career_stats_passing dataset contains information about the quarterbacks performance

In [4]:
qb_performance = career_stats_passing.copy()

Convert string columns to numerical data types

In [5]:
numerical_columns = ['Passes Completed', 'Passes Attempted', 'Passing Yards', 'TD Passes', 'Ints']
for col in numerical_columns:
    qb_performance[col] = pd.to_numeric(qb_performance[col], errors='coerce')

# Feature Engineering

In [22]:
qb_performance['Completion Percentage'] = qb_performance['Passes Completed'] / qb_performance['Passes Attempted']
qb_performance['Yards per Attempt'] = qb_performance['Passing Yards'] / qb_performance['Passes Attempted']
qb_performance['Touchdown per Attempt'] = qb_performance['TD Passes'] / qb_performance['Passes Attempted']
qb_performance['Interception per Attempt'] = qb_performance['Ints'] / qb_performance['Passes Attempted']

Replace infinity values with NaN and Fill any NaN values with 0

In [7]:
qb_performance.replace([np.inf, -np.inf], np.nan, inplace=True)

In [8]:
qb_performance.fillna(0, inplace=True)

Group by player and year to calculate the average performance over time

In [23]:
qb_performance_grouped = qb_performance.groupby(['Player Id', 'Year']).mean().reset_index()
qb_performance_grouped

Unnamed: 0,Player Id,Year,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,Passing Yards,TD Passes,Ints,Passer Rating,Yards per Attempt,Touchdown per Attempt,Interception per Attempt
0,a.j.feeley/2504566,2001,1.0,14.0,10.0,0.714286,14.0,143.0,2.0,1.0,114.0,10.214286,0.142857,0.071429
1,a.j.feeley/2504566,2002,6.0,154.0,86.0,0.558442,25.7,0.0,6.0,5.0,75.4,0.000000,0.038961,0.032468
2,a.j.feeley/2504566,2003,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,,
3,a.j.feeley/2504566,2004,11.0,356.0,191.0,0.536517,32.4,0.0,11.0,15.0,61.7,0.000000,0.030899,0.042135
4,a.j.feeley/2504566,2005,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8291,yalelary/2519113,1960,12.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,,
8292,yalelary/2519113,1961,14.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,,
8293,yalelary/2519113,1962,14.0,1.0,0.0,0.000000,0.1,0.0,0.0,0.0,39.6,0.000000,0.000000,0.000000
8294,yalelary/2519113,1963,10.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,,


Select features for the model

In [10]:
selected_features = ['Completion Percentage', 'Yards per Attempt', 'Touchdown per Attempt', 'Interception per Attempt']

# Random Forest Regressor

In [11]:
X = qb_performance[selected_features]
y = qb_performance['Passer Rating']

### Split the dataset into training and testing sets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

Train the Random Forest Regressor

In [15]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

Make predictions on the test set

In [25]:
y_pred = rf_regressor.predict(X_test)
y_pred

array([118.8       ,  81.008     ,   4.85965321, ...,   4.85965321,
       103.517     ,  57.507     ])

Evaluate the model's performance

In [18]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [19]:
print(f'Mean Squared Error: {mse:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R2 Score: {r2:.2f}')

Mean Squared Error: 90.77
Root Mean Squared Error: 9.53
R2 Score: 0.95


In [28]:
results_df = pd.DataFrame({"Actual Passer Rating": y_test, "Predicted Passer Rating": y_pred})
results_df.head(10)

Unnamed: 0,Actual Passer Rating,Predicted Passer Rating
7505,118.8,118.8
2957,79.8,81.008
7033,0.0,4.859653
1084,0.0,4.859653
856,0.0,4.859653
4304,0.0,4.859653
2132,61.2,55.035
1385,81.2,78.736
5241,0.0,4.859653
6518,65.6,70.799
