In [135]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Import Quarterback performance dataset

### Collect data

In [136]:
career_stats_passing = pd.read_csv("Career_Stats_Passing.csv")

In [137]:
career_stats_passing

Unnamed: 0,Player Id,Name,Position,Year,Team,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,...,TD Passes,Percentage of TDs per Attempts,Ints,Int Rate,Longest Pass,Passes Longer than 20 Yards,Passes Longer than 40 Yards,Sacks,Sacked Yards Lost,Passer Rating
0,tomfarris/2513861,"Farris, Tom",,1948,Chicago Rockets,0,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
1,tomfarris/2513861,"Farris, Tom",,1947,Chicago Bears,9,2,0,0.0,0.2,...,0,0.0,0,0.0,--,0,0,0,0,39.6
2,tomfarris/2513861,"Farris, Tom",,1946,Chicago Bears,11,21,8,38.1,1.9,...,1,4.8,2,9.5,--,0,0,0,0,31.5
3,billdemory/2512778,"Demory, Bill",,1974,New York Jets,1,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
4,billdemory/2512778,"Demory, Bill",,1973,New York Jets,6,39,12,30.8,6.5,...,2,5.1,8,20.5,--,0,0,8,73,22.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8520,billanderson/2508534,"Anderson, Bill",,1962,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8521,billanderson/2508534,"Anderson, Bill",,1961,Washington Redskins,14,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8522,billanderson/2508534,"Anderson, Bill",,1960,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8523,billanderson/2508534,"Anderson, Bill",,1959,Washington Redskins,11,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0


Career_stats_passing dataset contains information about the quarterbacks performance

In [138]:
qb_performance = career_stats_passing.copy()

Convert string columns to numerical data types

In [139]:
numerical_columns = ['Passes Completed', 'Passes Attempted', 'Passing Yards', 'TD Passes', 'Ints']
for col in numerical_columns:
    qb_performance[col] = pd.to_numeric(qb_performance[col], errors='coerce')

### Feature Engineering

Create new features that could potentially improve the model's performance

In [140]:
qb_performance['Completion Percentage'] = qb_performance['Passes Completed'] / qb_performance['Passes Attempted']
qb_performance['Yards per Attempt'] = qb_performance['Passing Yards'] / qb_performance['Passes Attempted']
qb_performance['Touchdown per Attempt'] = qb_performance['TD Passes'] / qb_performance['Passes Attempted']
qb_performance['Interception per Attempt'] = qb_performance['Ints'] / qb_performance['Passes Attempted']
qb_performance['Passing Efficiency'] = qb_performance['Completion Percentage'] * qb_performance['Yards per Attempt']
qb_performance['TD to INT Ratio'] = qb_performance['TD Passes'] / qb_performance['Ints']
qb_performance['TD to Completion Ratio'] = qb_performance['TD Passes'] / qb_performance['Passes Completed']
qb_performance['INT to Completion Ratio'] = qb_performance['Ints'] / qb_performance['Passes Completed']

Replace infinity values with NaN and fill any NaN values with 0

In [148]:
qb_performance.replace([np.inf, -np.inf], np.nan, inplace=True)
qb_performance.fillna(0, inplace=True)

Group by player and year to calculate the average performance over time

In [143]:
qb_performance_grouped = qb_performance.groupby(['Player Id', 'Year']).mean().reset_index()

In [144]:
qb_performance_grouped

Unnamed: 0,Player Id,Year,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,Passing Yards,TD Passes,Ints,Passer Rating,Yards per Attempt,Touchdown per Attempt,Interception per Attempt,Passing Efficiency,TD to INT Ratio,TD to Completion Ratio,INT to Completion Ratio
0,a.j.feeley/2504566,2001,1.0,14.0,10.0,0.714286,14.0,143.0,2.0,1.0,114.0,10.214286,0.142857,0.071429,7.295918,2.000000,0.200000,0.100000
1,a.j.feeley/2504566,2002,6.0,154.0,86.0,0.558442,25.7,0.0,6.0,5.0,75.4,0.000000,0.038961,0.032468,0.000000,1.200000,0.069767,0.058140
2,a.j.feeley/2504566,2003,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,a.j.feeley/2504566,2004,11.0,356.0,191.0,0.536517,32.4,0.0,11.0,15.0,61.7,0.000000,0.030899,0.042135,0.000000,0.733333,0.057592,0.078534
4,a.j.feeley/2504566,2005,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8291,yalelary/2519113,1960,12.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8292,yalelary/2519113,1961,14.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8293,yalelary/2519113,1962,14.0,1.0,0.0,0.000000,0.1,0.0,0.0,0.0,39.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8294,yalelary/2519113,1963,10.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [145]:
qb_performance_grouped.head()

Unnamed: 0,Player Id,Year,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,Passing Yards,TD Passes,Ints,Passer Rating,Yards per Attempt,Touchdown per Attempt,Interception per Attempt,Passing Efficiency,TD to INT Ratio,TD to Completion Ratio,INT to Completion Ratio
0,a.j.feeley/2504566,2001,1.0,14.0,10.0,0.714286,14.0,143.0,2.0,1.0,114.0,10.214286,0.142857,0.071429,7.295918,2.0,0.2,0.1
1,a.j.feeley/2504566,2002,6.0,154.0,86.0,0.558442,25.7,0.0,6.0,5.0,75.4,0.0,0.038961,0.032468,0.0,1.2,0.069767,0.05814
2,a.j.feeley/2504566,2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,a.j.feeley/2504566,2004,11.0,356.0,191.0,0.536517,32.4,0.0,11.0,15.0,61.7,0.0,0.030899,0.042135,0.0,0.733333,0.057592,0.078534
4,a.j.feeley/2504566,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Select the features and target variable

In [110]:
feature_columns = ['Completion Percentage', 'Yards per Attempt', 'Touchdown per Attempt', 'Interception per Attempt',
                   'Passing Efficiency', 'TD to INT Ratio', 'TD to Completion Ratio', 'INT to Completion Ratio']

In [111]:
target_column = 'Passer Rating'

# Linear Regression Model

In [112]:
X = qb_performance[['Player Id'] + feature_columns]
y = qb_performance[target_column]

In [127]:
# Save the 'Player Id' column before scaling
player_ids_test = X_test['Player Id'].values

In [128]:
X

Unnamed: 0,Player Id,Completion Percentage,Yards per Attempt,Touchdown per Attempt,Interception per Attempt,Passing Efficiency,TD to INT Ratio,TD to Completion Ratio,INT to Completion Ratio
0,tomfarris/2513861,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
1,tomfarris/2513861,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
2,tomfarris/2513861,0.380952,5.142857,0.047619,0.095238,1.959184,0.50,0.125000,0.250000
3,billdemory/2512778,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
4,billdemory/2512778,0.307692,4.076923,0.051282,0.205128,1.254438,0.25,0.166667,0.666667
...,...,...,...,...,...,...,...,...,...
8520,billanderson/2508534,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
8521,billanderson/2508534,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
8522,billanderson/2508534,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
8523,billanderson/2508534,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000


In [129]:
y

0        0.0
1       39.6
2       31.5
3        0.0
4       22.2
        ... 
8520     0.0
8521     0.0
8522     0.0
8523     0.0
8524     0.0
Name: Passer Rating, Length: 8525, dtype: float64

### Split the data into training and testing sets

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Standardize the data and train the model

In [119]:
# Scale the data without the 'Player Id' column
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop('Player Id', axis=1))
X_test_scaled = scaler.transform(X_test.drop('Player Id', axis=1))

In [120]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

Predict passer ratings for the test set

In [121]:
y_pred = lr.predict(X_test_scaled)
y_pred

array([121.99328409,  75.57330157,   4.8250345 , ...,   4.8250345 ,
        94.6071315 ,  52.79083823])

### Evaluate the model's performance

In [122]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [123]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 167.76
Root Mean Squared Error: 12.95
R-squared: 0.90


The mean squared error (MSE) is a measure of the average squared difference between the predicted and actual values in the test set. It measures how well the model is able to capture the variance in the test set. The lower the MSE, the better the model is at predicting the outcome variable.

The root mean squared error (RMSE) is the square root of the MSE, which is a measure of the average difference between the predicted and actual values in the same units as the dependent variable. It is a commonly used measure of the error of prediction and is useful in comparing different models.

The R-squared (R2) is a measure of the proportion of the variance in the dependent variable that is explained by the independent variables in the model. It ranges from 0 to 1, with higher values indicating a better fit of the model.



### Display the first 10 predictions and actual passer ratings

In [124]:
player_id_to_name = qb_performance[['Player Id', 'Name']].set_index('Player Id').to_dict()['Name']

In [125]:
results_df = pd.DataFrame({"Player Id": player_ids_test,
                           "Name": [player_id_to_name[player_id] for player_id in player_ids_test],
                           "Actual Passer Rating": y_test.values.ravel(),
                           "Predicted Passer Rating": y_pred.ravel()})

In [126]:
results_df.head(10)

Unnamed: 0,Player Id,Name,Actual Passer Rating,Predicted Passer Rating
0,joewashington/2528291,"Washington, Joe",118.8,121.993284
1,richgannon/2500754,"Gannon, Rich",79.8,75.573302
2,donmccauley/2520537,"McCauley, Don",0.0,4.825034
3,yalelary/2519113,"Lary, Yale",0.0,4.825034
4,demaryiusthomas/497328,"Thomas, Demaryius",0.0,4.825034
5,bretthundley/2552588,"Hundley, Brett",0.0,4.825034
6,bobgagliano/2500736,"Gagliano, Bob",61.2,58.121996
7,colinkaepernick/2495186,"Kaepernick, Colin",81.2,70.784616
8,deucemcallister/2504773,"McAllister, Deuce",0.0,4.825034
9,jimmygerman/2514780,"German, Jimmy",65.6,64.020559
