In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
# read in file
df_combined = pd.read_csv("completed_df.csv")
df_combined.head()

Unnamed: 0,posteam,season,OLRank,OffRank,PFFOL,down,ydstogo,was_pressure,yardline_100,shotgun,...,defenders_in_box,number_of_pass_rushers,defense_players,was_pressure.1,defense_man_zone_type,defense_coverage_type,div_game,td_prob,fg_prob,PktTime
0,ARI,2021,30,8,58.14286,1.0,10.0,False,61.0,1.0,...,8.0,4.0,00-0036237;00-0033954;00-0029681;00-0034743;00...,False,ZONE_COVERAGE,COVER_3,0,0.404707,0.257016,2.3
1,ARI,2021,30,8,58.14286,1.0,18.0,False,31.0,1.0,...,7.0,4.0,00-0036237;00-0033954;00-0029681;00-0034743;00...,False,ZONE_COVERAGE,COVER_3,0,0.376114,0.49541,2.3
2,ARI,2021,30,8,58.14286,2.0,17.0,False,30.0,1.0,...,5.0,4.0,00-0033954;00-0029681;00-0034743;00-0036181;00...,False,ZONE_COVERAGE,COVER_4,0,0.290844,0.560816,2.3
3,ARI,2021,30,8,58.14286,2.0,24.0,False,24.0,1.0,...,5.0,4.0,00-0036237;00-0033954;00-0029681;00-0034743;00...,False,ZONE_COVERAGE,COVER_4,0,0.245268,0.655296,2.3
4,ARI,2021,30,8,58.14286,3.0,16.0,False,16.0,1.0,...,6.0,4.0,00-0033954;00-0029681;00-0030161;00-0031254;00...,False,MAN_COVERAGE,COVER_1,0,0.170239,0.74148,2.3


In [15]:
# drop columns that are unimportant/have too many missing values to be useful
df_combined = df_combined.drop(columns = ["qb_hit_1_player_id", "qb_hit_1_player_name", "qb_hit_2_player_id",
                                         "qb_hit_2_player_name", "sack_player_id", "sack_player_name", 
                                         "half_sack_1_player_id", "half_sack_1_player_name", "half_sack_2_player_id", 
                                         "half_sack_2_player_name"], axis=1)

In [16]:
df_combined = df_combined.drop('was_pressure.1', axis=1) # delete the extra was_pressure column
df_combined['was_pressure'] = df_combined['was_pressure'].replace({True: 1, False: 0}) # replace True/False to 1/0

In [17]:
# make numeric values for the teams
df_combined["teams_numeric"] = pd.factorize(df_combined["posteam"])[0]

In [18]:
# replace any missing values in the column "was_pressure" with the value "TRUE"
# This is because the missing values are for observations of a sack, so we cannot remove the missing value as
# it will remove all the rows where a sack occurred
# instead we will replace the missing values with TRUE since a sack is considered pressure by the defense
df_combined["was_pressure"] = df_combined["was_pressure"].apply(lambda x : 1.0 if pd.isna(x) or x == '' else x)

In [20]:
# Preparing the Random Forest Model
X = df_combined.drop(columns = ['sack', 'defense_man_zone_type', 'defense_coverage_type'], axis=1)
y = df_combined['sack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

In [21]:
# encode categorical columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

categorical_columns = X_train.select_dtypes(include=['object']).columns

X_train[categorical_columns] = ordinal_encoder.fit_transform(X_train[categorical_columns])
X_test[categorical_columns] = ordinal_encoder.transform(X_test[categorical_columns])

In [22]:
# build the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [23]:
# check accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.93

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      5209
           1       0.49      0.21      0.29       369

    accuracy                           0.93      5578
   macro avg       0.72      0.60      0.63      5578
weighted avg       0.92      0.93      0.92      5578



In [24]:
# build confusion matrix for the model
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[5128   81]
 [ 292   77]]
