In [1]:
import numpy as np
import pandas as pd
import gc
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

## Data Preprocessing

In [2]:
# Input directory for datasets
in_dir = "/kaggle/input/nfl-big-data-bowl-2024/"

# Load datasets
games = pd.read_csv(in_dir+"games.csv", header=0)
players = pd.read_csv(in_dir+"players.csv", header=0)
plays = pd.read_csv(in_dir+"plays.csv", header=0)
tackles = pd.read_csv(in_dir+"tackles.csv", header=0)
tracking1 = pd.read_csv(in_dir+"tracking_week_1.csv", header=0)
tracking2 = pd.read_csv(in_dir+"tracking_week_2.csv", header=0)
tracking3 = pd.read_csv(in_dir+"tracking_week_3.csv", header=0)
tracking4 = pd.read_csv(in_dir+"tracking_week_4.csv", header=0)
tracking5 = pd.read_csv(in_dir+"tracking_week_5.csv", header=0)
tracking6 = pd.read_csv(in_dir+"tracking_week_6.csv", header=0)
tracking7 = pd.read_csv(in_dir+"tracking_week_7.csv", header=0)
tracking8 = pd.read_csv(in_dir+"tracking_week_8.csv", header=0)
tracking9 = pd.read_csv(in_dir+"tracking_week_9.csv", header=0)

In [3]:
tracking = tracking1
for ds in [tracking2, tracking3, tracking4, tracking5, tracking6, tracking7, tracking8, tracking9]:
    tracking = pd.concat([tracking, ds], axis=0)
del tracking1
del tracking2
del tracking3
del tracking4
del tracking5
del tracking6
del tracking7
del tracking8
del tracking9
gc.collect()

0

In [4]:
# Didn't end up needing to remove duplicate columns,
# but keeping anyway in case needed later
games_col = set(games.columns)
track_col = set(tracking.columns)
col_diff = games_col.difference(track_col)
col_diff.add("gameId")

# Join games and tracking datasets for game-level data
game_data = pd.merge(tracking, games, on="gameId")
# Join plays and tracking datasets for play-level data
play_data = pd.merge(tracking, plays, on=["gameId", "playId"])
del tracking
del games
del plays
gc.collect()
set(game_data.columns).difference(set(play_data.columns))

{'gameDate',
 'gameTimeEastern',
 'homeFinalScore',
 'homeTeamAbbr',
 'season',
 'visitorFinalScore',
 'visitorTeamAbbr',
 'week'}

In [5]:
#foul_plays = play_data.loc[pd.isnull(play_data["foulName1"]) == False]
#fouls_plays = foul_plays.loc[pd.isnull(foul_plays["foulName2"]) == False]

# Create new column representing whether or not foul occurred on play
fouls_data = play_data[["foulName1", "foulName2", "x", "y", "s", "a", "dis", "o", "dir", "event"]]
fouls_data["foul"] = [0 if pd.isnull(x) & pd.isnull(y) else 1 for x, y in zip(fouls_data.foulName1, fouls_data.foulName2)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fouls_data["foul"] = [0 if pd.isnull(x) & pd.isnull(y) else 1 for x, y in zip(fouls_data.foulName1, fouls_data.foulName2)]


In [6]:
fouls_data.loc[fouls_data.foul == 1].head()

Unnamed: 0,foulName1,foulName2,x,y,s,a,dis,o,dir,event,foul
15571,Horse Collar Tackle,,59.59,22.47,2.12,0.43,0.23,277.16,202.14,,1
15572,Horse Collar Tackle,,59.5,22.26,2.16,0.51,0.23,278.81,203.61,pass_arrived,1
15573,Horse Collar Tackle,,59.41,22.07,2.16,0.61,0.22,278.81,205.72,,1
15574,Horse Collar Tackle,,59.3,21.87,2.22,0.72,0.23,282.38,208.46,,1
15575,Horse Collar Tackle,,59.18,21.67,2.23,0.79,0.23,285.15,211.16,,1


In [7]:
# Generate train and test data for binary classifiers
fouls_data_red = fouls_data.sample(10000)
X = fouls_data_red[["x","y","s","a","dis","o","dir"]]
y = fouls_data_red["foul"]
fouls_data_test = fouls_data.drop(fouls_data_red.index).sample(1000)
X_test = fouls_data_test[["x","y","s","a","dis","o","dir"]]
y_test = fouls_data_test["foul"]

# Replace NaN values with mean of non-NaN values
X_means = X.mean(skipna=True)
X_repmean = X.fillna(X_means)
X_test_means = X_test.mean(skipna=True)
X_test_repmean = X_test.fillna(X_test_means)

In [8]:
# Generate artificially balanced data
fouls_data_bal_1 = fouls_data.loc[fouls_data["foul"] == 1].sample(5500)
fouls_data_bal_0 = fouls_data.loc[fouls_data["foul"] == 0].sample(5500)
fouls_data_bal = pd.concat([fouls_data_bal_1, fouls_data_bal_0]).sample(frac=1)
fouls_data_bal.reindex()

# Sample from balanced data to get test data
fouls_data_test_bal = fouls_data_bal.sample(1000)
fouls_data_bal = fouls_data_bal.drop(fouls_data_test_bal.index)

X = fouls_data_bal[["x","y","s","a","dis","o","dir"]]
y = fouls_data_bal["foul"]
X_test = fouls_data_test_bal[["x","y","s","a","dis","o","dir"]]
y_test = fouls_data_test_bal["foul"]

# Replace NaN values with mean of non-NaN values
X_means = X.mean(skipna=True)
X_repmean = X.fillna(X_means)
X_test_means = X_test.mean(skipna=True)
X_test_repmean = X_test.fillna(X_test_means)

In [9]:
sum(fouls_data_test_bal["foul"] == 1)

495

In [10]:
sum(y_test)/len(y_test)

0.495

## SVM Model

In [11]:
clf = svm.SVC()
clf.fit(X_repmean, y)
clf.support_vectors_

array([[9.9770e+01, 2.8460e+01, 2.0200e+00, ..., 2.1000e-01, 1.6073e+02,
        1.5332e+02],
       [3.2630e+01, 2.1150e+01, 7.8000e-01, ..., 8.0000e-02, 2.6779e+02,
        2.5052e+02],
       [4.9000e+01, 2.3040e+01, 2.8800e+00, ..., 3.1000e-01, 1.4322e+02,
        1.7435e+02],
       ...,
       [1.1840e+01, 2.5210e+01, 2.5800e+00, ..., 2.6000e-01, 3.3459e+02,
        7.6800e+00],
       [3.5330e+01, 2.3960e+01, 3.2600e+00, ..., 3.4000e-01, 7.3070e+01,
        1.7689e+02],
       [5.5190e+01, 3.7260e+01, 2.3700e+00, ..., 2.4000e-01, 2.5713e+02,
        1.9671e+02]])

In [12]:
len(clf.support_vectors_)

9746

In [13]:
preds = clf.predict(X_test_repmean)
true_pred = sum(preds == y_test)
true_pred/len(y_test)

0.527

## Random Forest Model

In [14]:
rfc = RandomForestClassifier(max_depth=15)
rfc.fit(X_repmean, y)

In [15]:
preds = rfc.predict(X_test_repmean)
true_pred = sum(preds == y_test)
true_pred/len(y_test)

0.531

## Save Models

In [16]:
from joblib import dump
dump(clf, "svmmodel.joblib")
dump(rfc, "rfcmodel.joblib")

['rfcmodel.joblib']