# Data Loading

In [10]:
import pandas as pd

# Location of the trimmed game data, relative to the notebook's working directory.
games_json_path = '../data/trimmed_games.json'

# Parse the JSON file into the raw DataFrame.
df = pd.read_json(games_json_path)

# Data Filtering

In [55]:
# Show every column available in the raw frame.
print(df.columns)

# Cast the win flag to int so the y values extracted below are integers.
df['game_won'] = df['game_won'].astype(int)

# 'game_won' leads both lists so the label stays attached to its row while
# rows with missing values are removed.
offense_columns = [
    'game_won',
    # passing
    'passing_attempts', 'passing_completions', 'passing_yards',
    'passing_rating', 'passing_touchdowns', 'passing_interceptions',
    'passing_sacks', 'passing_sacks_yards_lost',
    # rushing
    'rushing_attempts', 'rushing_yards', 'rushing_touchdowns',
    # receiving
    'receiving_targets', 'receiving_receptions', 'receiving_yards',
    'receiving_touchdowns',
]

defense_columns = [
    'game_won',
    'defense_sacks', 'defense_tackles', 'defense_tackle_assists',
    'defense_interceptions', 'defense_interception_yards',
    'defense_interception_touchdowns', 'defense_safeties',
]

# Select the offense and defense column subsets and discard any row that has
# a missing value in its subset.
offense = df[offense_columns].dropna()
defense = df[defense_columns].dropna()

# Pull the 'game_won' labels out as y, then remove that column from the
# offense features. (defense keeps its 'game_won' column at this point.)
y = offense['game_won'].to_numpy()
offense = offense.drop(columns='game_won')

print(offense.columns)

Index(['player_id', 'year', 'date', 'game_number', 'age', 'team',
       'game_location', 'opponent', 'game_won', 'player_team_score',
       'opponent_score', 'passing_attempts', 'passing_completions',
       'passing_yards', 'passing_rating', 'passing_touchdowns',
       'passing_interceptions', 'passing_sacks', 'passing_sacks_yards_lost',
       'rushing_attempts', 'rushing_yards', 'rushing_touchdowns',
       'receiving_targets', 'receiving_receptions', 'receiving_yards',
       'receiving_touchdowns', 'kick_return_attempts', 'kick_return_yards',
       'kick_return_touchdowns', 'punt_return_attempts', 'punt_return_yards',
       'punt_return_touchdowns', 'defense_sacks', 'defense_tackles',
       'defense_tackle_assists', 'defense_interceptions',
       'defense_interception_yards', 'defense_interception_touchdowns',
       'defense_safeties', 'point_after_attemps', 'point_after_makes',
       'field_goal_attempts', 'field_goal_makes', 'punting_attempts',
       'punting_yards', '

# ML Model: Offense Data

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Feature names and the matching feature matrix for the offense columns.
features = offense.columns
X = offense.to_numpy()

# Show both shapes so a row-count mismatch between X and y is visible.
print(X.shape)
print(y.shape)

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Build the random forest and fit it on the training rows in one step
# (fit returns the fitted estimator itself).
rf_classifier = RandomForestClassifier(random_state=10).fit(X_train, y_train)

# Predict on the training rows first, then on the held-out rows.
train_pred = rf_classifier.predict(X_train)
y_pred = rf_classifier.predict(X_test)

print(y_pred)
print(y_test)

# Accuracy on the rows the model was fitted on.
accuracy = accuracy_score(y_train, train_pred)
print(f"Train Accuracy: {accuracy}")

# Accuracy on the held-out rows; `accuracy` ends up holding this value.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

(413159, 15)
(413159,)
[1 0 1 ... 1 1 1]
[1 0 1 ... 1 0 0]
Train Accuracy: 0.5622778169408248
Test Accuracy: 0.5149215800174267


# Defense Data

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Take the 'game_won' labels as y and build the feature frame under a new
# name. `defense` itself is left unchanged so this cell can be re-run:
# reassigning `defense` without 'game_won' would make the label lookup above
# fail with a KeyError on a second run.
y = defense['game_won'].values
defense_features = defense.drop(columns='game_won')

# Feature names and the matching feature matrix for the defense columns.
features = defense_features.columns
X = defense_features.to_numpy()

# Show both shapes so a row-count mismatch between X and y is visible.
print(X.shape)
print(y.shape)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=10)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the train data
train_pred = rf_classifier.predict(X_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

print(y_pred)
print(y_test)

# Accuracy on the rows the model was fitted on
accuracy = accuracy_score(y_train, train_pred)
print(f"Train Accuracy: {accuracy}")

# Accuracy on the held-out rows; `accuracy` ends up holding this value
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

(413159, 7)
(413159,)
[0 0 0 ... 0 1 0]
[1 0 1 ... 1 0 0]
Train Accuracy: 0.5179183546276098
Test Accuracy: 0.5175597831348631


# Hyperparameter Testing

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed keeps every candidate fit repeatable.
rf_classifier = RandomForestClassifier(random_state=10)

# Candidate values to search: 3 x 3 x 3 = 27 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# 27 combinations x 3 folds = 81 fits, plus a refit of the best candidate.
# n_jobs=-1 spreads those fits across all available cores instead of running
# them one after another; the selected parameters are unaffected.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# NOTE: X_train / y_train / X_test / y_test are whatever the most recently
# executed split cell produced (the defense split when the notebook is run
# top to bottom).
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Best candidate, refitted on the full training split by GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)