In [None]:
%matplotlib inline

# Essentials: Data Cleansing and ETL
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.legend_handler import HandlerLine2D

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, auc # good for evaluation of binary classification problems
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw play-by-play export (comma-separated) into a DataFrame.
csv_path = 'nflData.csv'
df = pd.read_csv(csv_path, sep=",")

In [None]:
# Report how many rows were loaded from the CSV.
row_count = len(df)
print("Rows: ", row_count)

In [None]:
# Columns of the raw frame that the rest of the notebook works with.
play_attr = [
    'game_id', 'qtr', 'game_seconds_remaining', 'yardline_100', 'ydstogo',
    'drive', 'down', 'play_type', 'goal_to_go',
    'posteam', 'defteam', 'posteam_score', 'defteam_score',
    'score_differential', 'posteam_timeouts_remaining', 'no_score_prob',
    'opp_fg_prob', 'opp_safety_prob', 'home_team',
]

# Keep only run and pass plays. The explicit .copy() makes `plays` an
# independent frame, so the column assignments in later cells modify it
# directly instead of a slice of `df` (which raises SettingWithCopyWarning
# on pandas without copy-on-write).
is_run_or_pass = df['play_type'].isin(["run", "pass"])
plays = df.loc[is_run_or_pass, play_attr].copy()

In [None]:
# pos team is winning (1) vs. tied, trailing, or unknown differential (0)
plays['CurrentScoreBool'] = (plays['score_differential'] > 0).astype(int)

# pos team is home team
plays['Home'] = (plays['home_team'] == plays['posteam']).astype(int)

# final 2 mins of a half, while the possession team is NOT ahead.
# The original end-of-game test was `<= 0` AND `>= 120`, which can never be
# true; the intended window is 0..120 seconds remaining (inclusive).
# The 1800..1920 window is kept as written -- presumably the last two minutes
# of the first half of a 3600-second game; confirm against the data source.
seconds_left = plays['game_seconds_remaining']
end_of_game = seconds_left.between(0, 120)            # inclusive on both ends
end_of_first_half = seconds_left.between(1800, 1920)  # inclusive on both ends
plays['TwoMinuteDrill'] = (
    (end_of_game | end_of_first_half) & (plays['CurrentScoreBool'] == 0)
).astype(int)

# Encode the target: pass -> 1, everything else (i.e. run) -> 0.
# NOTE: this overwrites the string column, so re-running this cell on the
# already-encoded frame would set every value to 0.
plays['play_type'] = (plays['play_type'] == "pass").astype(int)


In [None]:
# Whole-number columns: replace missing values with 0 and cast each one to int.
int_columns = [
    'game_seconds_remaining',
    'yardline_100',
    'down',
    'posteam_score',
    'defteam_score',
    # 'RushingMean',   # currently disabled
    # 'PassingMean',   # currently disabled
    'score_differential',
    'goal_to_go',
]
for column in int_columns:
    plays[column] = plays[column].fillna(0).astype(int)

In [None]:
prob_columns = ['no_score_prob', 'opp_fg_prob', 'opp_safety_prob']

# Fill missing probabilities with 0 and downcast them to float32.
for column in prob_columns:
    plays[column] = plays[column].fillna(0).astype(np.float32)

# Replace each probability with the index of its quantile bin (5 bins;
# labels=False returns integer bin indices instead of interval labels).
for column in prob_columns:
    plays[column] = pd.qcut(plays[column], 5, labels=False)

In [None]:
# De-clutter the set: discard plays whose down is 0, then remove the
# home_team column.
has_down = plays['down'] != 0
plays = plays.loc[has_down].drop(columns=['home_team'])

plays.head(5)

In [None]:
# TODO: add back defteam

# Feature columns handed to the model.
plays_predictors = [
    'game_id',
    'qtr',
    'game_seconds_remaining',
    'yardline_100',
    'ydstogo',
    'drive',
    'down',
    'goal_to_go',
    'posteam_score',
    'defteam_score',
    'score_differential',
    'posteam_timeouts_remaining',
    'no_score_prob',
    'opp_fg_prob',
    'opp_safety_prob',
]

# Alternative, smaller feature set (disabled):
# plays_predictors = [
# 'game_id','qtr','game_seconds_remaining','yardline_100', 'ydstogo','drive','down', 'goal_to_go', 'posteam_score', 'defteam_score']

# Feature matrix.
X = plays.loc[:, plays_predictors]

# Prediction target: the play_type column.
y = plays['play_type']

In [None]:
# Partition features and target into training and validation subsets.
# A fixed random_state makes the shuffle (and so the split) repeatable;
# no test_size is passed, so scikit-learn's default proportions apply.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Decision Tree Classifier
# random_state is fixed so the fitted tree (and every metric derived from it)
# is reproducible across runs, matching the seeded train/validation split.
desc_tree = DecisionTreeClassifier(random_state=0)
desc_tree.fit(train_X, train_y)

# Hard class predictions for the validation set; preview the first few.
dt_predictions = desc_tree.predict(val_X)

print(dt_predictions[:10])

# ROC analysis needs a continuous score per sample, not the hard 0/1 labels:
# feeding labels to roc_curve collapses the curve to a single operating point.
# Use the predicted probability of the positive class (label 1).
positive_class_column = list(desc_tree.classes_).index(1)
dt_scores = desc_tree.predict_proba(val_X)[:, positive_class_column]

false_positive_rate, true_positive_rate, thresholds = roc_curve(val_y, dt_scores)
dt_roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# Show the area under the ROC curve computed for the decision tree.
print(dt_roc_auc)