In [None]:
# Find features with most importance

In [1]:
# Importing necessary packages
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error, log_loss
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Uploading DataFrame and inspecting its contents
df = pd.DataFrame()
df = pd.read_csv('/Users/dwreiter/Desktop/Work/Springboard/NFL Capstone Project/Data Wrangling/nfl_df.csv', delimiter='\t', index_col=0)
df.head()

Unnamed: 0,Date,Home Field Goal Attempts,Home First Downs,Home Fourth Down Attempts,Home Fourth Down Successes,Home Fumble TDs,Home Fumbles,Home Fumbles Lost,Home Fumbles Recovered,Home Goal To Go Attempts,...,Road Time of Possession,Road Total TDs,Road Total Yds,Road Touchbacks,Road Two Point Conversion Attempts,Road Two Point Conversion Successes,Road Win,Road Wins,Season,Week Number
0,"on December 20, 2015",1,19,2,0,0,2,2,0,0,...,2248,5,506,3,0,0,True,1,nfl-2015-2016,15
1,"on November 29, 2015",2,17,2,1,0,2,1,0,0,...,2247,2,350,1,0,0,True,1,nfl-2015-2016,12
2,"on December 27, 2015",1,19,0,0,2,2,1,3,3,...,1784,1,265,0,1,1,False,0,nfl-2015-2016,16
3,"on November 22, 2015",2,21,0,0,0,2,0,1,1,...,1875,4,415,3,0,0,False,0,nfl-2015-2016,11
4,"on January 3, 2016",1,16,1,0,0,0,0,0,0,...,2197,4,382,7,0,0,True,1,nfl-2015-2016,17


In [3]:
# Setting X and y for Road Teams with X having just statistics, y being Road Win
RW_X = df.drop(['Season', 'Week Number', 'Date', 'Name of Winning Team', 'Name of Losing Team', 'Name of Tying Teams', 'Road Team Name', 'Home Team Name', 'Road Win', 'Road Loss', 'Road Tie', 'Road Wins', 'Road Losses', 'Road Ties', 'Home Win', 'Home Loss', 'Home Tie', 'Home Wins', 'Home Losses', 'Home Ties', 'Road Points', 'Home Points'], axis=1)
RW_y = df['Road Win']

# Splitting the data into Train and Test
RW_X_train, RW_X_test, RW_y_train, RW_y_test = train_test_split(RW_X, RW_y, test_size = 0.2, random_state=42, stratify=RW_y)

In [4]:
# Setting up parameter C
c_space = np.logspace(-5, 8, 5)
RW_param_grid = {'C': c_space}

# Setting up Logistic Regression Model
RW_logreg = LogisticRegression()

# Using GridSearchCV to help avoid overfitting
RW_CV = GridSearchCV(RW_logreg, RW_param_grid, cv=5)

In [5]:
# Fitting Training Data to model and finding best parameter as well as score
RW_CV.fit(RW_X_train, RW_y_train)
print('Best Parameter for Road Win Model: ' + str(RW_CV.best_params_))
print('Best Score for Road Win Model: ' + str(RW_CV.best_score_))

Best Parameter for Road Win Model: {'C': 100000000.0}
Best Score for Road Win Model: 0.923357664234


In [6]:
# Setting up prediction based on testing data for X
RW_y_pred = RW_CV.predict(RW_X_test)

# Printing R Squared
R2_RW = RW_CV.score(RW_X_test, RW_y_test)
print('R squared: ' + str(R2_RW))

# Computing and printing Means Squared Error
RW_rmse = np.sqrt(mean_squared_error(RW_y_test, RW_y_pred))
print('Root Mean Squared Error: ' + str(RW_rmse))

R squared: 0.963768115942
Root Mean Squared Error: 0.190346746907


  output_errors = np.average((y_true - y_pred) ** 2, axis=0,


In [7]:
# Calculating the Accuracy Score as well as printing the Confusion Matrix and Classification Report
RW_score = metrics.accuracy_score(RW_y_test, RW_y_pred)

RW_CM = metrics.confusion_matrix(RW_y_test, RW_y_pred)

RW_CR = metrics.classification_report(RW_y_test, RW_y_pred)

print('Accuracy Score: ' + str(RW_score))
print(RW_CM)
print(RW_CR)

Accuracy Score: 0.963768115942
[[76  3]
 [ 2 57]]
             precision    recall  f1-score   support

      False       0.97      0.96      0.97        79
       True       0.95      0.97      0.96        59

avg / total       0.96      0.96      0.96       138



In [126]:
# Testing stuff
RW_feature_imp = list(RW_CV.best_estimator_.coef_)
RW_feature_imp

[array([  1.11133358e+00,  -5.63544279e-02,   4.19148815e-01,
         -1.33156450e+00,  -8.29733131e-01,   1.54912614e+00,
          2.75951769e-01,   4.59610406e-01,  -7.41258139e-01,
          2.79803329e-02,  -1.24625493e+00,  -1.52626717e+00,
          9.96427789e-01,  -1.35037960e+00,   5.02125793e-02,
          1.67441312e+00,  -1.08904644e+00,  -5.02308232e+00,
          4.83412554e-01,  -2.49622622e-01,   2.78116612e-01,
         -2.37229771e+00,  -1.90343053e-02,   1.79567096e-01,
         -1.06928563e+00,   7.95062494e-02,   2.10995423e-01,
         -2.15831238e-03,  -9.37168718e-01,  -4.22914338e-01,
          3.89051056e-01,   9.33896786e-01,  -2.55300397e-01,
         -2.63912329e-01,   7.34814593e-01,  -1.67992528e+00,
         -5.25011914e-02,  -3.08976628e-02,   6.50404030e-01,
          6.56943140e-01,   4.74193762e-01,  -1.41432303e+00,
          1.24025809e-02,  -7.34347692e+00,  -7.15354967e-02,
         -1.76279800e+00,   5.04870252e-01,  -3.32778358e+00,
        

In [125]:
# Calculating Feature Importance
# Trying to figure out
RW_feature_imp = pd.Series(RW_CV.best_estimator_.coef_, index=RW_X.columns).sort_values(ascending=False)
RW_feature_imp

Exception: Data must be 1-dimensional

In [8]:
# Computing Log Loss
RW_LL = log_loss(RW_y_test, RW_y_pred)
print('Log Loss: ' + str(RW_LL))

Log Loss: 1.2514223244


In [9]:
# Setting up a Random Forest Classifier followed by fitting and predicting on the data
RW_rf = RandomForestClassifier(n_estimators=1000, random_state=42)

RW_rf.fit(RW_X_train, RW_y_train)

RW_rf_pred = RW_rf.predict(RW_X_test)

RW_rf_score = metrics.accuracy_score(RW_y_test, RW_rf_pred)
print('Accuracy Score: ' + str(RW_rf_score))

Accuracy Score: 0.920289855072


In [10]:
# Printing out the Confusion Matrix and Classification Report for the Random Forest Classifier
RW_rf_CM = metrics.confusion_matrix(RW_y_test, RW_rf_pred)
RW_rf_CR = metrics.classification_report(RW_y_test, RW_rf_pred)

print(RW_rf_CM)
print(RW_rf_CR)

[[74  5]
 [ 6 53]]
             precision    recall  f1-score   support

      False       0.93      0.94      0.93        79
       True       0.91      0.90      0.91        59

avg / total       0.92      0.92      0.92       138



In [11]:
# Calculating Feature Importance from Random Forest Classifier
RW_rf_feature_imp = pd.Series(RW_rf.feature_importances_, index=RW_X.columns).sort_values(ascending=False)
RW_rf_feature_imp

Home Rushing Atts                      0.076156
Home Kickoffs                          0.072109
Road Rushing Atts                      0.060304
Home Total TDs                         0.041304
Road Kickoffs                          0.040220
Road Total TDs                         0.036758
Road Time of Possession                0.027263
Home Time of Possession                0.026704
Home Touchbacks                        0.026371
Road Passing Atts                      0.021767
Home Rushing Yds                       0.021335
Road Fourth Down Attempts              0.018913
Home Passing Atts                      0.018577
Road Rushing Yds                       0.017287
Home Fourth Down Attempts              0.015819
Home Total Yds                         0.014841
Road Red Zone Successes                0.012887
Home Kickoff Return Yds                0.012577
Road Red Zone Attempts                 0.012294
Road Sack Yds Given Up                 0.012130
Road Total Yds                         0

In [12]:
# Computing Log Loss for the Random Forest Classifier
RW_rf_LL = log_loss(RW_y_test, RW_rf_pred)
print('Log Loss: ' + str(RW_rf_LL))

Log Loss: 2.75311984298


In [13]:
# Setting X and y for Home Teams with X having just statistics, y being Home Win
HW_X = df.drop(['Season', 'Week Number', 'Date', 'Name of Winning Team', 'Name of Losing Team', 'Name of Tying Teams', 'Road Team Name', 'Home Team Name', 'Road Win', 'Road Loss', 'Road Tie', 'Road Wins', 'Road Losses', 'Road Ties', 'Home Win', 'Home Loss', 'Home Tie', 'Home Wins', 'Home Losses', 'Home Ties', 'Road Points', 'Home Points'], axis=1)
HW_y = df['Home Win']

# Splitting the data into Train and Test
HW_X_train, HW_X_test, HW_y_train, HW_y_test = train_test_split(HW_X, HW_y, test_size = 0.2, random_state=42, stratify=HW_y)

In [14]:
# Setting up parameter C
c_space = np.logspace(-5, 8, 15)
HW_param_grid = {'C': c_space}

# Setting up Logistic Regression Model
HW_logreg = LogisticRegression()

# Using GridSearchCV to help avoid overfitting
HW_CV = GridSearchCV(HW_logreg, HW_param_grid, cv=5)

In [15]:
# Fitting Training Data to model and finding best parameter as well as score
HW_CV.fit(HW_X_train, HW_y_train)
print('Best Parameter for Home Win Model: ' + str(HW_CV.best_params_))
print('Best Score for Home Win Model: ' + str(HW_CV.best_score_))

Best Parameter for Home Win Model: {'C': 0.43939705607607948}
Best Score for Home Win Model: 0.932481751825


In [16]:
# Setting up prediction based on testing data for X
HW_y_pred = HW_CV.predict(HW_X_test)

# Printing R Squared
R2_HW = HW_CV.score(HW_X_test, HW_y_test)
print('R squared: ' + str(R2_HW))

# Computing and printing Means Squared Error
HW_rmse = np.sqrt(mean_squared_error(HW_y_test, HW_y_pred))
print('Root Mean Squared Error: ' + str(HW_rmse))

R squared: 0.913043478261
Root Mean Squared Error: 0.29488391231


  output_errors = np.average((y_true - y_pred) ** 2, axis=0,


In [17]:
# Calculating the Accuracy Score as well as printing the Confusion Matrix and Classification Report
HW_score = metrics.accuracy_score(HW_y_test, HW_y_pred)

HW_CM = metrics.confusion_matrix(HW_y_test, HW_y_pred)

HW_CR = metrics.classification_report(HW_y_test, HW_y_pred)

print('Accuracy Score: ' + str(HW_score))
print(HW_CM)
print(HW_CR)

Accuracy Score: 0.913043478261
[[55  5]
 [ 7 71]]
             precision    recall  f1-score   support

      False       0.89      0.92      0.90        60
       True       0.93      0.91      0.92        78

avg / total       0.91      0.91      0.91       138



In [90]:
# Calculating Feature Importance
# Trying to figure out
HW_feature_imp = pd.Series(HW_CV.best_estimator_, index=HW_X.columns).sort_values(ascending=False)
HW_feature_imp

Road Two Point Conversion Successes    LogisticRegression(C=0.43939705607607948, clas...
Road Two Point Conversion Attempts     LogisticRegression(C=0.43939705607607948, clas...
Home Penalty Yards                     LogisticRegression(C=0.43939705607607948, clas...
Home Punt Return TDs                   LogisticRegression(C=0.43939705607607948, clas...
Home Punt Return Yds                   LogisticRegression(C=0.43939705607607948, clas...
Home Punt Returns                      LogisticRegression(C=0.43939705607607948, clas...
Home Punting Net Yards                 LogisticRegression(C=0.43939705607607948, clas...
Home Punts                             LogisticRegression(C=0.43939705607607948, clas...
Home Red Zone Attempts                 LogisticRegression(C=0.43939705607607948, clas...
Home Red Zone Successes                LogisticRegression(C=0.43939705607607948, clas...
Home Rushing Atts                      LogisticRegression(C=0.43939705607607948, clas...
Home Rushing First Do

In [18]:
# Computing Log Loss
HW_LL = log_loss(HW_y_test, HW_y_pred)
print('Log Loss: ' + str(HW_LL))

Log Loss: 3.00340083135


In [19]:
# Setting up a Random Forest Classifier followed by fitting and predicting on the data
HW_rf = RandomForestClassifier(n_estimators=1000, random_state=42)

HW_rf.fit(HW_X_train, HW_y_train)

HW_rf_pred = HW_rf.predict(HW_X_test)

HW_rf_score = metrics.accuracy_score(HW_y_test, HW_rf_pred)
print('Accuracy Score: ' + str(HW_rf_score))

Accuracy Score: 0.855072463768


In [20]:
# Printing out the Confusion Matrix and Classification Report for the Random Forest Classifier
HW_rf_CM = metrics.confusion_matrix(HW_y_test, HW_rf_pred)
HW_rf_CR = metrics.classification_report(HW_y_test, HW_rf_pred)

print(HW_rf_CM)
print(HW_rf_CR)

[[49 11]
 [ 9 69]]
             precision    recall  f1-score   support

      False       0.84      0.82      0.83        60
       True       0.86      0.88      0.87        78

avg / total       0.85      0.86      0.85       138



In [21]:
# Calculating Feature Importance from Random Forest Classifier
HW_rf_feature_imp = pd.Series(HW_rf.feature_importances_, index=HW_X.columns).sort_values(ascending=False)
HW_rf_feature_imp

Home Kickoffs                          0.073179
Home Rushing Atts                      0.072761
Road Rushing Atts                      0.066187
Home Total TDs                         0.042324
Road Kickoffs                          0.040742
Road Total TDs                         0.033353
Home Touchbacks                        0.028027
Home Time of Possession                0.027929
Road Time of Possession                0.024564
Home Passing Atts                      0.023526
Home Fourth Down Attempts              0.020352
Road Passing Atts                      0.020352
Road Fourth Down Attempts              0.016468
Road Rushing Yds                       0.016070
Home Rushing Yds                       0.015986
Road Sack Yds Given Up                 0.014141
Home Total Yds                         0.012402
Road Red Zone Attempts                 0.011925
Road Total Yds                         0.011892
Home INTs                              0.011747
Home Sack Yds Given Up                 0

In [22]:
# Computing Log Loss for the Random Forest Classifier
HW_rf_LL = log_loss(HW_y_test, HW_rf_pred)
print('Log Loss: ' + str(HW_rf_LL))

Log Loss: 5.00568350341
