In [1]:
# Importing necessary packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chisquare

In [2]:
# Loading the tab-separated NFL game data into a DataFrame and inspecting its first rows.
# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook must point NFL_CSV_PATH at their own copy of nfl_df.csv.
NFL_CSV_PATH = '/Users/dwreiter/Desktop/Work/Springboard/NFL Capstone Project/Data Wrangling/nfl_df.csv'
# read_csv already returns a new DataFrame, so no empty placeholder frame is created first.
# index_col=0 uses the first column of the file as the row index.
df = pd.read_csv(NFL_CSV_PATH, delimiter='\t', index_col=0)
df.head()

Unnamed: 0,Date,Home Field Goal Attempts,Home First Downs,Home Fourth Down Attempts,Home Fourth Down Successes,Home Fumble TDs,Home Fumbles,Home Fumbles Lost,Home Fumbles Recovered,Home Goal To Go Attempts,...,Road Time of Possession,Road Total TDs,Road Total Yds,Road Touchbacks,Road Two Point Conversion Attempts,Road Two Point Conversion Successes,Road Win,Road Wins,Season,Week Number
0,"on December 20, 2015",1,19,2,0,0,2,2,0,0,...,2248,5,506,3,0,0,True,1,nfl-2015-2016,15
1,"on November 29, 2015",2,17,2,1,0,2,1,0,0,...,2247,2,350,1,0,0,True,1,nfl-2015-2016,12
2,"on December 27, 2015",1,19,0,0,2,2,1,3,3,...,1784,1,265,0,1,1,False,0,nfl-2015-2016,16
3,"on November 22, 2015",2,21,0,0,0,2,0,1,1,...,1875,4,415,3,0,0,False,0,nfl-2015-2016,11
4,"on January 3, 2016",1,16,1,0,0,0,0,0,0,...,2197,4,382,7,0,0,True,1,nfl-2015-2016,17


**For each test below:
<br>
H<sub>0</sub>: the statistic has no impact on a team's chance of winning
<br>
H<sub>a</sub>: the statistic has an impact on a team's chance of winning
<br>
α = 0.05**

In [3]:
# t-test (unequal variances) comparing Road Rushing Atts between rows flagged
# 'Road Win' and rows flagged 'Road Loss'
rush_atts_in_wins = df.loc[df['Road Win'], 'Road Rushing Atts']
rush_atts_in_losses = df.loc[df['Road Loss'], 'Road Rushing Atts']
t_stat, p = stats.ttest_ind(rush_atts_in_wins, rush_atts_in_losses, equal_var=False)
print('t-statistic: %s' % (t_stat,))
print('p-value: %s' % (p,))

t-statistic: 17.2698667801
p-value: 5.89954574468e-55


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Road Rushing Attempts does have an impact on a Road Team's chance of winning.**

In [4]:
# t-test (unequal variances) comparing Road Time of Possession between rows
# flagged 'Road Win' and rows flagged 'Road Loss'
possession_in_wins = df.loc[df['Road Win'], 'Road Time of Possession']
possession_in_losses = df.loc[df['Road Loss'], 'Road Time of Possession']
t_stat, p = stats.ttest_ind(possession_in_wins, possession_in_losses, equal_var=False)
print('t-statistic: %s' % (t_stat,))
print('p-value: %s' % (p,))

t-statistic: 11.5380986752
p-value: 4.16229517181e-28


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Road Time of Possession does have an impact on a Road Team's chance of winning.**

In [5]:
# Chi-squared test of independence between Road Passing First Downs and Road Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more; a count variable with many distinct values
# may leave sparse cells -- inspect e_f to confirm.
RPFD_RW_CT = pd.crosstab(index=df['Road Passing First Downs'], columns=df['Road Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(RPFD_RW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 28.7972496537
p-value: 0.150740999025


**Because the p-value (≈ 0.15) is above α = 0.05, we fail to reject the null hypothesis: there is not enough evidence to say Road Passing First Downs has an impact on a Road Team's chance of winning.**

In [6]:
# Chi-squared test of independence between Road Passing Attempts and Home Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more; a count variable with many distinct values
# may leave sparse cells -- inspect e_f to confirm.
RPA_HW_CT = pd.crosstab(index=df['Road Passing Atts'], columns=df['Home Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(RPA_HW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 103.677721227
p-value: 2.44684267008e-06


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Road Passing Attempts does have an impact on a Home Team's chance of winning.**

In [7]:
# Chi-squared test of independence between Road Sacks Given Up and Home Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more -- inspect e_f to confirm for the rarer values.
RSGU_HW_CT = pd.crosstab(index=df['Road Sacks Given Up'], columns=df['Home Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(RSGU_HW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 57.3482266227
p-value: 4.33405906973e-09


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Road Sacks Given Up does have an impact on a Home Team's chance of winning.**

In [8]:
# Chi-squared test of independence between Home Interception Touchdowns and Home Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more -- inspect e_f to confirm for the rarer values.
HITD_HW_CT = pd.crosstab(index=df['Home INT TDs'], columns=df['Home Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(HITD_HW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 19.8662285319
p-value: 4.85403927745e-05


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Home INT TDs does have an impact on a Home Team's chance of winning.**

In [9]:
# Chi-squared test of independence between Home Goal To Go Successes and Home Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more -- inspect e_f to confirm for the rarer values.
HGTGS_HW_CT = pd.crosstab(index=df['Home Goal To Go Successes'], columns=df['Home Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(HGTGS_HW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 54.0936640619
p-value: 2.00495339211e-10


**Because the p-value is far below α = 0.05, we can reject the null hypothesis and say Home Goal To Go Successes does have an impact on a Home Team's chance of winning.**

In [10]:
# Chi-squared test of independence between Road Fourth Down Successes and Home Win
# NOTE(review): the chi-squared approximation is usually considered reliable only when
# expected cell counts are about 5 or more -- inspect e_f to confirm for the rarer values.
RFDS_HW_CT = pd.crosstab(index=df['Road Fourth Down Successes'], columns=df['Home Win'])
# Unpacks as (statistic, p-value, degrees of freedom, expected frequencies)
chi, p, d_f, e_f = stats.chi2_contingency(RFDS_HW_CT)
for label, value in (('chi-squared: ', chi), ('p-value: ', p)):
    print(label + str(value))

chi-squared: 21.6040833409
p-value: 0.000240264746781


**Because the p-value is well below α = 0.05, we can reject the null hypothesis and say Road Fourth Down Successes does have an impact on a Home Team's chance of winning.**

In [11]:
# Comparing t-tests (unequal variances) for Road Rushing Yards, Road Passing Yards and
# Road Total Yards, each split by the 'Road Win' and 'Road Loss' flags.
# The printed labels previously read "Roading ..." for two of the t-statistics; they now
# match the "Road ..." wording used by the p-value labels.
road_won = df['Road Win']
road_lost = df['Road Loss']

RRY_t_stat, RRY_p = stats.ttest_ind(df.loc[road_won, 'Road Rushing Yds'], df.loc[road_lost, 'Road Rushing Yds'], equal_var=False)
print('Road Rushing Yards t-statistic: ' + str(RRY_t_stat))
print('Road Rushing Yards p-value: ' + str(RRY_p))

RPY_t_stat, RPY_p = stats.ttest_ind(df.loc[road_won, 'Road Passing Yds'], df.loc[road_lost, 'Road Passing Yds'], equal_var=False)
print('Road Passing Yards t-statistic: ' + str(RPY_t_stat))
print('Road Passing Yards p-value: ' + str(RPY_p))

RTY_t_stat, RTY_p = stats.ttest_ind(df.loc[road_won, 'Road Total Yds'], df.loc[road_lost, 'Road Total Yds'], equal_var=False)
print('Road Total Yards t-statistic: ' + str(RTY_t_stat))
print('Road Total Yards p-value: ' + str(RTY_p))

Roading Rushing Yards t-statistic: 9.19537876987
Road Rushing Yards p-value: 6.56489564669e-19
Roading Passing Yards t-statistic: 1.2575033784
Road Passing Yards p-value: 0.209031049196
Road Total Yards t-statistic: 6.74881320859
Road Total Yards p-value: 3.34810154221e-11


**Because the p-values for Road Rushing Yards and Road Total Yards are far below α = 0.05, we can reject the null hypothesis and say that both have an impact on a Road Team's chance of winning. However, because the p-value for Road Passing Yards (≈ 0.21) is above α = 0.05, we fail to reject the null hypothesis: there is not enough evidence to say that it has an impact on a Road Team's chance of winning.**

In [12]:
# Comparing t-tests (unequal variances) for Home Rushing Yards, Home Passing Yards and
# Home Total Yards, each split by the 'Home Win' and 'Home Loss' flags
home_won = df['Home Win']
home_lost = df['Home Loss']

HRY_t_stat, HRY_p = stats.ttest_ind(df.loc[home_won, 'Home Rushing Yds'], df.loc[home_lost, 'Home Rushing Yds'], equal_var=False)
print('Home Rushing Yards t-statistic: %s' % (HRY_t_stat,))
print('Home Rushing Yards p-value: %s' % (HRY_p,))

HPY_t_stat, HPY_p = stats.ttest_ind(df.loc[home_won, 'Home Passing Yds'], df.loc[home_lost, 'Home Passing Yds'], equal_var=False)
print('Home Passing Yards t-statistic: %s' % (HPY_t_stat,))
print('Home Passing Yards p-value: %s' % (HPY_p,))

HTY_t_stat, HTY_p = stats.ttest_ind(df.loc[home_won, 'Home Total Yds'], df.loc[home_lost, 'Home Total Yds'], equal_var=False)
print('Home Total Yards t-statistic: %s' % (HTY_t_stat,))
print('Home Total Yards p-value: %s' % (HTY_p,))

Home Rushing Yards t-statistic: 10.3404950095
Home Rushing Yards p-value: 2.21865803101e-23
Home Passing Yards t-statistic: 1.13272589041
Home Passing Yards p-value: 0.257748779813
Home Total Yards t-statistic: 7.6370938345
Home Total Yards p-value: 8.12502710132e-14


**Because the p-values for Home Rushing Yards and Home Total Yards are far below α = 0.05, we can reject the null hypothesis and say that both have an impact on a Home Team's chance of winning. However, because the p-value for Home Passing Yards (≈ 0.26) is above α = 0.05, we fail to reject the null hypothesis: there is not enough evidence to say that it has an impact on a Home Team's chance of winning.**

In [13]:
# Pearson correlation (coefficient and p-value) between Road Total Yards and Home Total Yards
road_total_yds = df['Road Total Yds']
home_total_yds = df['Home Total Yds']
r, p = stats.pearsonr(road_total_yds, home_total_yds)
print('r: %s' % (r,))
print('p-value: %s' % (p,))

r: 0.0332237989546
p-value: 0.384936055995


**With a correlation coefficient of about 0.03, the sample correlation between the two statistics is positive but negligible. Because the p-value (≈ 0.38) is well above α = 0.05, the relationship between Road Total Yards and Home Total Yards is not statistically significant.**

In [14]:
# Comparing t-tests (unequal variances) for Road Points split by 'Road Win'/'Road Loss'
# and Home Points split by 'Home Win'/'Home Loss'
road_pts_in_wins = df.loc[df['Road Win'], 'Road Points']
road_pts_in_losses = df.loc[df['Road Loss'], 'Road Points']
RP_t_stat, RP_p = stats.ttest_ind(road_pts_in_wins, road_pts_in_losses, equal_var=False)
print('Road Points t-statistic: %s' % (RP_t_stat,))
print('Road Points p-value: %s' % (RP_p,))

home_pts_in_wins = df.loc[df['Home Win'], 'Home Points']
home_pts_in_losses = df.loc[df['Home Loss'], 'Home Points']
HP_t_stat, HP_p = stats.ttest_ind(home_pts_in_wins, home_pts_in_losses, equal_var=False)
print('Home Points t-statistic: %s' % (HP_t_stat,))
print('Home Points p-value: %s' % (HP_p,))

Road Points t-statistic: 17.3265225416
Road Points p-value: 2.23599858203e-55
Home Points t-statistic: 18.6748510633
Home Points p-value: 8.08943267606e-63


**Because the p-value for Road Points is far below α = 0.05, we can reject the null hypothesis and say it does have an impact on a Road Team's chance of winning. Likewise, because the p-value for Home Points is far below α = 0.05, we can reject the null hypothesis and say it does have an impact on a Home Team's chance of winning.**

In [15]:
# Pearson correlation (coefficient and p-value) between Road Points and Home Points
road_points = df['Road Points']
home_points = df['Home Points']
r, p = stats.pearsonr(road_points, home_points)
print('r: %s' % (r,))
print('p-value: %s' % (p,))

r: -0.0231055999376
p-value: 0.545744131302


**With a correlation coefficient of about -0.02, the sample correlation between the two statistics is negative but negligible. Because the p-value (≈ 0.55) is well above α = 0.05, the relationship between Road Points and Home Points is not statistically significant.**