In [1]:
#Author Hussain Abbas, MSc © 2017
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn import feature_selection

# Fetch the Derby results CSV from GitHub; the file is decoded as latin1.
DERBY_CSV_URL = 'https://raw.githubusercontent.com/p-vs-np/Derby/master/derby.csv'
data = pd.read_csv(DERBY_CSV_URL, encoding='latin1')

# Replace the source headers with short snake_case names, one per column in file order.
data.columns = [
    'year', 'winner', 'sire', 'dam',
    'second_place', 'third_place',
    'time', 'dist', 'track_cond',
    'weather_high', 'weather_low',
    '24_hour_precip', '1pm_7pm_precip',
    'weather',
]

# Preview the first rows of the renamed frame.
data.head()

Unnamed: 0,year,winner,sire,dam,second_place,third_place,time,dist,track_cond,weather_high,weather_low,24_hour_precip,1pm_7pm_precip,weather
0,2017,Always Dreaming,Bodemeister,Above Perfection,Lookin At Lee,Battle of Midway,02:03.6,10 F,Sloppy,62,42,"0.12""","0.02""",Showers
1,2016,Nyquist,Uncle Mo,Seeking Gabrielle,Exaggerator,Gun Runner,02:01.3,10 F,Fast,85,59,"0.28""","0.04""",Mostly Cloudy
2,2015,American Pharoah,Pioneerof The Nile,Littleprincessemma,Firing Line,Dortmund,02:03.0,10 F,Fast,79,52,0,0,Partly Cloudy
3,2014,California Chrome,Lucky Pulpit,Love The Chase,Commanding Curve,Danza,02:03.7,10 F,Fast,73,45,0,0,Mostly Sunny
4,2013,Orb,Malibu Moon,Lady Liberty,Golden Soul,Revolutionary,02:02.9,10 F,Sloppy,64,53,"0.87""","0.44""",Rain


In [2]:
# Show the container type followed by the dtype pandas inferred for each column.
print(type(data), data.dtypes, sep='\n')

<class 'pandas.core.frame.DataFrame'>
year               int64
winner            object
sire              object
dam               object
second_place      object
third_place       object
time              object
dist              object
track_cond        object
weather_high      object
weather_low       object
24_hour_precip    object
1pm_7pm_precip    object
weather           object
dtype: object


In [3]:
# Normalise the raw string columns of `data` (columns are overwritten in place;
# only `new_time` is added).

# Winning time: "MM:SS.s" strings become fractional minutes in `new_time`.
time_parts = data['time'].str.split(":")
data['new_time'] = pd.to_numeric(time_parts.str[0]) + pd.to_numeric(time_parts.str[1])/60

# Distance: drop the trailing "F" marker and surrounding whitespace, then parse as a number.
dist_text = data['dist'].str.replace('F','').str.strip()
data['dist'] = pd.to_numeric(dist_text)

# Precipitation: strip the '"' mark and count 'Trace' readings as zero before parsing.
for precip_col in ('24_hour_precip', '1pm_7pm_precip'):
    precip_text = data[precip_col].str.replace('"','').str.replace('Trace', '0')
    data[precip_col] = pd.to_numeric(precip_text)

# Temperatures: remove non-breaking spaces (\xa0) that would block numeric parsing.
for temp_col in ('weather_high', 'weather_low'):
    data[temp_col] = pd.to_numeric(data[temp_col].str.replace('\xa0',''))

# Categorical text: lower-case so the same label always maps to the same category.
data['track_cond'] = data['track_cond'].str.lower()
data['weather'] = data['weather'].str.lower().str.replace('\xa0','')

In [4]:
# Expand each categorical column into 0/1 indicator columns and append them to `data`.
# drop_first=True omits the first level of each category, which then acts as the
# reference level for the remaining indicators.
for cat_col in ('weather', 'track_cond'):
    indicators = pd.get_dummies(data[cat_col], prefix = cat_col, drop_first = True)
    data = data.join(indicators)

In [5]:
# Hold-out experiment: fit ordinary least squares on every race except row 0
# (the most recent race in the table), then forecast that race's winning time.
lm = linear_model.LinearRegression()

train = data[1:]
# Double brackets keep the held-out row as a one-row DataFrame. A bare
# `data.iloc[0]` is a 1-D Series, and scikit-learn's predict() expects a 2-D
# (n_samples, n_features) input -- recent releases raise ValueError on 1-D data.
test = data.iloc[[0]]

# Columns that must not be used as features: identifiers, the raw and derived
# target, and the categorical columns already expanded into dummy variables.
non_feature_cols = ['year', 'winner', 'sire', 'dam', 'second_place', 'third_place', 'time',
                    'new_time', 'track_cond', 'weather']

x_train = train.drop(non_feature_cols, axis = 1)
x_test = test.drop(non_feature_cols, axis = 1)

lm_mod = lm.fit(x_train,train['new_time']) #train the linear regression model

print(data['new_time'][0]) # actual in minutes
print(lm.predict(x_test)) # predicted in minutes

2.06
[ 2.05293885]




In [6]:
# Forecast errors for the held-out race (row 0), converted from minutes to seconds.
actual_minutes = data['new_time'][0]

# linear regression OOS forecast is 0.42 seconds lower than actual
print((actual_minutes - lm.predict(x_test))*60)

# naive mean prediction (mean of the in-sample fitted values) is 5.9 seconds higher than actual
print((actual_minutes - lm.predict(x_train).mean())*60)

[ 0.42366898]
-5.85492957746




In [7]:
# Score every feature against the winning time and tabulate it next to the
# coefficient from the fitted linear model.
# NOTE(review): per the scikit-learn docs, f_regression runs a univariate test
# for each feature on its own, so these p-values are not the p-values of the
# jointly fitted coefficients displayed beside them -- read the table with that in mind.
lm_mod_test = feature_selection.f_regression(x_train, train['new_time'])
p_values = lm_mod_test[1]  # f_regression returns (F statistics, p-values)

SIGNIFICANCE_LEVEL = 0.05  # threshold used for the 'Significant' flag

result = pd.DataFrame({'Variable': list(x_train),
             'Coefficient': lm_mod.coef_,
             'P-value': p_values,
             'Significant': p_values <= SIGNIFICANCE_LEVEL})

# Narrative summary; wording fixed ("higher lower temperatures", "baseline baseline").
print("""Surprisingly, weather has no impact on win time. The most important variable is the distance traveled. 
The coefficient on the weather_low of the day is significant and positive, thereby indicating that higher 
daily low temperatures are associated with slower win times. Relative to a baseline of "dusty", track 
conditions of "fast" and "heavy" are associated with slower win times. """)

# Explicit column selection fixes the display order of the table.
result[['Variable', 'Coefficient', 'P-value', 'Significant']]

Surprisingly, weather has no impact on win time. The most important variable is the distance traveled. 
The coefficient on the weather_low of the day is significant and positive, thereby indicating that higher 
lower temperatures are associated with slower win times. Relative to a baseline baseline of "dusty", track 
conditions of "fast" and "heavy" are associated with slower win times. 


Unnamed: 0,Variable,Coefficient,P-value,Significant
0,dist,0.285934,2.9311159999999997e-87,True
1,weather_high,-5.5e-05,0.7718419,False
2,weather_low,0.001038,0.02257352,True
3,24_hour_precip,0.010049,0.2912793,False
4,1pm_7pm_precip,0.013768,0.3716788,False
5,weather_cloudy,-0.001097,0.1795275,False
6,weather_mostly cloudy,-0.041561,0.2571694,False
7,weather_mostly sunny,-0.038415,0.1762946,False
8,weather_partly cloudy,-0.02005,0.7113638,False
9,weather_partly sunny,-0.048909,0.2242739,False
