In [87]:
!pip install category_encoders



In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chromium.webdriver import ChromiumDriver
import pandas as pd
import json, time, pickle
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [1]:
"""
Pick the version of chrome that is supported
for 123-125:
https://googlechromelabs.github.io/chrome-for-testing/
for 122:
https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.57/win64/chromedriver-win64.zip
"""

URL = 'https://racing.turfclub.com.sg/en/horse-performance/'
driver = webdriver.Chrome() #choose chrome (you can choose anything)
wait = WebDriverWait(driver, 10)  # Using explicit wait with a timeout of 10 seconds
def get_links(driver:ChromiumDriver = driver, wait:WebDriverWait=wait, *, json_file:bool = False, save_df:bool = False, verbose:bool=False) -> pd.DataFrame:
    driver.get(URL)
    horses = {}
    while True: #page traversal
        time.sleep(3) #literally wait for the browser to load
        all_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.odd td.sorting_1 a, .even td.sorting_1 a')) )#selenium gets confused when elements are finished loading due to the way singapore turf loads their content
        for link in all_links:
            try:
                horse_name = link.text.strip()
                horse_link = link.get_attribute('href')
                if horse_name:
                    horses[horse_name] = horse_link
            except StaleElementReferenceException as e:
                continue
        all_links.clear()
        if len(driver.find_elements(By.CLASS_NAME, 'paginate_button.next.disabled')) > 0: break #check if last page
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#DataTables_Table_0_next'))).click()
    driver.delete_all_cookies()
    driver.quit() #close the driver gracefully

    if verbose:
        for horse_name, horse_links in horses.items():
            print(horse_name, horse_links)

    # Write data to JSON file
    if json_file:
        with open('horse_data.json', 'w') as json_f:
            json.dump(horses, json_f, indent=4)

    #create dataframe and save the object with pickle
    horses_df = pd.DataFrame({'Horse Name': list(horses.keys()), 'Horse Links': horses.values()})
    if save_df:
        with open('horse_data.pickle', 'wb') as f:
            pickle.dump(horses_df, f) #store the pickle file for future use

    return horses_df

if __name__ == '__main__':
    horses_df = get_links(save_df=True, json_file=True, verbose=True)

A BETTER TOMORROW https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=47274
ACE NINE https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=47119
ACE OF DIAMONDS https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=47148
ACE SOVEREIGN https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=46339
ACE'S WILD https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=45296
ACROBAT https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=45436
ADIRA PEGASUS https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=46553
AFTERMATH https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=46648
AGUERO JR https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=46700
AHONE https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=47061
AHORSEWITHNONAME https://racing.turfclub.com.sg/en/horse-performance/horse-profile/?id=47015
AHTIMS KLIS https://racing.turf

In [5]:
# Read the JSON file
with open('horse_data.json', 'r') as file:
    horse_data:dict = json.load(file)


In [6]:
from selenium.webdriver.chrome.options import Options

# Configure Chrome options for headless mode
options = Options()
options.headless = True

# Set up Selenium WebDriver with Chrome
driver = webdriver.Chrome(options=options)  
wait = WebDriverWait(driver, 10)  # Using explicit wait with a timeout of 10 seconds
df_list=[]

# Define the base URL and the number of pages
for name,url in horse_data.items():
    driver.get(url)
    
    # Wait until the table is present on the page
    while True: #page traversal
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'table'))) #dont really need it
        time.sleep(4) #literally wait for the browser to load
        all_rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.odd, .even')) )#selenium gets confused when elements are finished loading due to the way singapore turf loads their content
        for row in all_rows:
            try:
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 18:  # Adjust the number as per your requirement
                    continue  # Skip processing this row
                row_data = {
                    'HorseName': name,
                    'Barrier': cells[8].text.strip(),
                    'CarriedWeight': cells[9].text.strip(),
                    'Distance': cells[5].text.strip(),
                    'Rating': cells[4].text.strip(),
                    'HorseWeight': cells[10].text.strip(),
                    'Going': cells[7].text.strip(),
                    'Track': cells[6].text.strip(),
                    'Jockey': cells[16].text.strip(),
                    'Trainer': cells[17].text.strip(),
                    'LBW': cells[12].text.strip(),
                }
                with open(f'horse_profiles.json', 'a') as f:
                    json.dump(row_data, f)
                    f.write('\n')
            except StaleElementReferenceException as e:
                continue
        all_rows.clear()
        if len(driver.find_elements(By.CLASS_NAME, 'paginate_button.next.disabled')) > 0: break #check if last page
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#DataTables_Table_0_next'))).click()
# Concatenate all DataFrames in the list
df = pd.concat(df_list, ignore_index=True)
with open('horse_data.pickle', 'wb') as f:
    pickle.dump(horses_df, f) #store the pickle file for future use
# Close the WebDriver
driver.delete_all_cookies()
driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=123.0.6312.122)
Stacktrace:
	GetHandleVerifier [0x00007FF775067032+63090]
	(No symbol) [0x00007FF774FD2C82]
	(No symbol) [0x00007FF774E6EC65]
	(No symbol) [0x00007FF774E4CA7C]
	(No symbol) [0x00007FF774EDD687]
	(No symbol) [0x00007FF774EF2AC1]
	(No symbol) [0x00007FF774ED6D83]
	(No symbol) [0x00007FF774EA83A8]
	(No symbol) [0x00007FF774EA9441]
	GetHandleVerifier [0x00007FF7754625AD+4238317]
	GetHandleVerifier [0x00007FF77549F70D+4488525]
	GetHandleVerifier [0x00007FF7754979EF+4456495]
	GetHandleVerifier [0x00007FF775140576+953270]
	(No symbol) [0x00007FF774FDE54F]
	(No symbol) [0x00007FF774FD9224]
	(No symbol) [0x00007FF774FD935B]
	(No symbol) [0x00007FF774FC9B94]
	BaseThreadInitThunk [0x00007FF9FABC257D+29]
	RtlUserThreadStart [0x00007FF9FBA6AA48+40]


After some trial and error, we settled upon using Length Behind Winner (LBW) as our response. We predict that the winning horse will be the horse with the lowest LBW. We originally wanted to use Finish Time instead, but found that the the data is commonly missing on Singapore Turf Club website.

However, LBW is almost always filled. 

Next, we will filter out rows with missing data. We do not need them as we are in an excess of data points (more than 40,000), and furthermore, the vast majority of rows do not have missing data. Usually, it is only the Horse Weight that might be missing. An eyeball test tells us that only about 1 in 20 rows will have a missing Horse Weight column, and as such as decided to simply remove these rows.


# Start of data cleaning

### Finding missing rows

In [91]:
# Load JSON data from file
with open('horse_profiles.json', 'r') as f:
    json_data = json.load(f)

# Create DataFrame from JSON data
df = pd.DataFrame(json_data)

filterCols = []
# Iterate over columns and count empty strings in each column
for column in df.columns:
    empty_string_count = (df[column] == "").sum()
    print("Number of empty strings in", column, ":", empty_string_count)
    if(empty_string_count>0):
        filterCols.append(column)
        
print("\nColumns to filter",filterCols)

Number of empty strings in HorseName : 0
Number of empty strings in Barrier : 0
Number of empty strings in CarriedWeight : 5636
Number of empty strings in Distance : 0
Number of empty strings in Placing : 0
Number of empty strings in Rating : 3357
Number of empty strings in HorseWeight : 7762
Number of empty strings in Going : 3
Number of empty strings in Track : 0
Number of empty strings in Jockey : 36
Number of empty strings in Trainer : 2
Number of empty strings in LBW : 54
Number of empty strings in Finish Time : 16222

Columns to filter ['CarriedWeight', 'Rating', 'HorseWeight', 'Going', 'Jockey', 'Trainer', 'LBW', 'Finish Time']


### Filtering out missing rows

In [92]:
#filter out rows of missing data from json
# Create a boolean mask to identify rows with empty strings in multiple columns

# Initialize mask with all True values
mask = pd.Series(True, index=df.index)

# Update mask for each column in filterCols
for col in filterCols:
    mask = mask & (df[col] != "")

# Filter out rows with empty strings from the DataFrame
filtered_df = df[mask]

for column in filtered_df.columns:
    empty_string_count = (filtered_df[column] == "").sum()
    print("Number of empty strings in", column, ":", empty_string_count)
    if(empty_string_count>0):
        filterCols.append(column)
        
filtered_df

Number of empty strings in HorseName : 0
Number of empty strings in Barrier : 0
Number of empty strings in CarriedWeight : 0
Number of empty strings in Distance : 0
Number of empty strings in Placing : 0
Number of empty strings in Rating : 0
Number of empty strings in HorseWeight : 0
Number of empty strings in Going : 0
Number of empty strings in Track : 0
Number of empty strings in Jockey : 0
Number of empty strings in Trainer : 0
Number of empty strings in LBW : 0
Number of empty strings in Finish Time : 0


Unnamed: 0,HorseName,Barrier,CarriedWeight,Distance,Placing,Rating,HorseWeight,Going,Track,Jockey,Trainer,LBW,Finish Time
0,A BETTER TOMORROW,4,58.0,1700,2/12,46,476,G,P,R CURATOLO,J PETERS,4.3,1:47.05
1,A BETTER TOMORROW,11,58.0,1600,1/11,42,483,G,T,R CURATOLO,J PETERS,2.0,1:36.23
2,A BETTER TOMORROW,8,55.0,1400,4/10,43,483,G,T,APP S JAMIL,J PETERS,2.3,1:23.08
3,A BETTER TOMORROW,8,55.0,1400,3/12,43,475,G,T,APP S JAMIL,J PETERS,2.0,1:24.39
4,A BETTER TOMORROW,4,57.5,1600,9/10,45,477,G,P,R WOODWORTH,J PETERS,7.6,1:41.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41123,ZYGARDE,5,59.0,1700,5/12,48,520,G,P,M LERNER,KS TAN,5.9,1:46.56
41124,ZYGARDE,2,58.0,1600,5/12,48,526,G,P,S NOH,KS TAN,7.5,1:39.67
41125,ZYGARDE,10,58.5,1200,1/12,43,529,G,P,M LERNER,KS TAN,0.5,1:12.46
41126,ZYGARDE,4,57.5,1200,7/12,45,525,G,P,M LERNER,KS TAN,8.8,1:12.94


## Dealing with numeric cols

### Converting finish times to seconds

In [93]:
# Function to convert mm:ss format to total seconds
def mmss_to_seconds(time_str):
    if isinstance(time_str, str) and ':' in time_str:  # Check if the value is a string and contains the ':' delimiter
        minutes, seconds = map(float, time_str.split(':'))
        return minutes * 60.0 + seconds
    else:
        # Handle cases where the value is not in the expected format
        return None  # Or any other appropriate action

# Apply the function to the FinishingTime column
filtered_df['FinishTime_Seconds'] = filtered_df['Finish Time'].apply(mmss_to_seconds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['FinishTime_Seconds'] = filtered_df['Finish Time'].apply(mmss_to_seconds)


In [94]:
none_count = filtered_df['FinishTime_Seconds'].value_counts().get(None, 0)
print("Number of None occurrences in the column:", none_count)
print("Therefore all are converted successfully")

Number of None occurrences in the column: 0
Therefore all are converted successfully


### Converting Placing to numeric

In [103]:
# Function to convert mm:ss format to total seconds
def placing(place_str):
    if isinstance(place_str, str) and '/' in place_str:  # Check if the value is a string and contains the ':' delimiter
        place,total = map(int, place_str.split('/'))
        return place
    else:
        # Handle cases where the value is not in the expected format
        return None  # Or any other appropriate action

# Apply the function to the FinishingTime column
filtered_df['Placing_Numeric'] = filtered_df['Placing'].apply(placing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Placing_Numeric'] = filtered_df['Placing'].apply(placing)


Now, we will need to convert the categorical data into numerical data so that they can be used in our Multiple Linear Regression Model. This is done through Target Encoding for Jockey, Trainer, HorseName. Label Encoding for Going and Track.

### Converting all numeric predictors to float datatype (since they were 'object' datatype from scraping)

In [97]:
numericCols = ['CarriedWeight','Distance','Rating', 'HorseWeight','LBW']

for col in numericCols:
    filtered_df[col] = filtered_df[col].astype(float)

# Confirm the conversion
print(filtered_df['CarriedWeight'].dtype)

float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = filtered_df[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = filtered_df[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = filtered_df[col].astype(float)
A value is trying to be set on a copy of a slice from a

### Label Encoding for Track and Going

In [99]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Going' column to encode categorical labels into numerical labels
filtered_df['encoded_Going'] = label_encoder.fit_transform(filtered_df['Going'])
filtered_df['encoded_Going'] += 1
# Display the mapping between original categorical values and encoded numerical labels
print("'Going' Mapping:")
for original_value, encoded_value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{original_value}: {encoded_value+1}")

print("'Track' Mapping:")
filtered_df['encoded_Track'] = label_encoder.fit_transform(filtered_df['Track'])
filtered_df['encoded_Track'] += 1

# Display the mapping between original categorical values and encoded numerical labels
print("Mapping between original categorical values and encoded numerical labels:")
for original_value, encoded_value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{original_value}: {encoded_value+1}")

'Going' Mapping:
F: 1
G: 2
Y: 3
'Track' Mapping:
Mapping between original categorical values and encoded numerical labels:
P: 1
T: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['encoded_Going'] = label_encoder.fit_transform(filtered_df['Going'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['encoded_Going'] += 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['encoded_Track'] = label_encoder.fit_transform(filtered_df['Track'])
A value i

### Target Encoding for Jockey, Trainer, HorseName

In [100]:
# Initialize the TargetEncoder
target_encoder = ce.TargetEncoder(cols=['Jockey'])

# Fit and transform the DataFrame to perform target encoding
filtered_df['encoded_Jockey'] = target_encoder.fit_transform(filtered_df['Jockey'], filtered_df['LBW'])

# Initialize the TargetEncoder
target_encoder = ce.TargetEncoder(cols=['Trainer'])

# Fit and transform the DataFrame to perform target encoding
filtered_df['encoded_Trainer'] = target_encoder.fit_transform(filtered_df['Trainer'], filtered_df['LBW'])

# Initialize the TargetEncoder
target_encoder = ce.TargetEncoder(cols=['HorseName'])

# Fit and transform the DataFrame to perform target encoding
filtered_df['encoded_HorseName'] = target_encoder.fit_transform(filtered_df['HorseName'], filtered_df['LBW'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['encoded_Jockey'] = target_encoder.fit_transform(filtered_df['Jockey'], filtered_df['LBW'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['encoded_Trainer'] = target_encoder.fit_transform(filtered_df['Trainer'], filtered_df['LBW'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [104]:
# Display the encoded DataFrame
filtered_df

Unnamed: 0,HorseName,Barrier,CarriedWeight,Distance,Placing,Rating,HorseWeight,Going,Track,Jockey,Trainer,LBW,Finish Time,FinishTime_Seconds,encoded_Going,encoded_Track,encoded_Jockey,encoded_Trainer,encoded_HorseName,Placing_Numeric
0,A BETTER TOMORROW,4,58.0,1700.0,2/12,46.0,476.0,G,P,R CURATOLO,J PETERS,4.3,1:47.05,107.05,2,1,3.592162,4.703901,4.959534,2
1,A BETTER TOMORROW,11,58.0,1600.0,1/11,42.0,483.0,G,T,R CURATOLO,J PETERS,2.0,1:36.23,96.23,2,2,3.592162,4.703901,4.959534,1
2,A BETTER TOMORROW,8,55.0,1400.0,4/10,43.0,483.0,G,T,APP S JAMIL,J PETERS,2.3,1:23.08,83.08,2,2,4.172277,4.703901,4.959534,4
3,A BETTER TOMORROW,8,55.0,1400.0,3/12,43.0,475.0,G,T,APP S JAMIL,J PETERS,2.0,1:24.39,84.39,2,2,4.172277,4.703901,4.959534,3
4,A BETTER TOMORROW,4,57.5,1600.0,9/10,45.0,477.0,G,P,R WOODWORTH,J PETERS,7.6,1:41.14,101.14,2,1,4.779805,4.703901,4.959534,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41123,ZYGARDE,5,59.0,1700.0,5/12,48.0,520.0,G,P,M LERNER,KS TAN,5.9,1:46.56,106.56,2,1,4.470060,5.552150,4.317842,5
41124,ZYGARDE,2,58.0,1600.0,5/12,48.0,526.0,G,P,S NOH,KS TAN,7.5,1:39.67,99.67,2,1,4.101913,5.552150,4.317842,5
41125,ZYGARDE,10,58.5,1200.0,1/12,43.0,529.0,G,P,M LERNER,KS TAN,0.5,1:12.46,72.46,2,1,4.470060,5.552150,4.317842,1
41126,ZYGARDE,4,57.5,1200.0,7/12,45.0,525.0,G,P,M LERNER,KS TAN,8.8,1:12.94,72.94,2,1,4.470060,5.552150,4.317842,7


## End of data cleaning

We can now train our MLR Model using the processed data.

In [None]:
from sklearn import linear_model
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

#predictors
HorseName Barrier CarriedWeightDistance Rating HorseWeight Going Track Jockey Trainer LBW	encoded_Going	encoded_Track	encoded_Jockey	encoded_Trainer	encoded_HorseName
predictors = ['encoded_Going']
X = filtered_df[predictors]

#response
y = filtered_df[['LBW']] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

print("x_train shape :", X_train.shape)
print("y_train shape :", y_train.shape)
print("x_test shape :", X_test.shape)
print("y_test shape :", y_test.shape)
# y.describe()

linreg = linear_model.LinearRegression()
linreg.fit(X_train, y_train)

#include mean squared error etc here
# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Total values corresponding to HP
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'w-', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'w-', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

We can see that our model has a pretty high Mean Squared Error. This is to be expected as there is the inherent randomness of possible outcomes when it comes to horse racing.

Now that we have our model, we can now make a prediction based on the model.

In [None]:
#predicting horse race on XYZ date

def predictTime(bar, cWeight, hWeight, dist, rating, going, track, jockey, trainer):
    predictedTime = regr.predict([[bar], [cWeight], [hWeight], [dist], [rating], [going], [track], [jockey], [trainer]])
    return predictedTime

In [None]:
filtered_df[['Distance']].describe()