In [62]:
import os
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import seaborn as sns
from matplotlib import pyplot as plt

### Enter venue and date to predict here:

In [63]:
points_df = pd.read_csv('Points.csv', sep=';')
points_dict = dict(zip(points_df.Rank, points_df.Points))

#ENTER VENUE AND DATE HERE
    
ven_pred = 'Lousa'
date_pred = '20200322'

#ENTER RIDERS NOT PARTICIPATING IN RACE HERE
np_riders = ['Martin MAES']


### Create dataframes of riders whos results should be predicted

Some riders are not participating in all races, this could be because of injuries, or they are riders that won't participate in all world cups for various reasons (for example Martin Maes is an enduro rider who only sometimes (rarely) participate in downhill events, therefore he's removed from the participating riders). The create_prediction_df creates dataframes containing only the riders that should be included in the prediction.

In [64]:
def create_prediction_df():

    
    comp_df = pd.read_csv('riderprices2020.csv', sep=';')
    comp_df_m = comp_df[(comp_df.Gender == 'M') & (comp_df.Injured == 'No') & (~comp_df.Name.isin(np_riders))].copy()
    comp_df_w = comp_df[(comp_df.Gender == 'W') & (comp_df.Injured == 'No')].copy()

    comp_dfs = [comp_df_m, comp_df_w]  
    
    for df in comp_dfs:
        
        df.drop(['Price', 'Injured', 'Gender'], axis=1, inplace=True)
        df['Venue'] = ven_pred
        df['Date'] = pd.to_datetime(date_pred, format='%Y%m%d')
        df['Year'] = df.Date.dt.year

    return (comp_df_m, comp_df_w)

### Create dataframes

The function create_results_from_UCI_df creates a dataframe from excel-files containing results downloaded from the UCI (Union Cycliste Internationale) Website 

In [65]:
def create_results_from_UCI_df(path):

    files = os.listdir(path)
    files_xlsx = [f for f in files if f[-4:] == 'xlsx']
    filepaths = [path + '/' + file for file in files_xlsx]

    udf = pd.DataFrame()
    for f in filepaths:
        data = pd.read_excel(f)
        stripped_f = f.rstrip('.xlsx').replace('Results_UCI_M/', '').replace('Results_UCI_W/', '')
        split_f = stripped_f.split('_')
        data['Venue'] = split_f[0]
        data['Category'] = split_f[1]
        data['Date'] = split_f[2]
        data['Name'] = data['First Name'] + ' ' + data['Last Name']
        data['Date'] = pd.to_datetime(data['Date'], format='%Y%m%d')
        data.drop(['First Name', 'Last Name', 'Phase', 'Heat', 'IRM', 'Team', 'Gender', 'Result', 'Country', 'Category', 'BIB'], axis=1, inplace=True)
        udf = udf.append(data, sort=False)
        udf.dropna(subset=['Rank'], inplace=True)
        udf.Rank = udf.Rank.astype(int)
        udf = udf[(udf.Rank < 81)]
        udf.sort_values(by=['Date', 'Rank'], inplace=True)
 
        #Create points feature
        udf['Points'] = udf.Rank.map(points_dict)
        udf.Points.fillna(0, inplace=True)
    
        #Create year feature
        udf['Year'] = udf.Date.dt.year
    
    return udf


The function create_results_from_scraped_web_data creates a dataframe from the results scraped from a separate downhill website (see the separate Scraped_web_data_ETL.ipynb for more info)

In [66]:
def create_results_from_scraped_web_data():
    scraped_df = pd.read_excel('race_dfs_output.xlsx')
    
    #change name errors
    name_dict = {'Sam HILL' : 'Samuel HILL', 'Mick HANNAH' : 'Michael HANNAH'}
    venue_dict = {'Fort-William' : 'Fortwilliam', 'Les-Gets' : 'Lesgets', 'Les-Deux-Alpes' : 'Lesdeuxalpes'}
    scraped_df.Name.replace(name_dict, inplace=True)
    scraped_df.Venue.replace(venue_dict, inplace=True)
    scraped_df['Date'] = pd.to_datetime(scraped_df['Date'], format='%Y%m%d')
    
    #Create points feature
    scraped_df['Points'] = scraped_df.Rank.map(points_dict)
    scraped_df.Points.fillna(0, inplace=True)
    scraped_df.drop(['Unnamed: 0'], axis=1, inplace=True)
    scraped_df = scraped_df[(scraped_df.Rank < 81)]


    scraped_df.sort_values(by=['Date', 'Rank'], inplace=True)


    
    return scraped_df
    

### Create features
The create_features and findpreviouswin functions creates 5 features based on earlier results the rider has had. These features are chosen because of the assumption that prior results from the rider will be an indicator of future results. Especially more recent results.



In [67]:
def create_features(cfdf):
    #Create Moving Average over last 3 races
    cfdf['MA3 Pos'] = cfdf.groupby('Name')['Rank'].transform(lambda x: x.rolling(3, 1).mean().shift())

    #Create last race position feature
    cfdf['Last RP'] = cfdf.groupby('Name')['Rank'].transform(lambda x: x.rolling(1, 1).mean().shift())

    #Best position in the last 5 races
    cfdf['Best pos'] = cfdf.groupby('Name')['Rank'].transform(lambda x: x.rolling(5, 1).min().shift())

    #Average position current season
    cfdf['AP this year'] = cfdf.groupby(['Name', 'Year'])['Rank'].transform(lambda x: x.rolling(10, 1).mean().shift())

    #Number of races rider has participated in current season
    #cfdf['Races CS'] = cfdf.groupby(['Name', 'Year'])['Rank'].transform(lambda x: x.rolling(10, 1).count().shift())
    #cfdf['Races CS'].fillna(0, inplace=True)
    
    return cfdf


In [68]:
def findpreviouswin(compdate, compyear, name, df):  
    # Function to create a feature for the riders average position the previous year
    
    avgpos_pyear = df[(df.Name == name) & (df.Year == compyear-1)].Rank.mean()
    
    return pd.Series({'AP last season' : avgpos_pyear})



## Fantasy model

To avoid data leakage the last race will be hold out from the training data instead of holding out random samples from the training set. 


In [69]:
def fantasy_model(dfinp):
    
    # Set dev_model to 1 to test the model before training on entire dataset. 
    # dev_model = 0 means that the model will take all data into account before predicting the
    # upcoming race
    dev_model = 0
    
    #preprocessing of dataframe
    
    #train and test set to evaluate performance of model before training model on all data and using it to predict
    #future competitions
    
    if dev_model == 1:
    
        df_test = dfinp[(dfinp['Venue'] == 'Snowshoe') & (dfinp['Date'] == '2019-09-06')].copy()
        df_train = dfinp[(dfinp['Venue'] != 'Snowshoe') & (dfinp['Date'] != '2019-09-06') & (dfinp.Rank < 81)].copy()
    
    
    #create df to train model on all data before using it to predict upcoming competitions, excluding upcoming competition
    
    if dev_model == 0:
    
        df_train = dfinp[(dfinp['Venue'] != ven_pred) & (dfinp['Date'] != date_pred) & (dfinp.Rank < 81)].copy()
        #df_pred is the dataframe containing the venue to be predicted
        df_test = dfinp[(dfinp['Venue'] == ven_pred) & (dfinp['Date'] == pd.to_datetime(date_pred, format='%Y%m%d'))].copy()


    
    y_train = df_train.Points
    df_train.drop('Points', axis=1, inplace=True)
    y_test = df_test.Points
    df_test.drop('Points', axis=1, inplace=True)
    
    
    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')
    
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    numerical_cols = ['Age', 'Year', 'MA3 Pos', 'Last RP', 'Best pos', 'AP this year', 'AP last season']
    categorical_cols = ['Venue', 'Name']
    

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
     ])
    
    
    # Keep selected columns only
    my_cols = categorical_cols + numerical_cols
    X_train = df_train[my_cols].copy()
    #X_test = df_pred[my_cols].copy()
    X_test = df_test[my_cols].copy()
    
    eval_set = [(df_test, y_test)]
    
    model = XGBRegressor(n_estimators=1800, learning_rate=0.001, n_jobs=4, max_depth=5, 
                         objective="reg:squarederror", eval_set = eval_set
                        ,early_stopping_rounds=100, eval_metric='mae')
    

    
    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])
    

    #scores = cross_val_score(my_pipeline, X_train, y,
    #                          cv=10,
    #                          scoring='neg_mean_absolute_error')
    #print('Neg MAE:', scores)
    #print("NEG MAE mean:", scores.mean())
    #print('Standard deviation:', scores.std())
    
    # Preprocessing of training data, fit model 
    my_pipeline.fit(X_train, y_train)
    
    # Predict points distribution for the race event
    test_pred = my_pipeline.predict(X_test)
    
    # Evaluate predictions
    mae = mean_absolute_error(y_test, test_pred)
    
    print('MAE:', mae)
    
    return pd.Series(test_pred, df_test.Name)


In [70]:
def race_prediction_men():
    #enter names of result folders
    path1 = 'Results_UCI_M'
    rdf1 = create_results_from_UCI_df(path1)
    
    scraped_df = create_results_from_scraped_web_data()

    result_df_M = scraped_df.append(rdf1, sort=True)
        

    comp_df_m = create_prediction_df()[0]
    
    result_df_M = result_df_M.append(comp_df_m, sort=True)

    
    #create dataframe with features
    mdf = create_features(result_df_M)

    #create feature with average number points per race for the previous season
    mdf[['AP last season']] = mdf.apply(lambda row: findpreviouswin(row.Date, row.Year, row.Name, mdf), axis=1)

    mdf.reset_index(drop=True,inplace=True)

    #Create correlation plot over the dataframe with numerical features
    #f, ax = plt.subplots(figsize=(10, 8))
    #corr = mdf.corr()
    #sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
    #        square=True, ax=ax)
    
    return fantasy_model(mdf)


In [71]:
race_prediction_men()

MAE: 17.708703247706094


Name
Danny HART                     87.091087
Amaury PIERRON                113.033897
Charlie HARRISON               39.832825
Loic BRUNI                    109.481789
Greg MINNAAR                   67.027321
Troy BROSNAN                   93.122063
Greg WILLIAMSON                29.191748
Thomas ESTAQUE                 29.345692
Dakotah NORTON                 24.994755
Dean LUCAS                     41.754486
Loris VERGIER                  78.745804
Laurie GREENLAND               91.374176
Aaron GWIN                     62.711662
Luca SHAW                      38.432251
Connor FEARON                  43.772614
Bruce KLEIN                    28.693590
Baptiste PIERRON               37.665695
Rémi THIRION                   41.396042
Martin MAES                    85.639343
Jure ŽABJEK                    33.243626
Mark WALLACE                   40.659214
David TRUMMER                  46.926147
Kirk MCDOWALL                  17.564997
Loris REVELLI                  15.374117
Michael HAN

In [72]:
def race_prediction_women():
    #enter names of result folders
    path1 = 'Results_UCI_W'
    rdf1 = create_results_from_UCI_df(path1)
    #path2 = 'Results_Web_M'
    #rdf2 = create_results_from_web_df(path2)
    #result_df_M = rdf2.append(rdf1, sort=True)

    
    comp_df_w = create_prediction_df()[1]

    result_df_W = rdf1.append(comp_df_w, sort=True)



    #create dataframe with features
    wdf = create_features(result_df_W)

    #create feature with average number points per race for the previous season
    wdf[['AP last season']] = wdf.apply(lambda row: findpreviouswin(row.Date, row.Year, row.Name, wdf), axis=1)

    
    return fantasy_model(wdf)
#race_prediction_women()

### Rider selection
All riders have a set price (this price does not update during the season). The function rider_selection reads in a dataframe containing the prices and the predictions for the races. It uses these dataframes to calculate how much points per dollar each rider is expected to return. This function will be used when finding the best possible rider combinations.

In [73]:
def rider_selection(gender):
    rider_price_df = pd.read_csv('riderprices2020.csv', sep=';')
    
    
    if gender == 'M':
        predictions = race_prediction_men()
    else:
        predictions = race_prediction_women()


    

    pri_df = pd.DataFrame(predictions).reset_index()
    pri_df = pri_df.rename(columns={0: "Points"})

    pri_df = pri_df.merge(rider_price_df, left_on='Name', right_on='Name', how='left')
    pri_df['Pts./k$'] = pri_df.Points / pri_df.Price*1000
    #print(pri_df.head(50))
    
    if gender == 'M':
        pri_df = pri_df.groupby('Price').nth(list(range(4))).reset_index()
    else:
        pri_df = pri_df.groupby('Price').nth(list(range(2))).reset_index()
    
    
    return pri_df[(pri_df['Pts./k$'] > 0.1)] 


### Find the best possible combination of riders
The goal is to have the best possible fantasy downhill team, consisting of a maximum of 6 riders. Maximum 2 women riders and maximum 4 male riders.

This function, find_rider_combinations, iterates through all the combinations of 4 Male riders and 2 Female riders and fins the combination that has the highest expected total number of points within the team budget of 1 500 000 USD.

In [74]:
def find_rider_combinations():
    import itertools    
    from itertools import combinations
    
    #dataframe with riders in the men category
    ridercm_df = rider_selection('M')
    
    #dataframe with riders in the women category
    ridercw_df = rider_selection('W')
    #Create all possible combinations of riders
    #mask for non participating riders
    #nonpart = (riderc_df.Name != "Martin MAES")
    #rcomb_df = pd.DataFrame.from_records(list(itertools.combinations(riderc_df[nonpart].Name, 4)), columns=['R1', 'R2', 'R3', 'R4'])
    
    
    #Create two separate dataframes for men and women. 
    #Create combinations of 4 male riders and combinations of 2 female riders.
    
    rcombm_df = pd.DataFrame.from_records(list(itertools.combinations(ridercm_df.Name, 4)), columns=['MR1', 'MR2', 'MR3', 'MR4'])

    #Dictionaries of rider & price/points
    rpdict = dict(zip(ridercm_df.Name,ridercm_df.Price))
    rptdict = dict(zip(ridercm_df.Name,ridercm_df.Points))

    rcombm_df['CPriceM'] = rcombm_df.MR1.map(rpdict) + rcombm_df.MR2.map(rpdict) + rcombm_df.MR3.map(rpdict) + rcombm_df.MR4.map(rpdict)
    rcombm_df['CPointsM'] = rcombm_df.MR1.map(rptdict) + rcombm_df.MR2.map(rptdict) + rcombm_df.MR3.map(rptdict) + rcombm_df.MR4.map(rptdict)
    rcombm_df['Comb. Pts/k$'] = rcombm_df.CPointsM / rcombm_df.CPriceM * 1000

    rcombm_df.sort_values(by=['CPointsM'], ascending = False, inplace=True)
    rcombm_df['RidersM'] = list(zip(rcombm_df.MR1, rcombm_df.MR2, rcombm_df.MR3, rcombm_df.MR4))

    #rcombm_df.drop(rcombm_df[rcombm_df.CPriceM > 50000].index, inplace=True)

    rcombm_df = rcombm_df[(rcombm_df.CPriceM < 1500000) & (rcombm_df.CPriceM > 340000)].copy()
    
    #Creation of dataframes for female riders
    rcombw_df = pd.DataFrame.from_records(list(itertools.combinations(ridercw_df.Name, 2)), columns=['WR1', 'WR2'])
    
    rpwdict = dict(zip(ridercw_df.Name,ridercw_df.Price))
    rptwdict = dict(zip(ridercw_df.Name,ridercw_df.Points))
    
    rcombw_df['CPriceW'] = rcombw_df.WR1.map(rpwdict) + rcombw_df.WR2.map(rpwdict) 
    rcombw_df['CPointsW'] = rcombw_df.WR1.map(rptwdict) + rcombw_df.WR2.map(rptwdict) 
    rcombw_df['Comb. Pts/k$ W'] = rcombw_df.CPointsW / rcombw_df.CPriceW * 1000

    rcombw_df.sort_values(by=['CPointsW'], ascending = False, inplace=True)
    rcombw_df['RidersW'] = list(zip(rcombw_df.WR1, rcombw_df.WR2))

    #find best combinations of both men and women
    combinate_df = pd.DataFrame.from_records(list(itertools.product(rcombw_df.RidersW, rcombm_df.RidersM)), columns=['RidersW', 'RidersM'])
    
    
    combinate_df = combinate_df.merge(rcombw_df, left_on='RidersW', right_on='RidersW')
    combinate_df = combinate_df.merge(rcombm_df, left_on='RidersM', right_on='RidersM')


    
    
    combinate_df['Combined Price'] = combinate_df.CPriceW + combinate_df.CPriceM
    combinate_df['Combined Points'] = combinate_df.CPointsW + combinate_df.CPointsM

    combinate_df.drop(['MR1', 'MR2', 'MR3', 'MR4', 'WR1', 'WR2', 'CPriceW', 'CPriceM', 'CPointsW', 'CPointsM'], axis=1, inplace=True)


    combinate_df.sort_values(by=['Combined Points'], ascending = False, inplace=True)
    
    return combinate_df[(combinate_df['Combined Price'] < 1500001)]



In [76]:
# Display the entire column width
pd.set_option('display.max_colwidth', -1)

# Find the optimal rider combinations
find_rider_combinations()

MAE: 17.708703247706094
MAE: 22.31589916774205
