In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read only the first 10,000 rows of the race results file, then preview
# the top of the resulting frame.
races = pd.read_csv('all_races.csv', nrows=10000)
races.head(n=5)

In [None]:
def clean_data(df):
    """Parse the raw time columns and derive the runner's birth year.

    The frame is modified in place and also returned.

    Steps, in order:
    * the first-placed runner of the 2015 'family-race' gets the official
      time copied into `net_time` (one known missing value);
    * `official_time` is converted to timedeltas;
    * `net_time` entries equal to '-' become missing, then the column is
      converted to timedeltas;
    * `birth_year` is created from the characters of `birth_date` starting
      at position 6, cast to int (presumably a DD/MM/YYYY string - confirm
      against the CSV).
    """
    # The single known record whose net time was not published.
    family_winner_2015 = (
        (df['place'] == 1)
        & (df['event'] == 'family-race')
        & (df['event_year'] == 2015)
    )
    df.loc[family_winner_2015, 'net_time'] = df.loc[family_winner_2015, 'official_time']

    df['official_time'] = pd.to_timedelta(df['official_time'])

    # '-' is the placeholder used for an unknown net time.
    is_placeholder = df['net_time'] == '-'
    df['net_time'] = pd.to_timedelta(np.where(is_placeholder, np.nan, df['net_time']))

    # Year portion of the birth date string.
    df['birth_year'] = df['birth_date'].str[6:].astype(int)
    return df
    
def input_missing_event_net_time(df):
    """Fill in the missing net times of one event's results.

    Missing values are filled in two passes:

    1. A runner placed in the top 10 gets the official time as net time.
    2. Any remaining gap gets the official time minus a rolling mean
       (window of 10 rows, at least 5 observed values) of the difference
       between official and net time.

    The input frame is not modified; a filled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `place`, `official_time` and `net_time`, the two time
        columns being timedeltas.
        NOTE(review): the rolling estimate uses the rows immediately above
        each gap, so it presumably expects rows ordered by finishing
        position - confirm the ordering of the CSV.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with every `net_time` populated.

    Raises
    ------
    AssertionError
        If a net time ends up larger than the official time, or if a net
        time is still missing (fewer than 5 observed delays in the window).
    """
    # Work on a copy: this function is used with groupby.apply, which does
    # not support functions that mutate the group they receive.
    df = df.copy()

    # Pass 1: top-10 finishers - net time equals the official time.
    top10_missing = df['net_time'].isnull() & (df['place'] <= 10)
    df['net_time'] = np.where(top10_missing, df['official_time'], df['net_time'])

    # total_seconds() keeps the days component of a timedelta; .dt.seconds
    # would silently drop it for durations of 24 hours or more.
    official_sec = df['official_time'].dt.total_seconds()
    delay_sec = official_sec - df['net_time'].dt.total_seconds()
    delay_mean_sec = delay_sec.rolling(window=10, min_periods=5).mean()
    estimated_net_time = pd.to_timedelta(official_sec - delay_mean_sec, unit='s')

    # Pass 2: everything still missing gets the rolling estimate.
    df['net_time'] = np.where(df['net_time'].isnull(),
                              estimated_net_time,
                              df['net_time'])

    assert not (df['official_time'] < df['net_time']).any()
    assert not df['net_time'].isnull().any()
    return df

def add_features(df):
    """Add a `pace` column (net time divided by distance) to `df`.

    The frame is modified in place and also returned.
    """
    elapsed, length = df['net_time'], df['distance']
    df['pace'] = elapsed / length
    return df

In [None]:
# Parse the raw columns, fill missing net times one (event, event_year)
# group at a time, then add the derived features.
races = clean_data(races)
# group_keys=False keeps the rows on their original index. Without it,
# pandas >= 2.0 prepends the group keys to the index of the apply result,
# producing index levels named 'event' / 'event_year' that duplicate the
# columns of the same name.
races = races.groupby(['event', 'event_year'], group_keys=False).apply(
    input_missing_event_net_time
)
races = add_features(races)

In [None]:
# Runner identity key: birth date and name joined by an underscore, used
# below to recognise the same runner across events.
races['birth_date_name'] = (
    races['birth_date']
    + '_'
    + races['name']
)


In [None]:
# Years in which both a 'maratona' and a 'meia_maratona' were held.
years = set(
    races.loc[races['event'] == 'maratona', 'event_year'].unique()
).intersection(
    races.loc[races['event'] == 'meia_maratona', 'event_year'].unique()
)
years

In [None]:
# Training set: for every year in `years`, keep the 'maratona' and
# 'meia_maratona' rows of the runners who took part in both events that year.
yearly_frames = []
for year in years:
    in_year = races.event_year == year
    marathon_runners = races[in_year & (races.event == 'maratona')].birth_date_name
    # Half-marathon rows of runners who also ran the marathon this year.
    half_events = races[in_year
                        & (races.event == 'meia_maratona')
                        & races.birth_date_name.isin(marathon_runners)]
    half_and_marathon_runners = half_events.birth_date_name.values
    yearly_frames.append(races[in_year
                               & races.event.isin(['maratona', 'meia_maratona'])
                               & races.birth_date_name.isin(half_and_marathon_runners)])

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; concatenate the yearly slices once instead.
# pd.concat raises on an empty list, so fall back to an empty frame when
# `years` is empty.
races_train = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()