In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf

In [2]:
# Load the raw athlete-events table; the CSV path is relative to the working directory.
df1=pd.read_csv('athlete_events.csv', sep=',', index_col=None, engine='python')

In [3]:
# Cardinality of the categorical columns and medal counts, before any pre-processing
for label, column in (("NOC:", "NOC"), ("CITY:", "City"), ("Sport:", "Sport"), ("Event:", "Event")):
    print(label, len(df1[column].unique()))

# A missing 'Medal' value means the athlete did not win a medal in that event
print("Did Not Win:", sum(pd.isnull(df1['Medal'])))
for label, medal in (("Gold Medalists:", "Gold"), ("Silver Medalists", "Silver"), ("Bronze Medalists", "Bronze")):
    print(label, len(df1.loc[df1['Medal'] == medal]))

NOC: 230
CITY: 42
Sport: 66
Event: 765
Did Not Win: 231333
Gold Medalists: 13372
Silver Medalists 13116
Bronze Medalists 13295


In [4]:
# Start with the rows where Age, Height and Weight are all present.
# These complete rows are later split into train and test sets to measure
# how well each model can predict a held-out value.
df2 = df1.dropna(subset=['Age','Height','Weight'])
df2.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
5,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",
6,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,NED,1992 Winter,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,


In [5]:
def pre_processing(df, flag = False):
    """
    Dataset-specific preprocessing for the athlete-events table.

    Steps performed (the input frame is not modified; a new frame is returned):
      * drop 'Name', 'Team' and 'Games' -- names carry no signal, Team is
        (almost) a duplicate of NOC, and Games only repeats Year + Season;
      * replace a missing 'Medal' with the label 'DNW' (did not win);
      * one-hot encode 'Year', 'Sex', 'Season' and 'Medal';
      * when ``flag == True`` also one-hot encode the high-cardinality columns
        'NOC', 'City', 'Sport' and 'Event'; otherwise drop those four columns.

    'ID' is kept as a plain numeric column: one-hot encoding it would make the
    number of features explode.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above.
    flag : bool, default False
        Whether to keep (one-hot encoded) the high-cardinality columns.

    Returns
    -------
    pandas.DataFrame
        Untouched columns first, followed by the dummy columns in the order
        Year, Sex, Season, Medal (then NOC, City, Sport, Event if requested).
    """
    redundant_cols = ['Name', 'Team', 'Games']
    high_cardinality_cols = ['NOC', 'City', 'Sport', 'Event']
    encoded_cols = ['Year', 'Sex', 'Season', 'Medal']

    # DNW = Did not win; only the 'Medal' column is filled
    prepared = df.drop(columns=redundant_cols).fillna(value = {'Medal': 'DNW'})

    if flag == True:
        # Appending keeps the dummy blocks in the same order as encoding
        # the two groups one after the other
        encoded_cols = encoded_cols + high_cardinality_cols
    else:
        prepared = prepared.drop(columns=high_cardinality_cols)

    return pd.get_dummies(prepared, columns=encoded_cols)

In [6]:
# Compact feature set: with flag=False the NOC, City, Sport and Event columns are dropped.
df = pre_processing(df2, False)

In [7]:
# Preview the first rows of the compact feature frame.
df.head()

Unnamed: 0,ID,Age,Height,Weight,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,...,Year_2014,Year_2016,Sex_F,Sex_M,Season_Summer,Season_Winter,Medal_Bronze,Medal_DNW,Medal_Gold,Medal_Silver
0,1,24.0,180.0,80.0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0
1,2,23.0,170.0,60.0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0
4,5,21.0,185.0,82.0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
5,5,21.0,185.0,82.0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
6,5,25.0,185.0,82.0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0


In [8]:
# Wide feature set: with flag=True the NOC, City, Sport and Event columns are one-hot encoded as well.
df_large = pre_processing(df2, True)

In [9]:
# Display the wide feature frame (the notebook truncates the rendered rows and columns).
df_large

Unnamed: 0,ID,Age,Height,Weight,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,...,"Event_Wrestling Men's Super-Heavyweight, Greco-Roman","Event_Wrestling Men's Unlimited Class, Greco-Roman","Event_Wrestling Men's Welterweight, Freestyle","Event_Wrestling Men's Welterweight, Greco-Roman","Event_Wrestling Women's Featherweight, Freestyle","Event_Wrestling Women's Flyweight, Freestyle","Event_Wrestling Women's Heavyweight, Freestyle","Event_Wrestling Women's Light-Heavyweight, Freestyle","Event_Wrestling Women's Lightweight, Freestyle","Event_Wrestling Women's Middleweight, Freestyle"
0,1,24.0,180.0,80.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,23.0,170.0,60.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,21.0,185.0,82.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,21.0,185.0,82.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5,25.0,185.0,82.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271111,135569,29.0,179.0,89.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271112,135570,27.0,176.0,59.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271113,135570,27.0,176.0,59.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271114,135571,30.0,185.0,96.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# print("NOC:", len(df.NOC.unique()))
# print("CITY:", len(df.City.unique()))
# print("Sport:", len(df.Sport.unique()))
# print("Event:", len(df.Event.unique()))

In [11]:
# (rows, columns) of the compact feature frame.
df.shape

(206165, 47)

In [12]:
# (rows, columns) of the wide feature frame; same rows, many more one-hot columns.
df_large.shape

(206165, 961)

In [13]:
# Basic Completion Agent
class BCA():
    """
    Basic Completion Agent: a baseline that always predicts the training mean.

    ``fit`` records the name and the mean of the first column of ``Y_train``;
    ``pred`` returns that mean for every row of ``X_test``. The features are
    never looked at, so the result is independent of the other columns.
    """

    def __init__(self):
        """Create an unfitted model; ``fit`` must be called before ``pred``."""

    def fit(self, X_train, Y_train):
        """
        Remember the target column name and its mean.

        X_train is accepted for API symmetry with other models and is ignored.
        Y_train must be a DataFrame; only its first column is used.
        """
        self.y_col_name = Y_train.columns[0]
        self.y_mean = Y_train[self.y_col_name].mean()

    def pred(self, X_test):
        """
        Return a one-column DataFrame holding the training mean, one row per
        row of X_test, indexed 0..len(X_test)-1 and named after the target.
        """
        X_test_m = np.shape(X_test)[0]
        # The column labels must be an ordered list: a set is unordered and is
        # rejected by recent pandas versions ("columns cannot be a set").
        Y_pred = pd.DataFrame(self.y_mean, index = np.arange(X_test_m), columns = [self.y_col_name])
        return Y_pred

## The following outputs are errors: in each result, the first dict is the RMSE for each predicted column and the second dict is the MAE.

In [14]:
# Testing Basic Completion Agent

In [15]:
def BCA_Error(df, y_vals):
    """
    Evaluate the mean-prediction baseline (BCA) on each target column.

    For every column name in ``y_vals`` the frame is split 80/20
    (``random_state = 0``) into train and test rows, a BCA model is fitted on
    the training targets, and its test-set errors are recorded.

    Returns
    -------
    (dict, dict)
        ``rmse`` and ``mae`` dictionaries keyed by target column name.
    """
    rmse_by_target, mae_by_target = {}, {}

    for target in y_vals:
        # Every column except the target is a feature; the target stays a
        # one-column DataFrame because BCA.fit reads its column name
        features = df.drop(target, axis = 1)
        labels = df[[target]]
        X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

        baseline = BCA()
        baseline.fit(X_train, Y_train)
        Y_pred = baseline.pred(X_test)

        # RMSE is the square root of the mean squared error
        rmse_by_target[target] = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))
        mae_by_target[target] = metrics.mean_absolute_error(Y_test, Y_pred)

    return rmse_by_target, mae_by_target

In [16]:
# Baseline errors on the compact frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(BCA_Error(df, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

({'Age': 5.509763131202196, 'Height': 10.586037663370348, 'Weight': 14.365553232112697}, {'Age': 4.138206974083723, 'Height': 8.465884121705892, 'Weight': 11.094197490399097})
Time Taken: 0


In [17]:
# Baseline errors on the wide frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(BCA_Error(df_large, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

({'Age': 5.509763131202196, 'Height': 10.586037663370348, 'Weight': 14.365553232112697}, {'Age': 4.138206974083723, 'Height': 8.465884121705892, 'Weight': 11.094197490399097})
Time Taken: 3


- The two results should be identical, because the baseline only takes the mean/mode of the target column and does not depend on any other column.

In [18]:
# Testing using Libraries

In [19]:
def LinRegError(df, y_vals):
    """
    Evaluate ordinary least-squares regression on each target column.

    For every column name in ``y_vals`` all remaining columns of ``df`` are
    used as features, the rows are split 80/20 (``random_state = 0``), a
    ``LinearRegression`` is fitted on the training part and scored on the
    test part.

    Returns
    -------
    (dict, dict)
        ``rmse`` and ``mae`` dictionaries keyed by target column name.
    """
    rmse_by_target, mae_by_target = {}, {}

    for target in y_vals:
        features = df.drop(target, axis = 1)
        labels = df[[target]]
        X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

        # Fit on the training rows, then predict the held-out rows
        regressor = LinearRegression()
        regressor.fit(X_train, Y_train)
        Y_pred = regressor.predict(X_test)

        # RMSE is the square root of the mean squared error
        rmse_by_target[target] = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))
        mae_by_target[target] = metrics.mean_absolute_error(Y_test, Y_pred)

    return rmse_by_target, mae_by_target

In [20]:
# Linear-regression errors on the compact frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(LinRegError(df, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

({'Age': 5.3017053182260545, 'Height': 6.230227902336726, 'Weight': 8.345069561912402}, {'Age': 3.99557081798677, 'Height': 4.78982054741217, 'Weight': 5.798668803334796})
Time Taken: 1


In [21]:
# Linear-regression errors on the wide frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(LinRegError(df_large, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

({'Age': 3954.068899119992, 'Height': 159.80280508156613, 'Weight': 2147.7725583315796}, {'Age': 25.982488347853145, 'Height': 4.753625664539187, 'Weight': 18.391961272467036})
Time Taken: 48


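- Gets much worse with the wide frame: RMSE explodes for all three targets, and MAE worsens for Age and Weight (Height MAE stays roughly the same).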

In [25]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package (a scikit-learn dependency) and fall back
# to the old vendored location only on installs that still provide it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
#help(joblib.parallel)



In [26]:
def RandomForestRegressorError(df, y_vals, n_estimators = 50, max_depth = 80):
    """
    Evaluate a random-forest regressor on each target column.

    For every column name in ``y_vals`` all remaining columns of ``df`` are
    used as features, the rows are split 80/20 (``random_state = 0``), a
    ``RandomForestRegressor`` is fitted on the training part and scored on
    the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric feature frame that contains every column in ``y_vals``.
    y_vals : iterable of str
        Names of the columns to predict, one model per column.
    n_estimators : int, default 50
        Number of trees in each forest.
    max_depth : int, default 80
        Maximum depth of each tree.

    Returns
    -------
    (dict, dict)
        ``rmse`` and ``mae`` dictionaries keyed by target column name.
    """
    rmse_dict = {}
    mae_dict = {}
    for y in y_vals:
        # Features are every column except the target; the target is
        # flattened to 1-D, the shape the forest expects for a single output
        X = np.array(df.drop(y, axis = 1))
        Y = np.array(df[[y]]).flatten()

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

        # n_jobs=-1 uses all available cores; verbose=1 prints joblib progress lines
        RFR_model = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                                          random_state = 0, verbose = 1, n_jobs=-1)

        # Train the model on training data
        RFR_model.fit(X_train, Y_train)

        # Use the forest's predict method on the test data
        Y_pred = RFR_model.predict(X_test)

        # RMSE is the square root of the mean squared error
        rmse_dict[y] = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))
        mae_dict[y] = metrics.mean_absolute_error(Y_test, Y_pred)

    return rmse_dict, mae_dict

In [27]:
# Random-forest errors on the compact frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(RandomForestRegressorError(df, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.8s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   16.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9

({'Age': 4.291203032558678, 'Height': 4.880403153973122, 'Weight': 7.657090485794006}, {'Age': 2.8670312613683215, 'Height': 3.2436650255863015, 'Weight': 4.648500230397983})
Time Taken: 53


In [28]:
# Random-forest errors on the wide frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(RandomForestRegressorError(df_large, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.7min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.6min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  3.3

({'Age': 3.68033175948988, 'Height': 3.5723596812696488, 'Weight': 4.442610727983362}, {'Age': 2.58805591579757, 'Height': 2.413007182804602, 'Weight': 2.792027933863245})
Time Taken: 1043


In [29]:
def NN_Error(df, y_vals):
    """
    Evaluate a fully connected neural network on each target column.

    For every column name in ``y_vals`` all remaining columns of ``df`` are
    used as features and the rows are split 80/20 (``random_state = 0``).
    The features are standardised with statistics learned on the training
    rows only, a 400-200-100-40-1 ReLU network is trained for 20 epochs with
    Adam on mean squared error, and the test-set errors are recorded.

    Keras holds out 20% of the training rows for validation
    (``validation_split=0.2``), so the effective split of the full data is
    64% train / 16% validation / 20% test.

    Returns
    -------
    (dict, dict)
        ``rmse`` and ``mae`` dictionaries keyed by target column name.
    """
    rmse_by_target, mae_by_target = {}, {}

    for target in y_vals:
        features = np.array(df.drop(target, axis = 1))
        labels = np.array(df[[target]]).flatten()

        X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

        # Fit the scaler on the training rows only, then reuse it for the test rows
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Four hidden ReLU layers and a single linear output unit (no dropout layers are used)
        dense = tf.keras.layers.Dense
        NN_model = tf.keras.models.Sequential([
            dense(400, input_dim=(X_train.shape[1]), activation = "relu"),
            dense(200, activation = "relu"),
            dense(100, activation = "relu"),
            dense(40, activation = "relu"),
            dense(1),
        ])

        NN_model.compile(loss = "mean_squared_error" , optimizer = "adam", metrics = ["mean_squared_error"])
        NN_model.fit(X_train, Y_train, epochs=20, verbose=1, validation_split=0.2)

        # Predict the held-out rows
        Y_pred = NN_model.predict(X_test)

        # RMSE is the square root of the mean squared error
        rmse_by_target[target] = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))
        mae_by_target[target] = metrics.mean_absolute_error(Y_test, Y_pred)

    return rmse_by_target, mae_by_target

In [30]:
# Neural-network errors on the compact frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(NN_Error(df, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
({'Age': 5.194774426270923, 'Height': 6.279282555077576, 'Weight': 8.468955943320703}, {'Age': 3.8782030146806536, 'Height': 4.915925786395349, 'Weight': 6.106072553536993})
Time Taken: 1081


In [31]:
# Neural-network errors on the wide frame, with wall-clock time.
y_vals = ['Age', 'Height', 'Weight']
t1 = time.time()
print(NN_Error(df_large, y_vals))
t2 = time.time()
# int() truncates the elapsed time to whole seconds
print("Time Taken:", int(t2 - t1))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
({'Age': 4.519100323180397, 'Height': 5.259561058890453, 'Weight': 5.543389528690274}, {'Age': 3.3430048929821887, 'Height': 3.895569299872684, 'Weight': 3.944308217826031})
Time Taken: 1206
