In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

df = pd.read_csv('athlete_events.csv')

In [3]:
list(df)

['ID',
 'Name',
 'Sex',
 'Age',
 'Height',
 'Weight',
 'Team',
 'NOC',
 'Games',
 'Year',
 'Season',
 'City',
 'Sport',
 'Event',
 'Medal']

In [5]:
df_filtered = df[['Name','Sport','Height','Weight','Sex']]
df_grouped = df_filtered.dropna().groupby('Name').first().reset_index()
df_grouped.shape

(99041, 5)

In [7]:
len(df_grouped.Sport.unique())

55

In [15]:
# Do a one'hot encode of the Sport feature
df_one_hot_encode = pd.get_dummies(df_grouped.Sport,prefix='sport', drop_first=True)

In [16]:
# Add the other features
df_one_hot_encode['Height'] = df_grouped.Height
df_one_hot_encode['Weight'] = df_grouped.Weight
df_one_hot_encode['Sex'] = df_grouped.Sex

In [18]:
df_one_hot_encode.loc[df_one_hot_encode.Sex.isin(['M']),'sex']= 0
df_one_hot_encode.loc[df_one_hot_encode.Sex.isin(['F']),'sex']= 1
df_one_hot_encode = df_one_hot_encode.drop(columns='Sex')

In [112]:
print("Shape: " + str(df_one_hot_encode.shape[0]))
print(df_one_hot_encode.sex.value_counts())

Shape: 99041
0.0    69389
1.0    29652
Name: sex, dtype: int64


In [252]:
# further processing 
train_data = df_one_hot_encode[:NUM_TRAIN_DATA]
#train_data_female = df_one_hot_encode[:NUM_TRAIN_DATA][df_one_hot_encode.sex==0]
test_data = df_one_hot_encode[NUM_TRAIN_DATA:]

# Preparing Dataset for consumption.

model_input = train_data.iloc[:, -3:-1].values
labels = np.asarray([[i[0], 1 - i[0]] for i in train_data.iloc[:, -1:].values])
print(model_input.shape, labels.shape)

validation_features = test_data.iloc[:, -3:-1].values
validation_labels = [[i[0], 1 - i[0]] for i in test_data.iloc[:, -1:].values]


def normalize(v):
    m = np.mean(v)
    std = np.std(v)
    return (v-m)/std

def np_power(v, power):
    for _ in range(power):
        v*=v
    return v

def feature_generator(model_input):
    return np.transpose([np.transpose(normalize(model_input[:,0])),
                         np.transpose(normalize(model_input[:,1])),
                         np.transpose(normalize(model_input[:,0]/model_input[:,1])),
                         np.transpose(normalize(model_input[:,0]*model_input[:,1])),
                         np.transpose(normalize(np_power(model_input[:,0],2))),
                         np.transpose(normalize(np_power(model_input[:,1],2))),
                         np.transpose(normalize(np_power(model_input[:,0],3))),
                         np.transpose(normalize(np_power(model_input[:,1],3))),
                         ])

model_input_normalized = feature_generator(model_input)
validation_features_normalized = feature_generator(validation_features)

print(model_input_normalized.shape)

((96000, 2), (96000, 2))
(96000, 8)


In [253]:
# Begin Machine Learning
NUM_TRAIN_DATA = 96000
NUM_INPUTS = model_input_normalized.shape[1]
NUM_OUTPUTS = 2
LEARNING_RATE = 0.01

#initialize weights
weights = tf.Variable(tf.random_normal([NUM_INPUTS, NUM_OUTPUTS]))
biases = tf.Variable(tf.random_normal([NUM_OUTPUTS]))

net_input = tf.placeholder(tf.float32, [None, NUM_INPUTS])

def regression(x, weights, biases):
    affine = tf.matmul(x, weights) + biases
    return tf.nn.softmax(affine)

y_hat = regression(net_input, weights, biases)
y_true = tf.placeholder(tf.float32, [None, NUM_OUTPUTS])

In [254]:
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_hat)
cost = tf.reduce_mean(cross_entropy)

optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy)


correctly_predicted = tf.equal(tf.argmax(y_hat, 1), tf.argmax(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correctly_predicted, "float"))

sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [255]:
# Equivalent to value counts for binary features
f, m = np.sum(labels, axis = 0)
print("{}% male class imbalance".format(m/(f+m)))

0.70240625% male class imbalance


In [256]:
batch_size = 100
n_epochs = 5

for epoch_i in range(n_epochs):
    for batch_i in range(NUM_TRAIN_DATA // batch_size):
        index = batch_i*batch_size
        batch_xs = model_input_normalized[index:index+batch_size]
        batch_ys = labels[index:index+batch_size]
        sess.run(optimizer, feed_dict={
            net_input: batch_xs,
            y_true: batch_ys
        })
    print('Validation Accuracy')
    print(sess.run(accuracy,
                   feed_dict={
                       net_input: validation_features_normalized,
                       y_true: validation_labels
                   }))

Validation Accuracy
0.76192
Validation Accuracy
0.760934
Validation Accuracy
0.760605
Validation Accuracy
0.760605
Validation Accuracy
0.760934
