In [1]:
# Running RandomForest regressions to discover a lifters best lift
# across the 3 lifts.
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.datasets import load_digits

In [2]:
# Sampling 20,000 data entries
n = 767672 
s = 20000
skip = sorted(random.sample(range(1,n+1),n-s))
data = pd.read_csv('zeroopenpowerlifting.csv', skiprows=skip)
data

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,TotalKg,Place,Wilks,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date
0,Elizabeth Ciortuz,F,SBD,Raw,19.0,18-19,F-OR,72.70,75,95.0,...,300.0,7,290.95,302.58,256.21,486.30,,,GPC-AUS,2018-10-27
1,Jasmine Hoare,F,SBD,Raw,31.0,24-34,F-OR,77.70,82.5,130.0,...,380.0,4,353.56,353.56,310.54,599.76,,,GPC-AUS,2018-10-27
2,Kerryn Siems,F,SBD,Wraps,30.0,24-34,F-OR,67.20,67.5,140.0,...,390.0,1,399.32,399.32,352.28,656.52,,,GPC-AUS,2018-10-27
3,Andrew Yuile,M,SBD,Wraps,36.0,35-39,M-OR,79.50,82.5,112.5,...,345.0,9,236.47,236.47,228.46,317.40,,,GPC-AUS,2018-10-27
4,Emmanual Ioane,M,SBD,Wraps,36.0,35-39,M-OR,133.70,140,0.0,...,,DQ,,,,,,,GPC-AUS,2018-10-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,Dakota Baker,M,BD,Raw,23.0,20-23,nov,137.89,140,0.0,...,420.0,1,235.24,235.24,223.84,,Yes,USA,NASA,2019-07-20
19996,William Tabron,M,BD,Raw,35.5,35-39,sm2,109.77,110,0.0,...,450.0,1,264.99,264.99,253.25,,Yes,USA,NASA,2019-06-15
19997,Michael Hanlein,M,BD,Raw,26.5,24-34,int,68.86,75,0.0,...,350.0,1,265.63,265.63,257.62,,Yes,USA,NASA,2020-02-22
19998,Robert McKinzie,M,D,Raw,58.5,55-59,dlpure,119.25,125,0.0,...,227.5,1,130.99,169.10,125.51,445.80,Yes,USA,NASA,2020-02-22


In [3]:
# Dropping columns not needed for this test
data = data.drop(columns=['Name', 'AgeClass', 'Division',
                          'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',
                          'Place', 'Tested', 'Country', 'Federation',
                          'Date', 'IPFPoints', 'TotalKg', 'Squat1Kg',
                          'Squat2Kg', 'Squat3Kg','Bench1Kg', 'Bench2Kg',
                          'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg',
                          'Deadlift3Kg', 'McCulloch', 'Glossbrenner',
                          'WeightClassKg'])

data.columns
data.isnull().sum()

Sex                   0
Event                 0
Equipment             0
Age                   0
BodyweightKg          0
Best3SquatKg          0
Best3BenchKg          0
Best3DeadliftKg       0
Wilks              1165
dtype: int64

In [4]:
# General dropna just  to clear those without a WILKS
data = data.dropna()

In [5]:
# Dropping athletes who did not compete in complete SBD competitions
indexNames = data[data['Event'] == 'S'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'B'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'D'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SD'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SB'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'BD'].index
data.drop(indexNames, inplace=True)

In [6]:
data

Unnamed: 0,Sex,Event,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,F,SBD,Raw,19.0,72.70,112.5,60.0,127.5,290.95
1,F,SBD,Raw,31.0,77.70,145.0,80.0,155.0,353.56
2,F,SBD,Wraps,30.0,67.20,160.0,72.5,157.5,399.32
3,M,SBD,Wraps,36.0,79.50,125.0,77.5,142.5,236.47
5,M,SBD,Wraps,27.0,99.60,215.0,155.0,242.5,373.36
...,...,...,...,...,...,...,...,...,...
19983,M,SBD,Raw,37.0,66.68,200.0,155.0,237.5,461.35
19985,M,SBD,Raw,23.0,89.58,212.5,147.5,232.5,379.15
19986,M,SBD,Wraps,28.0,82.37,200.0,142.5,200.0,363.77
19989,M,SBD,Raw,53.0,81.83,180.0,115.0,200.0,333.24


In [7]:
# Remove the last couple of columns
data = data.drop(columns = ['Event'])

In [8]:
# Convert string data to numeric
sex = {'M': 1, 'F': 0, 'Mx': 3}
equipment = {'Raw': 0, 'Wraps': 1, 'Multi-ply': 2, 'Single-ply': 3, 'Straps': 4}

data.Sex = [sex[item] for item in data.Sex]
data.Equipment = [equipment[item] for item in data.Equipment]

data

Unnamed: 0,Sex,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,0,0,19.0,72.70,112.5,60.0,127.5,290.95
1,0,0,31.0,77.70,145.0,80.0,155.0,353.56
2,0,1,30.0,67.20,160.0,72.5,157.5,399.32
3,1,1,36.0,79.50,125.0,77.5,142.5,236.47
5,1,1,27.0,99.60,215.0,155.0,242.5,373.36
...,...,...,...,...,...,...,...,...
19983,1,0,37.0,66.68,200.0,155.0,237.5,461.35
19985,1,0,23.0,89.58,212.5,147.5,232.5,379.15
19986,1,1,28.0,82.37,200.0,142.5,200.0,363.77
19989,1,0,53.0,81.83,180.0,115.0,200.0,333.24


In [9]:
# Normalise the data, scaling to values between 0-1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']] = scaler.fit_transform(data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']])

In [10]:
# So, running 3 different test and training sets.
# First for squats, then bench, finally deadlifts,
# See the model accuracy for predicting someone's lifts.
# Further experiments to see which variables are critical
# to accurate prediction. How accurate can we get with age,
# sex, weight, and equipment?

test_squat = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg']]

target_squat = data[['Best3SquatKg']]

test_bench = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3SquatKg', 'Best3DeadliftKg']]

target_bench = data[['Best3BenchKg']]

test_dead = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3SquatKg']]

target_dead = data[['Best3DeadliftKg']]

In [11]:
# Preparing sets
S = test_squat
s = target_squat
B = test_bench
b = target_bench
D = test_dead
d = target_dead

In [12]:
# Split sets into training and test sets
from sklearn.model_selection import train_test_split
S_train, S_test, s_train, s_test = train_test_split(S, s, test_size=0.1, random_state=3)
B_train, B_test, b_train, b_test = train_test_split(B, b, test_size=0.1, random_state=3)
D_train, D_test, d_train, d_test = train_test_split(D, d, test_size=0.1, random_state=3)

In [13]:
from sklearn.svm import SVR
n_samples, n_features = 10, 5

In [20]:
rng = np.random.RandomState(0)
S = rng.randn(n_samples, n_features)
s = rng.randn(n_samples)
B = rng.randn(n_samples, n_features)
b = rng.randn(n_samples)
D = rng.randn(n_samples, n_features)
d = rng.randn(n_samples)

In [17]:
clf = SVR(C=1.0, epsilon=0.2)

In [18]:
clf.fit(S_train, s_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [19]:
clf.score(S_test, s_test)

0.7899203162907724

In [21]:
clf.fit(B_train, b_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
clf.score(B_test, b_test)

0.72760340357776

In [24]:
clf.fit(D_train, d_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [25]:
clf.score(D_test, d_test)

0.709750453841411