In [134]:
# Running various regressions to discover a lifters best lift
# across the 3 lifts.
import pandas as pd
import numpy as np
import random

from sklearn.datasets import load_digits

In [135]:
# Sampling 20,000 data entries
n = 767672 
s = 20000
skip = sorted(random.sample(range(1,n+1),n-s))
data = pd.read_csv('zeroopenpowerlifting.csv', skiprows=skip)
data

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,TotalKg,Place,Wilks,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date
0,Renee Theyers,F,SBD,Wraps,29.0,24-34,F-OR,65.60,67.5,110.0,...,335.0,2,349.07,349.07,308.07,568.54,,,GPC-AUS,2018-10-27
1,Wesley Vick,M,SBD,Wraps,28.0,24-34,M-OR,89.90,90,245.0,...,670.0,3,427.97,427.97,412.43,641.00,,Australia,GPC-AUS,2018-10-27
2,Belinda Ryder,F,SBD,Wraps,44.0,40-44,F-OR,51.90,52,110.0,...,322.5,2,402.64,419.95,356.93,613.69,,Australia,GPC-AUS,2015-05-19
3,Cynthia Sepulveda,F,SBD,Wraps,29.0,24-34,F-OR,50.90,52,90.0,...,257.5,5,326.33,326.33,289.52,487.17,,Australia,GPC-AUS,2015-05-19
4,Danielle Moody,F,SBD,Wraps,26.0,24-34,F-OR,74.20,75,110.0,...,340.0,4,325.41,325.41,286.36,546.99,,,GPC-AUS,2015-05-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,Noelle Brigden,F,SBD,Raw,41.5,40-44,retrowmp,50.17,52,0.0,...,332.5,1,426.04,430.30,378.24,645.90,Yes,USA,NASA,2019-06-22
19996,Drea Davis,F,SBD,Wraps,23.0,20-23,wjr,68.95,75,0.0,...,350.0,2,351.87,351.87,310.28,581.33,Yes,USA,NASA,2019-06-22
19997,Barbara Cheney,F,SBD,Wraps,70.0,70-74,wmp,91.22,90+,0.0,...,,DQ,,,,,Yes,USA,NASA,2020-02-22
19998,Emmanuel Bernal,M,SBD,Raw,28.0,24-34,retroint,89.90,90,0.0,...,510.0,1,325.77,325.77,312.23,467.69,Yes,USA,NASA,2019-05-18


In [136]:
# Dropping columns not needed for this test
data = data.drop(columns=['Name', 'AgeClass', 'Division',
                          'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',
                          'Place', 'Tested', 'Country', 'Federation',
                          'Date', 'IPFPoints', 'TotalKg', 'Squat1Kg',
                          'Squat2Kg', 'Squat3Kg','Bench1Kg', 'Bench2Kg',
                          'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg',
                          'Deadlift3Kg', 'McCulloch', 'Glossbrenner'])

data.columns
data.isnull().sum()

Sex                   0
Event                 0
Equipment             0
Age                   0
BodyweightKg          0
WeightClassKg        91
Best3SquatKg          0
Best3BenchKg          0
Best3DeadliftKg       0
Wilks              1197
dtype: int64

In [137]:
# General dropna just  to clear those without a WILKS
data = data.dropna()

In [138]:
# Dropping athletes who did not compete in complete SBD competitions
indexNames = data[data['Event'] == 'S'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'B'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'D'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SD'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SB'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'BD'].index
data.drop(indexNames, inplace=True)

In [139]:
data

Unnamed: 0,Sex,Event,Equipment,Age,BodyweightKg,WeightClassKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,F,SBD,Wraps,29.0,65.60,67.5,122.5,62.5,150.0,349.07
1,M,SBD,Wraps,28.0,89.90,90,270.0,140.0,260.0,427.97
2,F,SBD,Wraps,44.0,51.90,52,120.0,60.0,142.5,402.64
3,F,SBD,Wraps,29.0,50.90,52,105.0,40.0,112.5,326.33
4,F,SBD,Wraps,26.0,74.20,75,120.0,65.0,155.0,325.41
...,...,...,...,...,...,...,...,...,...,...
19991,F,SBD,Raw,29.0,78.61,82.5,132.5,85.0,167.5,355.79
19994,M,SBD,Raw,25.5,126.82,140,232.5,155.0,267.5,372.18
19995,F,SBD,Raw,41.5,50.17,52,115.0,75.0,142.5,426.04
19996,F,SBD,Wraps,23.0,68.95,75,142.5,67.5,140.0,351.87


In [140]:
# Remove the last couple of columns
data = data.drop(columns = ['Event', 'WeightClassKg'])

In [141]:
# Convert string data to numeric
sex = {'M': 1, 'F': 0, 'Mx': 3}
equipment = {'Raw': 0, 'Wraps': 1, 'Multi-ply': 2, 'Single-ply': 3, 'Straps': 4}

data.Sex = [sex[item] for item in data.Sex]
data.Equipment = [equipment[item] for item in data.Equipment]

data

Unnamed: 0,Sex,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,0,1,29.0,65.60,122.5,62.5,150.0,349.07
1,1,1,28.0,89.90,270.0,140.0,260.0,427.97
2,0,1,44.0,51.90,120.0,60.0,142.5,402.64
3,0,1,29.0,50.90,105.0,40.0,112.5,326.33
4,0,1,26.0,74.20,120.0,65.0,155.0,325.41
...,...,...,...,...,...,...,...,...
19991,0,0,29.0,78.61,132.5,85.0,167.5,355.79
19994,1,0,25.5,126.82,232.5,155.0,267.5,372.18
19995,0,0,41.5,50.17,115.0,75.0,142.5,426.04
19996,0,1,23.0,68.95,142.5,67.5,140.0,351.87


In [142]:
# Normalise the data, scaling to values between 0-1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']] = scaler.fit_transform(data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']])

In [143]:
# So, running 3 different test and training sets.
# First for squats, then bench, finally deadlifts,
# See the model accuracy for predicting someone's lifts.
# Further experiments to see which variables are critical
# to accurate prediction. How accurate can we get with age,
# sex, weight, and equipment?

test_squat = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg']]

target_squat = data[['Best3SquatKg']]

test_bench = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3SquatKg', 'Best3DeadliftKg']]

target_bench = data[['Best3BenchKg']]

test_dead = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3SquatKg']]

target_dead = data[['Best3DeadliftKg']]

In [144]:
# Preparing sets
S = test_squat
s = target_squat
B = test_bench
b = target_bench
D = test_dead
d = target_dead

In [145]:
# Split sets into training and test sets
from sklearn.model_selection import train_test_split
S_train, S_test, s_train, s_test = train_test_split(S, s, test_size=0.1, random_state=3)
B_train, B_test, b_train, b_test = train_test_split(B, b, test_size=0.1, random_state=3)
D_train, D_test, d_train, d_test = train_test_split(D, d, test_size=0.1, random_state=3)

In [146]:
# Prepare model, then fit the models and check scores!
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10)

In [147]:
model.fit(S_train, s_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [148]:
model.score(S_test, s_test)

0.9646430680110408

In [149]:
model.fit(B_train, b_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [150]:
model.score(B_test, b_test)

0.9335962073175207

In [151]:
model.fit(D_train, d_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [152]:
model.score(D_test, d_test)

0.9455136642423051