In [180]:
# Running RandomForest regressions to discover a lifters best lift
# across the 3 lifts.
import pandas as pd
import numpy as np
import random

from sklearn.datasets import load_digits

In [181]:
# Sampling 20,000 data entries
n = 767672 
s = 40000
skip = sorted(random.sample(range(1,n+1),n-s))
data = pd.read_csv('zeroopenpowerlifting.csv', skiprows=skip)
data

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,TotalKg,Place,Wilks,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date
0,Andrea Rowan,F,SBD,Wraps,45.0,45-49,F-OR,104.00,110,120.0,...,390.0,3,321.25,338.91,274.56,550.08,,,GPC-AUS,2018-10-27
1,Dakoda Plumridge,F,SBD,Wraps,27.0,24-34,F-OR,78.60,82.5,172.5,...,492.5,1,455.17,455.17,399.56,773.08,,Australia,GPC-AUS,2018-10-27
2,Nicole Brown,F,SBD,Raw,28.0,24-34,F-OR,57.70,60,82.5,...,247.5,5,284.47,284.47,251.48,439.90,,,GPC-AUS,2018-10-27
3,Andrew Yuile,M,SBD,Wraps,36.0,35-39,M-OR,79.50,82.5,112.5,...,345.0,9,236.47,236.47,228.46,317.40,,,GPC-AUS,2018-10-27
4,Dean Panopoulos,M,SBD,Wraps,26.0,24-34,M-OR,81.70,82.5,202.5,...,565.0,4,380.73,380.73,367.63,562.92,,Australia,GPC-AUS,2018-10-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,Terri VandeVegte,F,BD,Raw,59.0,55-59,wm2,122.47,90+,0.0,...,237.5,1,189.23,248.84,160.16,,Yes,USA,NASA,2019-06-22
39996,Caleb Winey,M,SBD,Wraps,20.0,20-23,jr,93.71,100,0.0,...,572.5,2,358.35,369.11,342.88,520.92,Yes,USA,NASA,2019-06-22
39997,Nathan Crawford,M,BD,Raw,52.5,50-54,m2,81.19,82.5,0.0,...,265.0,1,179.26,208.84,172.61,,Yes,USA,NASA,2019-06-08
39998,Emmanuel Bernal,M,SBD,Wraps,28.0,24-34,int,89.90,90,0.0,...,510.0,1,325.77,325.77,312.23,467.69,Yes,USA,NASA,2019-05-18


In [182]:
# Dropping columns not needed for this test
data = data.drop(columns=['Name', 'AgeClass', 'Division',
                          'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',
                          'Place', 'Tested', 'Country', 'Federation',
                          'Date', 'IPFPoints', 'TotalKg', 'Squat1Kg',
                          'Squat2Kg', 'Squat3Kg','Bench1Kg', 'Bench2Kg',
                          'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg',
                          'Deadlift3Kg', 'McCulloch', 'Glossbrenner',
                          'WeightClassKg'])

data.columns
data.isnull().sum()

Sex                   0
Event                 0
Equipment             0
Age                   0
BodyweightKg          0
Best3SquatKg          0
Best3BenchKg          0
Best3DeadliftKg       0
Wilks              2326
dtype: int64

In [183]:
# General dropna just  to clear those without a WILKS
data = data.dropna()

In [184]:
# Dropping athletes who did not compete in complete SBD competitions
indexNames = data[data['Event'] == 'S'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'B'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'D'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SD'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'SB'].index
data.drop(indexNames, inplace=True)

indexNames = data[data['Event'] == 'BD'].index
data.drop(indexNames, inplace=True)

In [185]:
data

Unnamed: 0,Sex,Event,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,F,SBD,Wraps,45.0,104.00,140.0,80.0,170.0,321.25
1,F,SBD,Wraps,27.0,78.60,182.5,105.0,205.0,455.17
2,F,SBD,Raw,28.0,57.70,90.0,55.0,102.5,284.47
3,M,SBD,Wraps,36.0,79.50,125.0,77.5,142.5,236.47
4,M,SBD,Wraps,26.0,81.70,215.0,120.0,230.0,380.73
...,...,...,...,...,...,...,...,...,...
39991,M,SBD,Wraps,56.0,123.60,320.0,190.0,290.0,456.94
39993,M,SBD,Wraps,36.0,84.82,162.5,107.5,200.0,309.79
39996,M,SBD,Wraps,20.0,93.71,190.0,147.5,235.0,358.35
39998,M,SBD,Wraps,28.0,89.90,182.5,115.0,212.5,325.77


In [186]:
# Remove the last couple of columns
data = data.drop(columns = ['Event'])

In [187]:
# Convert string data to numeric
sex = {'M': 1, 'F': 0, 'Mx': 3}
equipment = {'Raw': 0, 'Wraps': 1, 'Multi-ply': 2, 'Single-ply': 3, 'Straps': 4}

data.Sex = [sex[item] for item in data.Sex]
data.Equipment = [equipment[item] for item in data.Equipment]

data

Unnamed: 0,Sex,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,0,1,45.0,104.00,140.0,80.0,170.0,321.25
1,0,1,27.0,78.60,182.5,105.0,205.0,455.17
2,0,0,28.0,57.70,90.0,55.0,102.5,284.47
3,1,1,36.0,79.50,125.0,77.5,142.5,236.47
4,1,1,26.0,81.70,215.0,120.0,230.0,380.73
...,...,...,...,...,...,...,...,...
39991,1,1,56.0,123.60,320.0,190.0,290.0,456.94
39993,1,1,36.0,84.82,162.5,107.5,200.0,309.79
39996,1,1,20.0,93.71,190.0,147.5,235.0,358.35
39998,1,1,28.0,89.90,182.5,115.0,212.5,325.77


In [188]:
# Normalise the data, scaling to values between 0-1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']] = scaler.fit_transform(data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg', 'Best3SquatKg']])

In [189]:
# So, running 3 different test and training sets.
# First for squats, then bench, finally deadlifts,
# See the model accuracy for predicting someone's lifts.
# Further experiments to see which variables are critical
# to accurate prediction. How accurate can we get with age,
# sex, weight, and equipment?

test_squat = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3DeadliftKg']]

target_squat = data[['Best3SquatKg']]

test_bench = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3SquatKg', 'Best3DeadliftKg']]

target_bench = data[['Best3BenchKg']]

test_dead = data[['Sex', 'Equipment', 'Age', 'BodyweightKg',
        'Wilks', 'Best3BenchKg', 'Best3SquatKg']]

target_dead = data[['Best3DeadliftKg']]

In [209]:
# Preparing sets
S = test_squat
s = target_squat
B = test_bench
b = target_bench
D = test_dead
d = target_dead

In [210]:
from sklearn.datasets import load_iris
S, s = load_iris(return_X_y=True)
B, b = load_iris(return_X_y=True)
D, d = load_iris(return_X_y=True)

In [211]:
# Split sets into training and test sets
from sklearn.model_selection import train_test_split
S_train, S_test, s_train, s_test = train_test_split(S, s, test_size=0.1, random_state=3)
B_train, B_test, b_train, b_test = train_test_split(B, b, test_size=0.1, random_state=3)
D_train, D_test, d_train, d_test = train_test_split(D, d, test_size=0.1, random_state=3)

In [212]:
data

Unnamed: 0,Sex,Equipment,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,Wilks
0,0.000000,0.333333,0.362416,0.396226,0.262911,0.190476,0.414634,0.379157
1,0.000000,0.333333,0.120805,0.241631,0.342723,0.250000,0.500000,0.566594
2,0.000000,0.000000,0.134228,0.114425,0.169014,0.130952,0.250000,0.327679
3,0.333333,0.333333,0.241611,0.247109,0.234742,0.184524,0.347561,0.260497
4,0.333333,0.333333,0.107383,0.260499,0.403756,0.285714,0.560976,0.462406
...,...,...,...,...,...,...,...,...
39991,0.333333,0.333333,0.510067,0.515520,0.600939,0.452381,0.707317,0.569071
39993,0.333333,0.333333,0.241611,0.279489,0.305164,0.255952,0.487805,0.363117
39996,0.333333,0.333333,0.026846,0.333597,0.356808,0.351190,0.573171,0.431083
39998,0.333333,0.333333,0.134228,0.310408,0.342723,0.273810,0.518293,0.385483


In [228]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

model = LogisticRegression(random_state = 3)

In [229]:
model.fit(S_train, s_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [230]:
model.score(S_test, s_test)

1.0

In [231]:
model.fit(B_train, b_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [234]:
model.score(B_test, b_test)

1.0

In [233]:
model.fit(D_train, d_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [235]:
model.score(D_test, d_test)

1.0