In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt_kwargs = {'figsize': (10, 6)}

In [5]:
#Import and clean the data so that there are no null values
powerlifting = pd.read_csv("openpowerlifting.csv")
# We are only concerned about the following variables:
exp_df = powerlifting[['Sex',
                      'Equipment',
                      'Age',
                      'BodyweightKg',
                      'BestSquatKg',
                      'BestBenchKg',
                      'BestDeadliftKg',
                      'TotalKg',
                      'Place',
                      'Wilks']].copy()
exp_df = exp_df[(powerlifting['Place']!='DQ')].copy()
exp_df = exp_df[np.isfinite(exp_df['Age'])]
exp_df = exp_df[np.isfinite(exp_df['BestSquatKg'])]
exp_df = exp_df[np.isfinite(exp_df['BestBenchKg'])]
exp_df = exp_df[np.isfinite(exp_df['BestDeadliftKg'])]
exp_df.dropna(inplace = True)


In [6]:
exp_df.corr()

Unnamed: 0,Age,BodyweightKg,BestSquatKg,BestBenchKg,BestDeadliftKg,TotalKg,Wilks
Age,1.0,0.105376,-0.033221,0.034519,-0.037722,-0.016317,-0.071761
BodyweightKg,0.105376,1.0,0.653526,0.663657,0.639799,0.674396,0.207896
BestSquatKg,-0.033221,0.653526,1.0,0.911168,0.907288,0.976412,0.789293
BestBenchKg,0.034519,0.663657,0.911168,1.0,0.877546,0.956831,0.715002
BestDeadliftKg,-0.037722,0.639799,0.907288,0.877546,1.0,0.962932,0.727998
TotalKg,-0.016317,0.674396,0.976412,0.956831,0.962932,1.0,0.77365
Wilks,-0.071761,0.207896,0.789293,0.715002,0.727998,0.77365,1.0


In [7]:
exp_df['Equipment'].value_counts()

Raw           69977
Wraps         21831
Single-ply    13011
Multi-ply      1843
Name: Equipment, dtype: int64

In [8]:
# Create experimental dataframes for each equipment type
exp1_raw = exp_df[(exp_df['Equipment']=='Raw')].copy()

exp1_single = exp_df[(exp_df['Equipment']=='Single-ply')].copy()

exp1_wraps = exp_df[(exp_df['Equipment']=='Wraps')].copy()

exp1_multi = exp_df[(exp_df['Equipment']=='Multi-ply')].copy()

In [9]:
# This method takes in one of the above experimental dataframes 
# and returns a list of coefficients that help us build the model. 

# Once we create the model for each type of equipment for each event, 
# we will be able to look at how impactful equipment usage really is.

def get_model(df, event):
    X = df[['Wilks']] # You can add predictor variables by doing ['Wilks','Add_Var']
    
    if event == 's':
    # Predict Squats:
        y = df[['BestSquatKg']]
    elif event == 'b':
    # Predict Squats:
        y = df[['BestBenchKg']]
    elif event == 'd':
     # Predict Squats:
        y = df[['BestDeadliftKg']]

    # Split Data 80/20:
    from sklearn.model_selection import train_test_split

    # Split X and y into X_
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

    # Train Model
    from sklearn.linear_model import LinearRegression

    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    # Now that our model is trained, we can view the coefficients of the model using 
    # regression_model.coef_, which is an array of tuples of coefficients.
    # Each regression coefficient shows the strength of the relationship between the predictor 
    # variable and the outcome variable while controlling for the other predictor variable 
    #------
    #coef = pd.DataFrame
    for idx, col_name in enumerate(X_train.columns):
        #coef.append(regression_model.coef_[0][idx])
        print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
    
    intercept = regression_model.intercept_[0]
    
    #tmp = abs(df[['TotalKg']].dropna().sample(10000, random_state=42))
    
    #results = pd.DataFrame
    
    #for temp in tmp:
     #   y = (coef*temp)+intercept
      #  results.append(y)
    
    

    print("The intercept for our model is {}".format(intercept))
    
    
    
    #------
    #coefs = []

    #for idx, col_name in enumerate(X_train.columns):
    #    coefs[idx] == regression_model.coef_[0][idx]

    # regression_model.intercept_ returns an array of intercepts
    #intercept = regression_model.intercept_[0]


    #Now that we know the regression coefficients for each predictor variable and the intercept, 
    # we can figure out our model:

    # ŷ = intercept + coefs[0] * x1 
    
    # Return a list of where coefs[0] is the coefficient and coefs[1] = intercept
    
    #coefs.append(intercept)
    

In [10]:
# Get random sample of 10,000 lifters

    
    

In [11]:
# get trained models for squats
print("squats:")
get_model(exp1_raw, 's')
get_model(exp1_single, 's')
get_model(exp1_wraps, 's')

squats:
The coefficient for Wilks is 0.6619610314826567
The intercept for our model is -63.895481888754716
The coefficient for Wilks is 0.5955788318145823
The intercept for our model is -37.676069642680005
The coefficient for Wilks is 0.7734684967691747
The intercept for our model is -88.98069301953791


In [12]:
# get trained models for bench
print("bench:")
get_model(exp1_raw, 'b')
get_model(exp1_single, 'b')
get_model(exp1_wraps, 'b')

bench:
The coefficient for Wilks is 0.443473210646772
The intercept for our model is -45.259029734241594
The coefficient for Wilks is 0.4246748307372086
The intercept for our model is -37.966692700990905
The coefficient for Wilks is 0.4784556887499858
The intercept for our model is -50.473527341670206


In [13]:
# get trained models for deadlifting
print("dead:")
get_model(exp1_raw, 'd')
get_model(exp1_single, 'd')
get_model(exp1_wraps, 'd')

dead:
The coefficient for Wilks is 0.6617350670124019
The intercept for our model is -34.31277839304954
The coefficient for Wilks is 0.43267028794376505
The intercept for our model is 29.19252153528288
The coefficient for Wilks is 0.6660159747152594
The intercept for our model is -31.358587915646893


In [14]:
# get random sample of 10,000 weightlifters 
get_sample = exp1_raw.sample(100, random_state=42)


In [15]:
s_raw = []
s_single = []
s_wraps = []

for lifter in get_sample:
    
    x = get_sample['Wilks']
    
    # squat
    s_raw.append(.66*x - 63.89)
    s_single.append(.6*x - 37.68)
    s_wraps.append(.77*x -88.99)
    
b_raw = []
b_single = []
b_wraps = []

for lifter in get_sample:
    
    x = get_sample['Wilks']    
    
    # bench
    b_raw.append(.44*x - 45.26)
    b_single.append(.42*x - 37.97)
    b_wraps.append(.48*x - 50.47)
    
    
d_raw = []
d_single = []
d_wraps = []

for lifter in get_sample:
    
    x = get_sample['Wilks']
    
    # dead
    d_raw.append(.66*x - 34.31)
    d_single.append(.43*x - 29.20)
    d_wraps.append(.67*x - 31.36)

In [16]:
def average_contest(list1, list2, list3):
    total1 = 0
    for item in list1:
        total1 = item
    average1 = total1/len(list1)
    
    
    total2 = 0
    for item in list1:
        total2 = item
    average2 = total2/len(list2)
    
    total3 = 0
    for item in list3:
        total3 = item
    average3 = total3/len(list3)
    
    print("Average predicted squat: " +str(average1)+
          "/nAverage predicted bench: " +str(average2)+
          "/nAverage predicted bench: " +str(average3))

In [17]:
average_contest(s_raw, s_single, s_wraps)

Average predicted squat: 296145     9.455620
287892    14.168680
352694    17.388490
268995    17.343940
58791     22.497880
268546     8.816740
368058    14.944180
336796    16.462180
241558    16.301140
13760     13.094200
366555    16.010080
303274    18.287740
333933    18.042220
454       17.606620
328446    20.585860
115986    20.444620
255924    15.426640
366125    19.374100
245703    15.470200
376526    14.305960
7859      13.991140
328084    18.328594
252420    12.968800
356006    15.789640
289787    11.040280
360600    26.385280
365956    12.983980
285816    14.048560
289335    12.270520
241368    13.706020
            ...    
277840    15.444460
261339    16.270780
261618    15.748720
347037    23.696308
268060    17.755120
278479    20.558800
328012    16.314274
360704    24.277900
13832     10.941280
267246    10.248940
63500     18.665920
47315     22.821280
375551    20.457820
46211     13.631440
357715    10.504360
351118     5.942440
254376    15.416740
124955    17.63

In [18]:
# R squared value function
def get_model_R(df, event):
    X = df[['Wilks']] # You can add predictor variables by doing ['Wilks','Add_Var']
    
    
    if event == 's':
    # Predict Squats:
        y = df[['BestSquatKg']]
    elif event == 'b':
     #Predict Squats:
        y = df[['BestBenchKg']]
    elif event == 'd':
     # Predict Squats:
        y = df[['BestDeadliftKg']]
    elif event == 't':
        y= df[['TotalKg']]
    
    #y=df[['']]

    # Split Data 80/20:
    from sklearn.model_selection import train_test_split

    # Split X and y into X_
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

    # Train Model
    from sklearn.linear_model import LinearRegression

    regression_model = LinearRegression()
    predict = regression_model.fit(X_train, y_train)
    
    #rmse = mean_squared_error(predict, y_test)

    # Now that our model is trained, we can view the coefficients of the model using 
    # regression_model.coef_, which is an array of tuples of coefficients.
    # Each regression coefficient shows the strength of the relationship between the predictor 
    # variable and the outcome variable while controlling for the other predictor variable 

    #coefs = []

    ## coefs[idx] == regression_model.coef_[0][idx]

    #regression_model.intercept_ returns an array of intercepts
    #intercept = regression_model.intercept_[0]


    #Now that we know the regression coefficients for each predictor variable and the intercept, 
    # we can figure out our model:

    # ŷ = intercept + coefs[0] * x1 
    
    # Return a list of where coefs[0] is the coefficient and coefs[1] = intercept
    
   # coefs.append(intercept)
    
    #return coefs
    
    return regression_model.score(X_train, y_train)
    

In [20]:
# get trained models for squats
raw_model_squats = get_model_R(exp1_raw, 's')
single_model_squats = get_model_R(exp1_single, 's')
wraps_model_squats = get_model_R(exp1_wraps, 's')
multip_model_sqauts = get_model_R(exp1_multi, 's')


# get trained models for bench
raw_model_bench = get_model_R(exp1_raw, 'b')
single_model_bench = get_model_R(exp1_single, 'b')
wraps_model_bench = get_model_R(exp1_wraps, 'b')
multi_model_bench = get_model_R(exp1_multi, 'b')

# get trained models for deadlifting
raw_model_deadlift = get_model_R(exp1_raw, 'd')
single_model_deadlift = get_model_R(exp1_single, 'd')
wraps_model_deadlift = get_model_R(exp1_wraps, 'd')
multi_model_deadlift = get_model_R(exp1_multi, 'd')

total_model_wraps = get_model_R(exp1_wraps, 't')
total_model_raw = get_model_R(exp1_raw, 't')
total_model_single = get_model_R(exp1_single, 't')
total_model_multi = get_model_R(exp1_multi, 't')

In [22]:
print(raw_model_squats)
print(single_model_squats)
print(wraps_model_squats)
print(multip_model_sqauts)

print()

print(raw_model_bench)
print(single_model_bench)
print(wraps_model_bench)
print(multi_model_bench)


print()

print(raw_model_deadlift)
print(single_model_deadlift)
print(wraps_model_deadlift)
print(multi_model_deadlift)

print()

print(total_model_raw)
print(total_model_single)
print(total_model_wraps)
print(total_model_multi)

0.5789474777397912
0.5352094872817335
0.65830731417038
0.6192533456050635

0.4613475514968949
0.42795768802372725
0.5255984678878163
0.5075097293567068

0.5635028875182737
0.4350490940425281
0.6081140774393932
0.4937332267854987

0.5719288809614463
0.5037360378880311
0.6462580921161126
0.6332857937736782
