In [None]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset
powerlifters = pd.read_csv("Data//powerlifting.csv")

# Clean the raw data and save the result to a new CSV:
#   1. Replace empty / whitespace-only cells with NaN. The pattern is anchored
#      (^\s*$): an unanchored r'\s+' matches every cell that merely *contains*
#      whitespace, which would turn multi-word text values into NaN.
#   2. Replace negative numbers in the columns at positions 9-22 with 0
#      (presumably the per-attempt lift columns -- TODO confirm against the CSV header).
#   3. Spell out the "M" / "F" codes of the 'Sex' column.
modifiedPowerlifters = powerlifters.replace(r'^\s*$', np.nan, regex=True)

# Vectorised clamp. NaN < 0 is False, so missing values are left untouched.
# (The previous per-row loop relied on Series.iteritems(), removed in pandas 2.0.)
lift_cols = modifiedPowerlifters.columns[9:23]
lift_values = modifiedPowerlifters[lift_cols]
modifiedPowerlifters[lift_cols] = lift_values.mask(lift_values < 0, 0)

# Recode only the 'Sex' column: a frame-wide replace would also rewrite any
# other column that happens to hold a bare "M" or "F".
modifiedPowerlifters['Sex'] = modifiedPowerlifters['Sex'].replace({"M": "Male", "F": "Female"})
modifiedPowerlifters.to_csv('Data//modifiedPowerlifters.csv', index=False)

# Override 'powerlifters' variable to read newly modified CSV file
powerlifters = pd.read_csv("Data//modifiedPowerlifters.csv")

# Convert "Date" column to datetime for easier Data Exploration later;
# values that cannot be parsed become NaT (errors="coerce")
powerlifters['Date'] = pd.to_datetime(powerlifters['Date'], errors="coerce")

In [None]:
# Statistical Description
# The summary table is the cell's last expression, so it is rendered as the cell output.
powerlifters.describe()

In [None]:
# Column names, followed by the per-column dtype / non-null summary
print(powerlifters.columns)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
powerlifters.info()

In [None]:
# Top 20 rows, printed as plain text
print(powerlifters.head(20))

In [None]:
# Number of Females and Males: row count for each distinct value of 'Sex'
print(powerlifters.groupby('Sex').size())

In [None]:
# Number of rows for each different type of equipment
print(powerlifters.groupby('Equipment').size())

In [None]:
# Number of rows in each of the different Age Classes
print(powerlifters.groupby('AgeClass').size())

In [None]:
# Number of rows in each of the different Weight Classes
print(powerlifters.groupby('WeightClassKg').size())

In [None]:
# Bar chart of the number of rows per value of 'Sex'
powerlifters['Sex'].value_counts().plot.bar(title='Male vs Female')

In [None]:
# Same counts as a pie chart
powerlifters['Sex'].value_counts().plot(kind='pie')

In [None]:
# Bar chart of the number of rows per weight class
powerlifters['WeightClassKg'].value_counts().plot.bar(
    figsize=(16,8),
    title='Count of Powerlifters in WeightClass(Kg)',
)

In [None]:
# Data type of every column (shown as the cell output)
powerlifters.dtypes

In [None]:
# Histogram of 'Age' with grid lines; the grid is switched on directly on the
# Axes returned by the plot call
powerlifters['Age'].plot.hist(bins=80, figsize=(16,8), title='age').grid(True)

In [None]:
# Histogram of the year component of 'Date'
powerlifters['Date'].dt.year.plot.hist(
    bins=50,
    figsize=(16,8),
    title='Number of Powerlifters by Year',
)

In [None]:
# Histogram of 'TotalKg' drawn with a logarithmic y-axis
powerlifters['TotalKg'].hist(bins=200, figsize=(16,8)).set_yscale('log')

In [None]:
# Histogram of 'Best3SquatKg'; hist() returns the current Axes, which is
# therefore what the cell displays
powerlifters['Best3SquatKg'].hist(bins=100, figsize=(16,8))

In [None]:
# Histogram of 'Best3DeadliftKg'; hist() returns the current Axes, which is
# therefore what the cell displays
powerlifters['Best3DeadliftKg'].hist(bins=100, figsize=(16,8))

In [None]:
# Histogram of 'Best3BenchKg'; hist() returns the current Axes, which is
# therefore what the cell displays
powerlifters['Best3BenchKg'].hist(bins=100, figsize=(16,8))

In [None]:
# Number of rows per age, ordered by age rather than by frequency
powerlifters['Age'].value_counts().sort_index().plot(
    kind='bar',
    figsize=(16,10),
    title='Powerlifters Age',
)

In [None]:
# Overlay the age histograms of both sexes on the same axes (Male first, then Female)
for sex in ('Male', 'Female'):
    powerlifters.loc[powerlifters['Sex'] == sex, 'Age'].hist(bins=89, figsize=(16,10))
plt.show()

In [None]:
# Contingency table: one row per age, one column per sex, cell = number of rows
powerlifter_gender_age = pd.crosstab(index=powerlifters['Age'], columns=powerlifters['Sex'])
powerlifter_gender_age.plot(kind='bar', stacked=True, figsize=(16,10), title='Powerlifters Age')

In [None]:
# Redraw the stacked age/sex chart with horizontal tick labels and save it to disk
powerlifter_gender_age.plot(
    kind='bar',
    stacked=True,
    figsize=(16,10),
    title="Powerlifter's Age and Gender",
)
plt.xticks(rotation=0)
plt.savefig('graphs/genders-ages.png', bbox_inches='tight')

In [None]:
# Bar chart of the number of rows per equipment type
powerlifters['Equipment'].value_counts().plot.bar(figsize=(16,10))

In [None]:
# Bar chart of the number of rows recorded for each event type
# (SBD was observed to be the most frequent)
powerlifters['Event'].value_counts().plot.bar(figsize=(16,10))

In [None]:
# Count the rows for each equipment type. value_counts() orders its result by
# frequency, so the wedge labels are read from its index instead of being
# hard-coded: every label is then guaranteed to sit next to its own count.
powerlifter_equipment = powerlifters['Equipment'].value_counts()
powerlifter_size = powerlifter_equipment.values.tolist()
powerlifter_labels = powerlifter_equipment.index.tolist()
colours = ['#F38181','#EAFFD0','#95E1D3','#FCE38A','#BDE4F4','#9EF4E6']
# Pull out only the first (most frequent) wedge. The list is sized from the
# data, so the cell keeps working if the number of equipment types changes.
explode = [0.1 if position == 0 else 0 for position in range(len(powerlifter_size))]

fig = plt.figure(figsize=(12,8))
plt.title('Powerlifters Equipment Type (Percentange)', fontsize=22)
patches, texts, autotexts = plt.pie(powerlifter_size, explode=explode, labels=powerlifter_labels, colors=colours,
        autopct='%1.1f%%', shadow=True, startangle=150)

# Enlarge both the category labels and the percentage labels
for text,autotext in zip(texts,autotexts):
    text.set_fontsize(14)
    autotext.set_fontsize(14)

# Equal aspect ratio so the pie is drawn as a circle
plt.axis("equal")
plt.show()

In [None]:
# Share of rows per event type as a pie chart
powerlifters['Event'].value_counts().plot(kind='pie', figsize=(12,12))

In [None]:
# General plots: pairwise relationships between the selected columns, coloured by sex
pairplot_columns = [
    'Age', 'Sex', 'BodyweightKg',
    'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg',
    'TotalKg',
]
sns.pairplot(powerlifters[pairplot_columns], hue='Sex')

In [None]:
# Linear regression plots: the same kind of pairwise grid, with a fitted
# regression line per sex (kind='reg')
regression_columns = [
    'Sex', 'BodyweightKg',
    'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg',
    'TotalKg',
]
sns.pairplot(powerlifters[regression_columns], hue='Sex', kind='reg')

In [None]:
# Rows whose 'Date' could not be parsed are NaT and have no year; they are left
# out before the integer cast, because casting NaN to int raises. The builtin
# int replaces np.int, an alias that was removed in NumPy 1.24.
dated_lifters = powerlifters[powerlifters['Date'].notna()]
years_division = pd.crosstab(dated_lifters['Date'].dt.year.astype(int), dated_lifters['WeightClassKg'])
years_division.plot(figsize=(16,10), title='# powerlifters per year and weight class')

In [None]:
# Rows whose 'Date' could not be parsed are NaT and have no year; they are left
# out before the integer cast, because casting NaN to int raises. The builtin
# int replaces np.int, an alias that was removed in NumPy 1.24.
dated_lifters = powerlifters[powerlifters['Date'].notna()]
years_division = pd.crosstab(dated_lifters['Date'].dt.year.astype(int), dated_lifters['Country'])
years_division.plot(figsize=(16,10), title='# powerlifters per year and country')

In [None]:
# Grid of histograms produced by DataFrame.hist for the columns of `powerlifters`
powerlifters.hist(figsize=(16,12))
plt.show()

In [None]:
# Scatter plot: best deadlift on the x-axis against best bench press on the y-axis
plt.scatter(
    x=powerlifters['Best3DeadliftKg'],
    y=powerlifters['Best3BenchKg'],
)

In [None]:
# Number of rows for every (country, event) combination
countries_event = pd.crosstab(powerlifters['Country'], powerlifters['Event'])

f, ax = plt.subplots(figsize=(16,10))

# Hide the combinations that never occur (count of 0). The mask is derived
# directly from the comparison; the previous dtype=np.bool no longer works
# because the np.bool alias was removed in NumPy 1.24.
mask = (countries_event == 0).values

sns.heatmap(countries_event, mask=mask, linewidth=.5, cmap=plt.cm.plasma, ax=ax)

In [None]:
# Number of rows for every (country, equipment) combination
countries_eq = pd.crosstab(powerlifters['Country'], powerlifters['Equipment'])

f, ax = plt.subplots(figsize=(16,10))

# Hide the combinations that never occur (count of 0). The mask is derived
# directly from the comparison; the previous dtype=np.bool no longer works
# because the np.bool alias was removed in NumPy 1.24.
mask = (countries_eq == 0).values

sns.heatmap(countries_eq, mask=mask, linewidth=.5, cmap=plt.cm.plasma, ax=ax)

In [None]:
# Replace every remaining NaN, in every column, with 0.
# NOTE(review): this also zero-fills missing numeric measurements, including any
# column later used as a model input or target -- confirm that 0 is an
# acceptable stand-in for "missing" before training on this frame.
powerlifters = powerlifters.fillna(0)

In [None]:
# Feature selection: build a reduced frame without the columns that are
# irrelevant to this analysis. drop() returns a new DataFrame, so
# `powerlifters` and every result computed from it above stay untouched.
fePowerlifters = powerlifters.drop(columns=[
    # identification / categorisation
    'Name', 'Event', 'AgeClass', 'Division', 'WeightClassKg',
    # individual attempts
    'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg',
    'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg',
    'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
    # placing and scoring
    'Place', 'Wilks', 'McCulloch', 'Glossbrenner', 'IPFPoints',
    # meet / federation details
    'Tested', 'Country', 'Federation', 'Date',
    'MeetCountry', 'MeetState', 'MeetName',
    # other lift results
    'Best3BenchKg', 'Best3SquatKg', 'TotalKg',
])

In [None]:
# Columns that remain in the reduced feature frame
print(fePowerlifters.columns)

In [None]:
# Summary statistics of the reduced feature frame, printed as plain text
print(fePowerlifters.describe())

In [None]:
# One-hot encode the categorical columns ('Sex' and 'Equipment').
# features_pl keeps every column, including the target.
features_pl = pd.get_dummies(fePowerlifters, columns=['Sex', 'Equipment'])

# Only the deadlift ('dl') model is built in this notebook. The squat, bench
# and total variants that used to be sketched here cannot be derived from
# fePowerlifters, which no longer has Best3SquatKg, Best3BenchKg or TotalKg.

# Model inputs for the Best3DeadliftKg analysis: the encoded frame without the
# output value, which is used as 'y' for training
features_dl = features_pl.drop(columns=['Best3DeadliftKg'])

# Separate frame identical to 'features_pl', kept for use later
features_impt = features_pl.copy()

# X is the input data, y is the output data (the value we are predicting)
X = features_dl.values
y = features_pl['Best3DeadliftKg'].values

# Shared settings for model evaluation
seed = 7
scoring = 'neg_mean_absolute_error'

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set and train on the remaining 70%;
# the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=seed)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Spot Checking Algorithms
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import ensemble
from sklearn.svm import SVR

# Candidate regressors as (display name, estimator) pairs
models = [
    ('Linear Regression', LinearRegression()),
    ('SGDRegressor', SGDRegressor(max_iter=1000, tol=1e-3)),
    ('Ridge', Ridge()),
    ('LASSO', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNeighborsRegressor', KNeighborsRegressor()),
    ('DecisionTreeRegressor', DecisionTreeRegressor()),
    ('GradientBoostingRegressor', ensemble.GradientBoostingRegressor()),
    ('SupportVectorMachines', SVR(gamma='auto')),
    ('RandomForestRegressor', ensemble.RandomForestRegressor(max_depth=2, random_state=seed, n_estimators=100)),
]

# 10 consecutive, unshuffled folds, shared by every model. random_state is not
# passed: it has no effect while shuffle is False, and scikit-learn >= 0.24
# raises a ValueError for that combination.
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model in turn; prints "<name>: <mean score> (<std>)"
# using the metric selected in `scoring`
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
from sklearn.model_selection import GridSearchCV
# joblib is a standalone package. The vendored sklearn.externals.joblib copy
# was deprecated in scikit-learn 0.21 and removed in 0.23.
try:
    import joblib
except ImportError:  # old installs that only ship the vendored copy
    from sklearn.externals import joblib

# Implement Grid Search to determine the best parameters for Algorithm (GradientBoostingRegressor)
# Run only if needed: the grid below has 1,296 parameter combinations and
# every one of them is cross-validated.

gbr = ensemble.GradientBoostingRegressor()

# Parameters we want to try.
# NOTE: scikit-learn 1.0 renamed the losses 'ls' -> 'squared_error' and
# 'lad' -> 'absolute_error' (the old names were removed in 1.2); update the
# 'loss' values when running on a newer release.
param_grid = {
    'n_estimators': [500, 1000, 3000],
    'max_depth': [2, 4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

# Define the grid search we want to run, with six parallel jobs (n_jobs=6).
# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default score rather than the `scoring` metric used for spot
# checking -- confirm this is intended.
gs_cv = GridSearchCV(gbr, param_grid, n_jobs=6, verbose=100)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

In [None]:
# Fit the regression model with the hyper-parameters reported by GridSearch.
# BestParameters after running GridSearch:
# {'learning_rate': 0.01, 'loss': 'ls', 'max_depth': 4, 'max_features': 0.3, 'min_samples_leaf': 3, 'n_estimators': 1000}
gs_best_params = {
    'learning_rate': 0.01,
    'loss': 'ls',
    'max_depth': 4,
    'max_features': 0.3,
    'min_samples_leaf': 3,
    'n_estimators': 1000,
}

# The combination that worked best, seeded for repeatable results
gbr = ensemble.GradientBoostingRegressor(random_state=seed, **gs_best_params)
gbr.fit(X_train, y_train)

# Persist the fitted model so it can be loaded from other programs
joblib.dump(gbr, "models//gs_trained_deadlift_regressor_model.pkl")

In [None]:
# Fit the regression model with hand-picked hyper-parameters, i.e. the
# baseline that does not use the GridSearch result. (These values are set
# explicitly below; they are not the estimator's defaults.)
manual_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_samples_leaf': 9,
    'max_features': 0.1,
    'loss': 'huber',
}

gbr = ensemble.GradientBoostingRegressor(random_state=seed, **manual_params)
gbr.fit(X_train, y_train)

# Persist the fitted model so it can be loaded from other programs
joblib.dump(gbr, "models//trained_deadlift_regressor_model.pkl")

In [None]:
# Labels of the columns the models were trained on. They must come from
# features_dl (the frame behind X): features_impt still contains the
# 'Best3DeadliftKg' target column, which would shift every label after it by
# one position relative to feature_importances_.
feature_labels = features_dl.columns

# Load the trained models created previously.
# joblib.load unpickles the file -- only load model files produced by this notebook.
model = joblib.load('models//trained_deadlift_regressor_model.pkl')
model_gs = joblib.load('models//gs_trained_deadlift_regressor_model.pkl')

# Create a numpy array based on each model's feature importances
importance = model.feature_importances_
importance_gs = model_gs.feature_importances_

# argsort() orders ascending, so the result is reversed to run from the most
# important feature to the least important one
feature_indexes_by_importance = importance.argsort()[::-1]
feature_indexes_by_importance_gs = importance_gs.argsort()[::-1]

# Print each feature label with its importance as a percentage (two decimals)
print("Most to Least Important Features without GridSearch Optimization")
print("")
for index in feature_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))
print("")

print("Most to Least Important Features with GridSearch Optimization")
print("")
for index in feature_indexes_by_importance_gs:
    print("{} - {:.2f}%".format(feature_labels[index], (importance_gs[index] * 100.0)))
print("")

In [None]:
# Feature vector for one lifter whose deadlift we want to predict. The values
# must be given in exactly the same arrangement as the training columns
# (assumed order shown in the comments below -- verify against features_dl.columns).
weight_lifted = [
    21,    # Age
    80.0,  # BodyweightKg
    0, 1,        # Sex, choose only one: Female, Male
    0, 1, 0, 0,  # Equipment, choose only one: Multi-ply, Raw, Single-ply, Wraps
]

# predict() takes a 2-D input: one row per lifter
lifts_to_value = [weight_lifted]

# Predictions from the baseline model and from the GridSearch-tuned model
predicted_lift_values = model.predict(lifts_to_value)
predicted_lift_values_gs = model_gs.predict(lifts_to_value)
predicted_value, gs_predicted_value = predicted_lift_values[0], predicted_lift_values_gs[0]

print("Before GridSearch: This person has an estimated deadlift of {:,.2f}kg".format(predicted_value))
print("After GridSearch: This person has an estimated deadlift of {:,.2f}kg".format(gs_predicted_value))

In [None]:
from sklearn.metrics import mean_absolute_error

# Report the mean absolute error on the training and test sets, first for the
# baseline model and then for the model using the GridSearch 'best-parameters'.
# A blank line separates the two reports.
evaluated_models = (
    ("Before GridSearch:", model),
    ("After GridSearch:", model_gs),
)
for position, (heading, estimator) in enumerate(evaluated_models):
    if position > 0:
        print("")
    print(heading)
    # Error rate on the training set
    mae = mean_absolute_error(y_train, estimator.predict(X_train))
    print("Training Set Mean Absolute Error: %.4f" % mae)
    # Error rate on the test set
    mae = mean_absolute_error(y_test, estimator.predict(X_test))
    print("Test Set Mean Absolute Error: %.4f" % mae)