# Prediction Model v2
Here, my goal is to create a training pipeline that prepares the data through imputation, dummy variables, and ultimately modeling.

In [1]:
!pip install wandb pandas pyarrow



In [2]:
# Import statements
import pandas as pd
import wandb
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error
import numpy as np
import tensorflow as tf
import keras

from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf

In [3]:
# Connect to wandb
run = wandb.init(project="mlops-datasets", job_type = "load-dataset")
art = run.use_artifact("smehta15-university-of-chicago/mlops-datasets/athletes:v2")
path = art.download()
train = pd.read_csv(f"{path}/train.csv")
test = pd.read_csv(f"{path}/test.csv")
run.finish()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msmehta15[0m ([33msmehta15-university-of-chicago[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact 'athletes:v2', 89.22MB. 4 files...
[34m[1mwandb[0m:   4 of 4 files downloaded.  
Done. 00:00:05.9 (15.1MB/s)


In [4]:
train = train[train['total_lift'].notna()]
test = test[test['total_lift'].notna()]

In [5]:
train.head()

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
0,South East,Male,35.0,69.0,192.0,295.0,225.0,465.0,400.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385.0
1,Latin America,Male,27.0,68.0,164.0,254.0,187.0,397.0,397.0,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235.0
2,North East,Male,48.0,64.0,155.0,185.0,135.0,415.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I typically rest 4 or more days per month|,2-4 years|,1050.0
3,North East,Female,22.0,63.0,136.0,140.0,105.0,265.0,200.0,I eat quality foods but don't measure the amou...,I played college sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,1-2 years|,710.0
4,South East,Female,22.0,63.0,139.0,205.0,165.0,300.0,275.0,I eat whatever is convenient|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,1-2 years|,945.0


In [6]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value=0))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

class ColumnDropper():
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X, y=None):
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self

column_transformer = Pipeline(steps=[
    ('dropper', ColumnDropper(['name', 'total_lift']))
])


preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, train.select_dtypes(include='number').columns.drop(['total_lift'])),
    ('cat', categorical_transformer, train.select_dtypes(exclude=['number']).columns),
],
                                 remainder='drop')

In [7]:
transformed_train = preprocessor.fit_transform(train)

In [8]:
transformed_train[0]

array([ 35.,  69., 192., 295., 225., 465., 400.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,
         0.,   0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   

In [9]:
transformed_test = preprocessor.transform(test)

In [10]:
X = transformed_train
y = train['total_lift']

In [11]:
model = Sequential([
    Dense(2048, activation='relu', input_shape=(transformed_train.shape[1],)),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear') 
])

model.compile(optimizer='adam',
              loss='mean_absolute_error', 
              metrics=['mean_absolute_error']) 

model.fit(X, y, epochs=20)

test_loss, test_mae = model.evaluate(transformed_test, test['total_lift'])
print(f"Test Mean Absolute Error: {test_mae}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - loss: 68.1269 - mean_absolute_error: 68.1269
Epoch 2/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 31.3912 - mean_absolute_error: 31.3912
Epoch 3/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 25.3018 - mean_absolute_error: 25.3018
Epoch 4/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 23.8156 - mean_absolute_error: 23.8156
Epoch 5/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.9499 - mean_absolute_error: 15.9499
Epoch 6/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.6418 - mean_absolute_error: 15.6418
Epoch 7/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.3752 - mean_absolute_error: 15.3752
Epoch 8/20
[1m751/751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

## Writeup
This model performed far better on the updated data version, in part due to the much more minimal dataset. The model was able to pick up on the key details that indicate what someone's total_lift score will be.