# Prediction Model v1
The goal of this notebook is to build a training pipeline that prepares the data (imputing missing values and one-hot encoding categorical variables) and then fits a model to predict `total_lift`.

In [None]:
!pip install wandb pandas pyarrow



In [None]:
# Import statements
import numpy as np
import pandas as pd
import wandb
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
# Connect to wandb, download the versioned `athletes` dataset artifact and
# load the train/test CSV files it contains.
# The run is used as a context manager so that it is always finished, even if
# the artifact download or the CSV parsing raises.
with wandb.init(project="mlops-datasets", job_type = "load-dataset") as run:
    art = run.use_artifact("smehta15-university-of-chicago/mlops-datasets/athletes:v1")
    path = art.download()  # local directory holding the artifact's files
    train = pd.read_csv(f"{path}/train.csv")
    test = pd.read_csv(f"{path}/test.csv")

[34m[1mwandb[0m: Downloading large artifact 'athletes:v1', 211.56MB. 10 files...
[34m[1mwandb[0m:   10 of 10 files downloaded.  
Done. 00:00:00.4 (573.7MB/s)


In [None]:
# Keep only the rows whose target value is known: rows without `total_lift`
# can neither be fitted on nor scored against.
train = train.dropna(subset=['total_lift'])
test = test.dropna(subset=['total_lift'])

In [None]:
# Preview the first five rows of the filtered training frame.
train.head(n=5)

Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong,total_lift
0,209977.0,Dawn Chapman,,,,Female,21.0,60.0,110.0,,...,255.0,190.0,,I eat quality foods but don't measure the amou...,I workout mostly at a CrossFit Affiliate|I wor...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|I do multipl...,Less than 6 months|,655.0
1,155848.0,Kayla O'Brien,North East,CrossFit Woburn,CrossFit Woburn,Female,26.0,65.0,137.0,,...,250.0,205.0,21.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I rec...,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|I strictly s...,1-2 years|,728.0
2,365594.0,Leslie Tamayo,South West,Old Pueblo CrossFit,Old Pueblo CrossFit,Female,22.0,63.0,142.0,,...,290.0,230.0,31.0,,I workout mostly at a CrossFit Affiliate|I rec...,I played youth or high school level sports|I p...,,,2-4 years|,828.0
3,150482.0,Derek Kingston,,,,Male,36.0,69.0,160.0,401.0,...,365.0,245.0,21.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I hav...,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,1-2 years|,930.0
4,112001.0,Amanda Feola,Central East,,KW CrossFit,Female,35.0,63.0,150.0,227.0,...,263.0,273.0,20.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,789.0


In [None]:
# Numeric features: fill missing values with 0, then standardize.
# `fill_value` is only used when strategy='constant'; with the default
# strategy ('mean') it has no effect, so the strategy is spelled out here.
# Standardizing puts every numeric input on a comparable scale before it is
# fed to the neural network.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# Categorical features: replace missing entries with the column's most
# frequent value, then expand each category into a dense 0/1 indicator column.
# Categories that were not seen during fit are encoded as all zeros instead
# of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

class ColumnDropper:
    """Transformer-style helper that removes a fixed set of columns.

    Exposes the `fit` / `transform` pair so it can be used as a step of a
    scikit-learn `Pipeline`.
    """

    def __init__(self, columns):
        """Remember the column labels that `transform` should remove."""
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` without the configured columns.

        `X` itself is not modified. Raises `KeyError` if a configured column
        is missing from `X`.
        """
        return X.drop(columns=self.columns)

# Single-step pipeline that strips the athlete name and the target column.
# NOTE(review): no other cell visible here references this pipeline —
# confirm it is still needed.
column_transformer = Pipeline([
    ('dropper', ColumnDropper(columns=['name', 'total_lift'])),
])


# Columns that must not be used as model inputs:
#   - `total_lift` is the prediction target,
#   - `athlete_id` is a row identifier; it carries no information about the
#     athlete and its magnitude (~1e5) dwarfs every real feature,
#   - `name` is free text identifying a person.
# NOTE(review): `total_lift` may be derived from other lift columns
# (e.g. `deadlift`, `backsq`); if so they leak the target — confirm.
numeric_features = (
    train.select_dtypes(include='number')
    .columns.drop(['total_lift', 'athlete_id'])
)
categorical_features = (
    train.select_dtypes(exclude=['number'])
    .columns.drop(['name'])
)

# Route each group of columns through its own transformer; any column not
# listed in either group is dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [None]:
# Fit the preprocessing on the training data only and return the training
# data as a numeric feature matrix.
transformed_train = preprocessor.fit_transform(train)

In [None]:
# Inspect the encoded feature vector of the first training row.
transformed_train[0]

array([2.09977e+05, 2.10000e+01, 6.00000e+01, ..., 0.00000e+00,
       0.00000e+00, 0.00000e+00])

In [None]:
# Apply the already-fitted preprocessing to the test data (no re-fitting, so
# nothing is learned from the test set).
transformed_test = preprocessor.transform(test)

In [None]:
# Model inputs are the preprocessed training features; the target is the
# `total_lift` column of the training frame.
X, y = transformed_train, train['total_lift']

In [None]:
# Fully connected regression network that predicts `total_lift`.

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape with an explicit Input object; passing
    # `input_shape` to the first Dense layer triggers a Keras warning.
    tf.keras.Input(shape=(transformed_train.shape[1],)),
    Dense(2048, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear'),  # single unbounded output for regression
])

# MAE is both the training loss and the reported metric, so `model.evaluate`
# returns two values: [loss, mean_absolute_error].
model.compile(optimizer='adam',
              loss='mean_absolute_error',
              metrics=['mean_absolute_error'])

model.fit(X, y, epochs=20)

# Score the fitted model on the held-out test set.
test_loss, test_mae = model.evaluate(transformed_test, test['total_lift'])
print(f"Test Mean Absolute Error: {test_mae}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - loss: 1476.1715 - mean_absolute_error: 1476.1715
Epoch 2/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 2023.7762 - mean_absolute_error: 2023.7762
Epoch 3/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 2322.6111 - mean_absolute_error: 2322.6111
Epoch 4/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 577.2027 - mean_absolute_error: 577.2027
Epoch 5/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 647.5872 - mean_absolute_error: 647.5872
Epoch 6/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 3252.8347 - mean_absolute_error: 3252.8347
Epoch 7/20
[1m2130/2130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 2048.1292 - mean_absolute_error: 2048.1292
Epoch 8/20
[1m2130/2130[0m 