# Prediction Model v1
The goal of this notebook is to build a training pipeline that prepares the data (imputing missing values and one-hot encoding categorical features as dummy variables) and then trains a model to predict `total_lift`.

In [None]:
!pip install wandb pandas pyarrow opacus

Collecting opacus
  Downloading opacus-1.5.4-py3-none-any.whl.metadata (8.7 kB)
Downloading opacus-1.5.4-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opacus
Successfully installed opacus-1.5.4


In [None]:
# Import statements
import pandas as pd
import wandb
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error
import numpy as np
import tensorflow as tf
import keras

from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf

In [None]:
# Connect to wandb and pull the versioned athletes dataset.
# The run is used as a context manager so it is always finished, even if the
# artifact download or CSV parsing raises; previously an exception before
# run.finish() would leave the run open.
with wandb.init(project="mlops-datasets", job_type="load-dataset") as run:
    art = run.use_artifact("smehta15-university-of-chicago/mlops-datasets/athletes:v2")
    path = art.download()  # local directory that holds the artifact files
    train = pd.read_csv(f"{path}/train.csv")
    test = pd.read_csv(f"{path}/test.csv")

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msmehta15[0m ([33msmehta15-university-of-chicago[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact 'athletes:v2', 89.22MB. 4 files...
[34m[1mwandb[0m:   4 of 4 files downloaded.  
Done. 00:00:04.0 (22.4MB/s)


In [None]:
# Discard rows whose target ('total_lift') is missing, in both splits.
train = train.dropna(subset=['total_lift'])
test = test.dropna(subset=['total_lift'])

In [None]:
# Preview the first rows of the filtered training frame to sanity-check the columns.
train.head()

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
0,South East,Male,35.0,69.0,192.0,295.0,225.0,465.0,400.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385.0
1,Latin America,Male,27.0,68.0,164.0,254.0,187.0,397.0,397.0,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235.0
2,North East,Male,48.0,64.0,155.0,185.0,135.0,415.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I typically rest 4 or more days per month|,2-4 years|,1050.0
3,North East,Female,22.0,63.0,136.0,140.0,105.0,265.0,200.0,I eat quality foods but don't measure the amou...,I played college sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,1-2 years|,710.0
4,South East,Female,22.0,63.0,139.0,205.0,165.0,300.0,275.0,I eat whatever is convenient|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,1-2 years|,945.0


In [None]:
# Numeric features: replace missing values with the constant 0.
# SimpleImputer only honours `fill_value` when strategy='constant'; with the
# default strategy ('mean') the fill_value=0 argument was silently ignored and
# the column mean was imputed instead.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode to a dense array. Categories not seen during fit are
# ignored at transform time (handle_unknown='ignore') instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

class ColumnDropper():
    """Transformer-style helper that removes a fixed set of columns.

    Provides ``fit`` and ``transform`` so it can be used as a pipeline step.
    """

    def __init__(self, columns):
        # Column labels that ``transform`` removes from its input.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``self.columns`` removed; ``X`` is not modified."""
        return X.drop(columns=self.columns)

# NOTE(review): `column_transformer` is not referenced anywhere else in the
# visible notebook, and 'name' is not among the columns shown by train.head();
# applying this pipeline to `train` would presumably raise a KeyError.
# Confirm whether it is dead code.
column_transformer = Pipeline(steps=[
    ('dropper', ColumnDropper(['name', 'total_lift']))
])


# Route columns by dtype: numeric columns (with the target 'total_lift' removed)
# go through the numeric pipeline, non-numeric columns through the categorical
# pipeline; remainder='drop' discards anything not selected by either.
# NOTE(review): the numeric features include candj, snatch, deadlift and backsq,
# and in the rows shown by train.head() these four sum exactly to total_lift —
# possible target leakage; confirm this is intended.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, train.select_dtypes(include='number').columns.drop(['total_lift'])),
    ('cat', categorical_transformer, train.select_dtypes(exclude=['number']).columns),
],
                                 remainder='drop')

In [None]:
# Fit the preprocessor on the training frame and transform it in one step.
transformed_train = preprocessor.fit_transform(train)

In [None]:
# Inspect the first preprocessed row: numeric features first, then one-hot columns.
transformed_train[0]

array([ 35.,  69., 192., 295., 225., 465., 400.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,
         0.,   0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   

In [None]:
# Apply the preprocessor fitted on `train` to the test frame (transform only, no refit).
transformed_test = preprocessor.transform(test)

In [None]:
# Training features and regression target.
# NOTE(review): `X` is not referenced again in the visible cells (the tensors
# are built from `transformed_train` directly); `y` is used for the target tensor.
X = transformed_train
y = train['total_lift']

In [None]:
import torch

def _as_float32(values):
    """Copy array-like ``values`` into a new ``torch.float32`` tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Preprocessed feature matrices and the training target as float32 tensors.
X_train_tensor = _as_float32(transformed_train)
X_test_tensor = _as_float32(transformed_test)
y_train_tensor = _as_float32(y.values)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32

# Test targets as a float32 tensor, matching the dtype of the training targets.
y_test_tensor = torch.tensor(test['total_lift'].values, dtype=torch.float32)

# Pair each feature row with its target.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Shuffle only the training data; the test loader keeps the original row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn as nn

class PrivateRegressionModel(nn.Module):
    """Fully connected regressor: input_dim -> 2048 -> 1024 -> 512 -> 64 -> 1.

    Each hidden linear layer is followed by a ReLU; the final linear layer
    emits one value per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Register layer_k / relu_k pairs in the order they are applied, so the
        # submodule names and their ordering stay layer_1, relu_1, ..., relu_4.
        widths = (input_dim, 2048, 1024, 512, 64)
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"layer_{idx}", nn.Linear(n_in, n_out))
            setattr(self, f"relu_{idx}", nn.ReLU())
        self.output_layer = nn.Linear(64, 1)

    def forward(self, x):
        """Run ``x`` through the four hidden blocks, then the linear output layer."""
        hidden = x
        for idx in range(1, 5):
            linear = getattr(self, f"layer_{idx}")
            activation = getattr(self, f"relu_{idx}")
            hidden = activation(linear(hidden))
        return self.output_layer(hidden)

# Size the first layer from the number of columns in the preprocessed training tensor.
input_dim = X_train_tensor.shape[1]
model = PrivateRegressionModel(input_dim)

# Print the module tree as a quick check of the layer sizes.
print(model)

PrivateRegressionModel(
  (layer_1): Linear(in_features=305, out_features=2048, bias=True)
  (relu_1): ReLU()
  (layer_2): Linear(in_features=2048, out_features=1024, bias=True)
  (relu_2): ReLU()
  (layer_3): Linear(in_features=1024, out_features=512, bias=True)
  (relu_3): ReLU()
  (layer_4): Linear(in_features=512, out_features=64, bias=True)
  (relu_4): ReLU()
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
import torch.optim as optim
import torch.nn as nn

# Loss used for both training and evaluation.
criterion = nn.L1Loss() # Mean Absolute Error
# Adam over all model parameters with learning rate 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
from opacus import PrivacyEngine

# Privacy targets handed to Opacus via target_delta / target_epsilon below.
DELTA = 1e-5 
EPSILON = 10.0 
privacy_engine = PrivacyEngine()


# Rebind model, optimizer and train_loader to the objects returned by Opacus.
# NOTE(review): because the three names are rebound, re-running this cell passes
# the already-wrapped objects back in — presumably an error; re-create the model,
# optimizer and loader first. Confirm against the Opacus documentation.
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    target_delta=DELTA,
    target_epsilon=EPSILON,
    # Must stay in sync with `epochs` in the training-loop cell (see its comment).
    epochs=20, 
    # presumably the per-sample gradient clipping bound — TODO confirm in Opacus docs
    max_grad_norm=1.0, 
)



In [None]:
import torch
from tqdm.notebook import tqdm 

epochs = 20 # Use the same number of epochs as in PrivacyEngine
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    # Use tqdm for a progress bar
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        # Drop only the trailing output axis: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse the batch axis for a
        # single-sample batch and no longer match the labels' shape.
        loss = criterion(outputs.squeeze(-1), labels)

        loss.backward()
        optimizer.step()

        # criterion averages within the batch, so this sums per-batch means.
        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

print("Finished Training")

Epoch 1/20:   0%|          | 0/751 [00:00<?, ?it/s]



Epoch 1, Loss: 52.81245032155244


Epoch 2/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 2, Loss: 11.826056524813572


Epoch 3/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 3, Loss: 10.504825797046072


Epoch 4/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 4, Loss: 10.846652452621893


Epoch 5/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 5, Loss: 13.56033233708929


Epoch 6/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 6, Loss: 13.234736306054614


Epoch 7/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 7, Loss: 13.377967080009602


Epoch 8/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 8, Loss: 10.940599970747723


Epoch 9/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 9, Loss: 10.720322550374881


Epoch 10/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 10, Loss: 9.089123106986959


Epoch 11/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 11, Loss: 10.365086218805986


Epoch 12/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 12, Loss: 10.103880853056115


Epoch 13/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 13, Loss: 9.316539881391309


Epoch 14/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 14, Loss: 9.878761967234858


Epoch 15/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 15, Loss: 8.355075082036056


Epoch 16/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 16, Loss: 7.955521815150143


Epoch 17/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 17, Loss: 9.565845695221313


Epoch 18/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 18, Loss: 10.111943088898487


Epoch 19/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 19, Loss: 6.200878679672983


Epoch 20/20:   0%|          | 0/751 [00:00<?, ?it/s]

Epoch 20, Loss: 8.105475286192965
Finished Training


In [None]:
# Evaluate on the test loader with gradients disabled.
model.eval()
test_loss = 0.0  # running sum of absolute errors over all test samples
n_samples = 0    # number of test samples seen so far
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        # Drop only the trailing output axis so a single-sample final batch
        # keeps shape (1,) and matches the labels.
        loss = criterion(outputs.squeeze(-1), labels)
        # criterion (nn.L1Loss) returns the batch mean; weight it by the batch
        # size so a shorter final batch is not over-weighted in the average.
        batch_n = labels.size(0)
        test_loss += loss.item() * batch_n
        n_samples += batch_n

# Per-sample mean absolute error over the whole test set.
average_test_loss = test_loss / n_samples
print(f"Test Loss (MAE): {average_test_loss}")

Test Loss (MAE): 8.17280253704558
