In [3]:
import pandas as pd

# Load the raw athlete dataset.
# NOTE(review): this is an absolute, machine-specific path, while other cells in
# this notebook read "data/athletes.csv" relative to the working directory;
# consider one shared, configurable data directory.
df = pd.read_csv('/Users/veeraanand/athlete_project/feature_repo/data/athletes.csv')

# Explore structure. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly; wrapping it in print() would append a
# stray "None" line after the report.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423006 entries, 0 to 423005
Data columns (total 28 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   athlete_id       423003 non-null  float64
 1   name             331110 non-null  object 
 2   region           251262 non-null  object 
 3   team             155160 non-null  object 
 4   affiliate        241916 non-null  object 
 5   gender           331110 non-null  object 
 6   age              331110 non-null  float64
 7   height           159869 non-null  float64
 8   weight           229890 non-null  float64
 9   fran             55426 non-null   float64
 10  helen            30279 non-null   float64
 11  grace            40745 non-null   float64
 12  filthy50         19359 non-null   float64
 13  fgonebad         29738 non-null   float64
 14  run400           22246 non-null   float64
 15  run5k            36097 non-null   float64
 16  candj            104435 non-null  floa

Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong,event_timestamp
0,2554.0,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,...,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,2025-07-17 01:54:15.467059
1,3517.0,Derek Abdella,,,,Male,42.0,70.0,190.0,,...,,,,,I have a coach who determines my programming|I...,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,2025-07-17 01:54:15.467059
2,4691.0,,,,,,,,,,...,,,,,,,,,,2025-07-17 01:54:15.467059
3,5164.0,Abo Brandon,Southern California,LAX CrossFit,LAX CrossFit,Male,40.0,67.0,,211.0,...,375.0,325.0,25.0,I eat 1-3 full cheat meals per week|,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|,4+ years|,2025-07-17 01:54:15.467059
4,5286.0,Bryce Abbey,,,,Male,32.0,65.0,149.0,206.0,...,,325.0,50.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I inc...,I played college sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I strictly s...,1-2 years|,2025-07-17 01:54:15.467059


In [4]:
# Binary target: 1 only when all three thresholds are met, otherwise 0.
# A missing value in any of the three columns compares as False, so those
# athletes are labelled 0 rather than left unlabelled.
strong_deadlift = df['deadlift'].ge(350)
strong_snatch = df['snatch'].ge(150)
strong_pullups = df['pullups'].ge(20)
df['is_high_performer'] = (strong_deadlift & strong_snatch & strong_pullups).astype(int)

# Class balance of the new label
df['is_high_performer'].value_counts()

is_high_performer
0    403698
1     19308
Name: count, dtype: int64

In [3]:
# Baseline (V1) columns, including the label.
# NOTE(review): 'deadlift' and 'pullups' are also inputs to 'is_high_performer',
# so a model trained on these columns sees part of its own label definition.
baseline_columns = ['age', 'height', 'weight', 'deadlift', 'backsq', 'pullups', 'is_high_performer']

# Copy the selection, then keep only rows where every selected column is present
feature_v1 = df[baseline_columns].copy().dropna()

# Report the resulting size and show the first rows
print("Shape:", feature_v1.shape)
feature_v1.head()

Shape: (42307, 7)


Unnamed: 0,age,height,weight,deadlift,backsq,pullups,is_high_performer
6,21.0,72.0,175.0,0.0,0.0,0.0,0
8,30.0,72.0,175.0,0.0,0.0,0.0,0
12,31.0,65.0,150.0,465.0,405.0,81.0,1
13,43.0,71.0,185.0,0.0,0.0,0.0,0
15,23.0,72.0,165.0,419.0,0.0,55.0,1


In [4]:
from datetime import datetime
import os

# Work on a copy so feature_v1 itself is left unmodified
feature_v1_feast = feature_v1.copy()

# Entity key and event timestamp columns.
# NOTE(review): the ids are a fresh 1..N sequence over the surviving rows, not
# the dataset's own 'athlete_id' column; the V2 cell numbers its rows the same
# way, so equal ids across the two feature files need not be the same athlete.
# NOTE(review): datetime.now() is naive local time, whereas the cells that stamp
# athletes.csv use datetime.utcnow() - confirm which convention Feast should see.
n_rows = len(feature_v1_feast)
feature_v1_feast['athlete_id'] = range(1, n_rows + 1)
feature_v1_feast['event_timestamp'] = datetime.now()

# Key and timestamp first (readability only); remaining columns keep their order
key_columns = ['athlete_id', 'event_timestamp']
cols = key_columns + [name for name in feature_v1_feast.columns if name not in key_columns]
feature_v1_feast = feature_v1_feast[cols]

# Ensure the output folder exists, then write the CSV without the index
os.makedirs("feature_repo/data", exist_ok=True)
feature_v1_feast.to_csv("feature_repo/data/feature_v1.csv", index=False)

# Preview what was written
feature_v1_feast.head()

Unnamed: 0,athlete_id,event_timestamp,age,height,weight,deadlift,backsq,pullups,is_high_performer
6,1,2025-07-16 19:19:42.054866,21.0,72.0,175.0,0.0,0.0,0.0,0
8,2,2025-07-16 19:19:42.054866,30.0,72.0,175.0,0.0,0.0,0.0,0
12,3,2025-07-16 19:19:42.054866,31.0,65.0,150.0,465.0,405.0,81.0,1
13,4,2025-07-16 19:19:42.054866,43.0,71.0,185.0,0.0,0.0,0.0,0
15,5,2025-07-16 19:19:42.054866,23.0,72.0,165.0,419.0,0.0,55.0,1


In [None]:
import pandas as pd
from datetime import datetime, timezone

# Load the athlete dataset (path is relative to the notebook's working directory)
df = pd.read_csv("data/athletes.csv")

# Stamp every row with the current UTC time as a naive datetime. This yields the
# same value datetime.utcnow() would, without calling that method, which is
# deprecated as of Python 3.12.
df["event_timestamp"] = datetime.now(timezone.utc).replace(tzinfo=None)

# Write the result back over the source file.
# NOTE: re-running this cell replaces any existing event_timestamp values.
df.to_csv("data/athletes.csv", index=False)

In [4]:
# 📄 File: athletes_pipeline.ipynb

import pandas as pd
from datetime import datetime, timezone

# Load the athlete dataset, stamp every row with the current UTC time, and write
# it back over the same file. The timestamp is a naive UTC datetime - the same
# value datetime.utcnow() would give, without calling that method, which is
# deprecated as of Python 3.12.
# NOTE: each run replaces any existing event_timestamp values in the file.
df = pd.read_csv("data/athletes.csv")
df["event_timestamp"] = datetime.now(timezone.utc).replace(tzinfo=None)
df.to_csv("data/athletes.csv", index=False)

In [7]:
# Reload the dataset and report its column names and overall shape
import pandas as pd

df = pd.read_csv('/Users/veeraanand/athlete_project/feature_repo/data/athletes.csv')

column_names = list(df.columns)
print("Available columns:")
print(column_names)
print(f"\nDataset shape: {df.shape}")

Available columns:
['athlete_id', 'name', 'region', 'team', 'affiliate', 'gender', 'age', 'height', 'weight', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400', 'run5k', 'candj', 'snatch', 'deadlift', 'backsq', 'pullups', 'eat', 'train', 'background', 'experience', 'schedule', 'howlong', 'event_timestamp']

Dataset shape: (423006, 28)


In [8]:
# Feature Version 2: performance-based columns plus the label
import pandas as pd
# datetime is not used in this cell; the cell that prepares feature_v2 for Feast
# calls datetime.now() without importing it, so the import is kept here.
from datetime import datetime

# Reload the full athlete table
df = pd.read_csv('/Users/veeraanand/athlete_project/feature_repo/data/athletes.csv')

# Rebuild the label: 1 only when all three thresholds are met. A missing value
# in any of the three columns compares as False, so those rows are labelled 0.
hits_deadlift = df['deadlift'].ge(350)
hits_snatch = df['snatch'].ge(150)
hits_pullups = df['pullups'].ge(20)
df['is_high_performer'] = (hits_deadlift & hits_snatch & hits_pullups).astype(int)

print("Target distribution:")
print(df['is_high_performer'].value_counts())

# Version 2 columns.
# NOTE(review): 'snatch' is both a feature here and an input to the label.
v2_columns = ['fran', 'helen', 'grace', 'run400', 'run5k', 'snatch', 'is_high_performer']

# Copy the selection, then keep only rows where every selected column is present
feature_v2 = df[v2_columns].copy().dropna()

print(f"\nFeature V2 shape after cleaning: {feature_v2.shape}")
print(f"Target distribution in V2:\n{feature_v2['is_high_performer'].value_counts()}")

# First rows of the cleaned frame
feature_v2.head()

Target distribution:
is_high_performer
0    403698
1     19308
Name: count, dtype: int64

Feature V2 shape after cleaning: (5821, 7)
Target distribution in V2:
is_high_performer
1    3239
0    2582
Name: count, dtype: int64


Unnamed: 0,fran,helen,grace,run400,run5k,snatch,is_high_performer
12,119.0,417.0,103.0,61.0,1211.0,225.0,1
83,147.0,425.0,115.0,59.0,1187.0,235.0,1
101,175.0,615.0,171.0,85.0,1470.0,185.0,1
102,126.0,449.0,86.0,58.0,1302.0,240.0,1
110,244.0,486.0,294.0,63.0,1268.0,175.0,1


In [9]:
# Feast-ready version of Feature V2, built on a copy so feature_v2 is untouched
feature_v2_feast = feature_v2.copy()

# Entity key and event timestamp columns.
# NOTE(review): the ids are a fresh 1..N sequence over the surviving rows, not
# the dataset's own 'athlete_id' column; the V1 cells number their rows the same
# way, so equal ids across the two feature files need not be the same athlete.
# NOTE(review): datetime.now() is naive local time, whereas the cells that stamp
# athletes.csv use datetime.utcnow() - confirm which convention Feast should see.
n_rows = len(feature_v2_feast)
feature_v2_feast['athlete_id'] = range(1, n_rows + 1)
feature_v2_feast['event_timestamp'] = datetime.now()

# Key and timestamp first; remaining columns keep their order
key_columns = ['athlete_id', 'event_timestamp']
cols = key_columns + [name for name in feature_v2_feast.columns if name not in key_columns]
feature_v2_feast = feature_v2_feast[cols]

# Write the CSV without the index
feature_v2_feast.to_csv("data/feature_v2.csv", index=False)

print("Feature V2 saved successfully!")
print(f"Shape: {feature_v2_feast.shape}")
feature_v2_feast.head()

Feature V2 saved successfully!
Shape: (5821, 9)


Unnamed: 0,athlete_id,event_timestamp,fran,helen,grace,run400,run5k,snatch,is_high_performer
12,1,2025-07-17 11:41:33.406578,119.0,417.0,103.0,61.0,1211.0,225.0,1
83,2,2025-07-17 11:41:33.406578,147.0,425.0,115.0,59.0,1187.0,235.0,1
101,3,2025-07-17 11:41:33.406578,175.0,615.0,171.0,85.0,1470.0,185.0,1
102,4,2025-07-17 11:41:33.406578,126.0,449.0,86.0,58.0,1302.0,240.0,1
110,5,2025-07-17 11:41:33.406578,244.0,486.0,294.0,63.0,1268.0,175.0,1


In [10]:
# Rebuild the Feature V1 dataset in the layout the ML pipeline expects
import pandas as pd
from datetime import datetime

# Reload the original athlete table (path relative to the working directory)
df = pd.read_csv("data/athletes.csv")

# Rebuild the label: 1 only when all three thresholds are met. A missing value
# in any of the three columns compares as False, so those rows are labelled 0.
hits_deadlift = df['deadlift'].ge(350)
hits_snatch = df['snatch'].ge(150)
hits_pullups = df['pullups'].ge(20)
df['is_high_performer'] = (hits_deadlift & hits_snatch & hits_pullups).astype(int)

# Version 1 columns.
# NOTE(review): 'deadlift' and 'pullups' are also inputs to the label.
v1_columns = ['age', 'height', 'weight', 'deadlift', 'backsq', 'pullups', 'is_high_performer']

# Copy the selection, then keep only rows where every selected column is present
feature_v1 = df[v1_columns].copy().dropna()

# Entity key and event timestamp columns.
# NOTE(review): the ids are a fresh 1..N sequence over the surviving rows, not
# the dataset's own 'athlete_id' column - confirm that is what consumers expect.
# NOTE(review): datetime.now() is naive local time, whereas the cells that stamp
# athletes.csv use datetime.utcnow() - confirm which convention Feast should see.
n_rows = len(feature_v1)
feature_v1['athlete_id'] = range(1, n_rows + 1)
feature_v1['event_timestamp'] = datetime.now()

# Key and timestamp first; remaining columns keep their order
key_columns = ['athlete_id', 'event_timestamp']
cols = key_columns + [name for name in feature_v1.columns if name not in key_columns]
feature_v1 = feature_v1[cols]

# Write the CSV without the index
feature_v1.to_csv("data/feature_v1.csv", index=False)

print(f"Feature V1 saved! Shape: {feature_v1.shape}")
feature_v1.head()

Feature V1 saved! Shape: (42307, 9)


Unnamed: 0,athlete_id,event_timestamp,age,height,weight,deadlift,backsq,pullups,is_high_performer
6,1,2025-07-17 12:00:50.351157,21.0,72.0,175.0,0.0,0.0,0.0,0
8,2,2025-07-17 12:00:50.351157,30.0,72.0,175.0,0.0,0.0,0.0,0
12,3,2025-07-17 12:00:50.351157,31.0,65.0,150.0,465.0,405.0,81.0,1
13,4,2025-07-17 12:00:50.351157,43.0,71.0,185.0,0.0,0.0,0.0,0
15,5,2025-07-17 12:00:50.351157,23.0,72.0,165.0,419.0,0.0,55.0,1
