- # Feature engineering
    - 

In [None]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt

# Inline plotting in Jupyter: figures render directly below the cell that draws them
%matplotlib inline

# Optional: better aesthetics (the style is set once here and applies to every later figure)
plt.style.use('ggplot')  # or 'seaborn-v0_8', 'fivethirtyeight', etc.


# Session builder for a local Spark application with the Delta Lake SQL
# extension and the Delta catalog registered.
builder = (
    SparkSession.builder
    .appName("LocalDeltaLake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created (or an already running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Columns that are not needed for the analysis: individual attempts,
# alternative scoring formulas, and meet location / name fields.
cols_to_drop = [
    "Squat1Kg", "Squat2Kg", "Squat3Kg", "Squat4Kg",
    "Bench1Kg", "Bench2Kg", "Bench3Kg", "Bench4Kg",
    "Deadlift1Kg", "Deadlift2Kg", "Deadlift3Kg", "Deadlift4Kg",
    "Wilks", "Glossbrenner", "Goodlift",
    "MeetCountry", "MeetState", "MeetTown", "MeetName",
]

# NOTE(review): silver_df is not created in the cells above; it must already
# exist in the kernel for this cell to run - confirm where it is loaded.
# Missing values in "Tested" are replaced with "No".
silver_features = (
    silver_df
    .drop(*cols_to_drop)
    .na.fill({"Tested": "No"})
)

In [None]:
# Preview the first 20 rows with long cell values truncated (the defaults of show(), spelled out)
silver_features.show(n=20, truncate=True)

In [None]:
# Which values does the Tested flag actually take?
tested_values = silver_features.select("Tested").distinct()
tested_values.show()

# Collect the full Spark frame into a pandas DataFrame
silver_features_pandas = silver_features.toPandas()

# Column descriptions

In [None]:
# Check which pandas dtype TotalKg ended up with after the Spark -> pandas conversion
print(silver_features_pandas.dtypes["TotalKg"])

# Univariate analysis

# Lifter analysis
- Correlation between bodyweight and total (same as Weight class candlesticks per weight class men vs women)
- Raw vs equipped, tested vs untested percentile comparison
- Squat bench, Squat deadlift, Bench deadlift ratios + averages and grouping per weight class men vs women


In [None]:
# Columns used by the lifter-level analyses in the following sections
lifter_analysis_cols = [
    "Name", "Sex", "Equipment", "AgeClass", "Division", "WeightClassKg",
    "Best3SquatKg", "Best3BenchKg", "Best3DeadliftKg",
    "TotalKg", "Dots", "Tested", "date",
]
lifter_analysis_df = silver_features.select(*lifter_analysis_cols)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Build the pandas frame in this cell so it runs on a fresh kernel: without
# this, bw_total_df is only created further down (section "EDA per bw") and a
# top-to-bottom run fails here with a NameError. Rows with a null in any of
# the selected columns are dropped.
bw_total_df = (
    lifter_analysis_df
    .select("Name", "Sex", "Equipment", "WeightClassKg", "TotalKg")
    .na.drop()
    .toPandas()
)

# Split data by gender
male_totalkg = bw_total_df.loc[bw_total_df["Sex"] == "M", "TotalKg"]
female_totalkg = bw_total_df.loc[bw_total_df["Sex"] == "F", "TotalKg"]

# Define common bins for both: compute ONE set of bin edges over the combined
# data. Passing the integer 50 to each hist() call would give every group its
# own edges, so overlaid bars would not cover the same TotalKg intervals.
bins = np.histogram_bin_edges(
    np.concatenate([male_totalkg.to_numpy(), female_totalkg.to_numpy()]),
    bins=50,
)

# Plot males in the background (alpha < 1)
plt.hist(male_totalkg, bins=bins, color='blue', alpha=0.5, label='Male')

# Plot females in the foreground (higher alpha)
plt.hist(female_totalkg, bins=bins, color='pink', alpha=0.8, label='Female')

# Add labels and title
plt.title('Distribution of TotalKg')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# TotalKg per sex with missing totals removed.
# bw_total_df must already be in the kernel namespace when this cell runs.
is_male = bw_total_df["Sex"] == "M"
is_female = bw_total_df["Sex"] == "F"
male_totalkg = bw_total_df.loc[is_male, "TotalKg"].dropna()
female_totalkg = bw_total_df.loc[is_female, "TotalKg"].dropna()

# Number of histogram bins used for each panel
bins = 50

# Percentiles to show: 0, 10, ..., 100
percentile_points = np.arange(0, 101, 10)

# Calculate percentiles and stats
def get_stats(data):
    """Return percentile values and summary statistics for ``data``.

    Args:
        data: object accepted by ``np.percentile`` that also provides a
            ``describe()`` method (the callers pass pandas Series).

    Returns:
        A 2-tuple ``(percentiles, desc)``: the values of ``data`` at each
        entry of the module-level ``percentile_points``, and the result of
        ``data.describe()``.
    """
    return np.percentile(data, percentile_points), data.describe()

male_percentiles, male_desc = get_stats(male_totalkg)
female_percentiles, female_desc = get_stats(female_totalkg)

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(18, 6), constrained_layout=True)

# One entry per panel, drawn left to right:
# (axes, totals, percentile values, describe() output, title, bar colour)
panels = [
    (axes[0], male_totalkg, male_percentiles, male_desc, "Male TotalKg Distribution", 'blue'),
    (axes[1], female_totalkg, female_percentiles, female_desc, "Female TotalKg Distribution", 'pink'),
]

for ax, totals, pct_values, desc, panel_title, bar_color in panels:
    # Histogram and axis labelling
    ax.hist(totals, bins=bins, color=bar_color, alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel("TotalKg")
    ax.set_ylabel("Frequency")

    # Add percentile lines and labels (labels sit just below the top of the axes)
    for p, value in zip(percentile_points, pct_values):
        ax.axvline(value, color='black', linestyle='--', linewidth=0.8)
        ax.text(value, ax.get_ylim()[1] * 0.95,
                f"{p}th\n{int(round(value))}",
                rotation=90, verticalalignment='top',
                horizontalalignment='center', fontsize=8, color='black')

    # Summary statistics, placed just right of the panel in axes coordinates
    stats_text = "\n".join([
        f"Count: {int(desc['count'])}",
        f"Mean: {desc['mean']:.0f}",
        f"Std: {desc['std']:.0f}",
        f"Min: {desc['min']:.0f}",
        f"25%: {desc['25%']:.0f}",
        f"50%: {desc['50%']:.0f}",
        f"75%: {desc['75%']:.0f}",
        f"Max: {desc['max']:.0f}",
    ])
    ax.text(1.02, 0.5, stats_text,
            transform=ax.transAxes,
            fontsize=10, va='center')

plt.show()


# EDA per bw

In [None]:
# Name, sex, equipment, weight class and total per entry, collected into
# pandas; rows with a null in any of these columns are dropped first.
bw_total_cols = ["Name", "Sex", "Equipment", "WeightClassKg", "TotalKg"]
bw_total_df = (
    lifter_analysis_df
    .select(*bw_total_cols)
    .na.drop()
    .toPandas()
)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Load and clean data
df = (
    lifter_analysis_df
    .select("Name", "Sex", "Equipment", "WeightClassKg", "TotalKg")
    .na.drop()
    .toPandas()
)

# Raw lifters only. .copy() yields an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df["Equipment"] == "Raw"].copy()

# Open-ended classes may be written with a trailing "+" (for example "140+").
# Handing such strings straight to pd.to_numeric(errors="coerce") turns them
# into NaN, and the dropna below would then silently discard those lifters,
# so the "+" is stripped before converting. Values that are already numeric
# (or plain numeric strings) convert exactly as before.
weight_class_text = df["WeightClassKg"].astype(str).str.strip()
df["WeightClassKg"] = pd.to_numeric(weight_class_text.str.rstrip("+"), errors="coerce")

# Anything that still failed to parse is removed here
df = df.dropna(subset=["WeightClassKg", "TotalKg"])

# Define custom binning function
def assign_weight_bin(row):
    """Return the 10 kg weight-bin label for one row, or None for other sexes.

    Args:
        row: mapping-like object with "Sex" and "WeightClassKg" entries
            (a DataFrame row when used with ``df.apply(..., axis=1)``).

    Returns:
        "140kg+" for men at or above 140, "100kg+" for women at or above 100,
        otherwise a decade label such as "80-89kg". Rows whose "Sex" is
        neither "M" nor "F" yield None.
    """
    # Per sex: (lower bound of the open-ended top bin, its label)
    open_bins = {"M": (140, "140kg+"), "F": (100, "100kg+")}

    wc = row["WeightClassKg"]
    top_bin = open_bins.get(row["Sex"])
    if top_bin is None:
        return None

    floor, top_label = top_bin
    if wc >= floor:
        return top_label

    # Round down to the start of the enclosing 10 kg band
    decade_start = (wc // 10) * 10
    return f"{decade_start:.0f}-{decade_start + 9:.0f}kg"

# Label every row with its weight bin; the apply is row-wise (axis=1) because
# assign_weight_bin reads both "Sex" and "WeightClassKg" from the row.
df["WeightBin"] = df.apply(assign_weight_bin, axis=1)

# Define plotting function
def plot_sex_percentiles(sex_label):
    """Plot TotalKg percentile curves, one line per weight bin, for one sex.

    Reads the module-level ``df`` (columns "Sex", "WeightBin", "TotalKg") and
    shows a figure with percentiles 0-100 in steps of 10 on the x axis.
    Bins with fewer than 10 rows are skipped.

    Args:
        sex_label: value of the "Sex" column to plot. "M" is titled "Men";
            any other value is titled "Women".
    """
    sex_df = df[df["Sex"] == sex_label]

    # Order bins by their numeric lower bound: "60-69kg" -> 60, "140kg+" -> 140
    bin_labels = sorted(
        sex_df["WeightBin"].unique(),
        key=lambda x: float(x.rstrip("+kg").split("-")[0]),
    )

    # plt.get_cmap instead of matplotlib.cm.get_cmap: the latter was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap("rainbow")
    n_bins = len(bin_labels)
    percentiles = np.arange(0, 101, 10)

    plt.figure(figsize=(12, 7))
    for i, b in enumerate(bin_labels):
        bin_df = sex_df[sex_df["WeightBin"] == b]
        if len(bin_df) < 10:
            continue
        pct_values = np.percentile(bin_df["TotalKg"], percentiles)
        # Spread the bins evenly over the colormap; max() avoids dividing by
        # zero when there is only a single bin.
        color = cmap(i / max(n_bins - 1, 1))
        plt.plot(percentiles, pct_values, label=b, color=color)
        plt.scatter(percentiles, pct_values, color=color, s=30)
        for x, y in zip(percentiles, pct_values):
            plt.text(x, y, f"{int(y)}", fontsize=8, ha="center", va="bottom", color=color)

    title = f"TotalKg Percentiles by Weight Bin ({'Men' if sex_label == 'M' else 'Women'}, Raw)"
    plt.title(title)
    plt.xlabel("Percentile")
    plt.ylabel("TotalKg")
    plt.grid(True)
    plt.legend(title="Weight Bin", fontsize=8, loc="upper left", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()

# Generate charts: men first, then women
for sex_code in ("M", "F"):
    plot_sex_percentiles(sex_code)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# (Re)build bw_total_df from the Spark frame; rows with a null in any of the
# selected columns are dropped before collecting to pandas.
bw_total_df = (
    lifter_analysis_df
    .select("Name", "Sex", "Equipment", "WeightClassKg", "TotalKg")
    .na.drop()
    .toPandas()
)

# Keep only Raw equipment. .copy() yields an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
bw_total_df = bw_total_df[bw_total_df["Equipment"] == "Raw"].copy()

# Convert WeightClassKg to numeric, including strings like "83+" or "120+".
# pd.to_numeric(errors='coerce') alone turns those strings into NaN and the
# dropna below would silently discard every open-class lifter. Instead the
# "+" is stripped and the value nudged just above its bound, so that "120+"
# compares as greater than 120 and lands in the open-ended class when binned
# with "<= limit" comparisons. Plain numeric values are left unchanged.
OPEN_CLASS_NUDGE_KG = 0.01
weight_class_text = bw_total_df["WeightClassKg"].astype(str).str.strip()
is_open_class = weight_class_text.str.endswith("+")
bw_total_df["WeightClassKg"] = (
    pd.to_numeric(weight_class_text.str.rstrip("+"), errors='coerce')
    + is_open_class * OPEN_CLASS_NUDGE_KG
)

# Drop any rows with NaNs after conversion
bw_total_df = bw_total_df.dropna(subset=["WeightClassKg", "TotalKg"])

# IPF weight class binning
# Per sex: labels of the closed IPF classes (lightest first) and the label of
# the open-ended class above them. Each closed label is also its upper bound.
_IPF_CLASSES = {
    "M": (("59", "66", "74", "83", "93", "105", "120"), "120+"),
    "F": (("47", "52", "57", "63", "69", "76", "84"), "84+"),
}


def ipf_bin(row):
    """Return the IPF weight-class label for one row, or None for other sexes.

    Args:
        row: mapping-like object with "Sex" and "WeightClassKg" entries
            (a DataFrame row when used with ``apply(..., axis=1)``).

    Returns:
        The label of the lightest class whose upper bound is at least the
        row's weight class, the open-ended label ("120+" / "84+") when the
        value exceeds every bound, or None when "Sex" is neither "M" nor "F".
    """
    sex = row["Sex"]
    wc = row["WeightClassKg"]

    classes = _IPF_CLASSES.get(sex)
    if classes is None:
        return None

    closed_labels, open_label = classes
    for label in closed_labels:
        if wc <= int(label):
            return label
    return open_label

# Label every row with its IPF class (row-wise, because ipf_bin reads both
# "Sex" and "WeightClassKg")
bw_total_df = bw_total_df.assign(
    IPF_WeightClass=bw_total_df.apply(ipf_bin, axis=1)
)

# Percentiles to compute: 0, 10, ..., 100
percentiles = np.arange(0, 101, 10)

def plot_percentile_lines(ax, data, color):
    """Draw a dashed horizontal line, with a label, at each percentile of ``data``.

    Args:
        ax: Matplotlib Axes to draw on.
        data: values passed to ``np.percentile``.
        color: colour used for both the lines and their labels.

    The percentile levels come from the module-level ``percentiles``. Labels
    are positioned with the y-axis transform, so x=0.5 is the horizontal
    middle of the axes while y is in data units.
    """
    cutoffs = np.percentile(data, percentiles)
    for level, cutoff in zip(percentiles, cutoffs):
        ax.axhline(cutoff, linestyle="--", color=color, alpha=0.4)
        label = f"{level}th: {int(round(cutoff))}"
        ax.text(
            0.5, cutoff, label,
            color=color, fontsize=8, alpha=0.8,
            va='bottom', ha='left',
            transform=ax.get_yaxis_transform(),
        )

# Plot for each sex
for sex in ["M", "F"]:
    subset = bw_total_df[bw_total_df["Sex"] == sex]
    if subset.empty:
        # Nothing to plot; percentiles of an empty series are undefined and
        # the label formatting below would fail on them.
        continue

    fig, ax = plt.subplots(figsize=(10, 6))

    # Sort classes numerically. The second key element breaks the tie between
    # a closed class and its open-ended sibling (e.g. "120" vs "120+"), which
    # would otherwise share the key 120.0 and keep whatever order they happen
    # to appear in the data.
    classes = sorted(
        subset["IPF_WeightClass"].unique(),
        key=lambda x: (float(x.replace('+', '')), x.endswith('+')),
    )

    # rainbow color palette
    palette = sns.color_palette("hsv", len(classes))

    for i, wc in enumerate(classes):
        class_data = subset[subset["IPF_WeightClass"] == wc]["TotalKg"]
        # Skip classes with fewer than two lifters
        if len(class_data) < 2:
            continue
        percs = np.percentile(class_data, percentiles)
        ax.plot(percentiles, percs, label=wc, color=palette[i])
        for p, val in zip(percentiles, percs):
            ax.scatter(p, val, color=palette[i], s=20)
            ax.text(p, val, f"{int(val)}", fontsize=7, alpha=0.8, color=palette[i], ha='center', va='bottom')

    ax.set_title(f"Percentile Trends by IPF Weight Class - {'Men' if sex == 'M' else 'Women'} (Raw Only)")
    ax.set_xlabel("Percentile")
    ax.set_ylabel("TotalKg")
    ax.legend(title="Weight Class", bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    # Draw horizontal percentile lines across all classes
    plot_percentile_lines(ax, subset["TotalKg"], "black")

    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Filter to Raw Men only: both conditions must hold for a row to be kept
is_male = bw_total_df["Sex"] == "M"
is_raw = bw_total_df["Equipment"] == "Raw"
male_df = bw_total_df[is_male & is_raw]

# Apply IPF binning again (if not already applied)
# Upper bounds (kg) of the closed IPF classes, lightest first, per sex
_IPF_CLOSED_CLASS_LIMITS = {
    "M": (59, 66, 74, 83, 93, 105, 120),
    "F": (47, 52, 57, 63, 69, 76, 84),
}


def ipf_bin(row):
    """Return the IPF weight-class label for one lifter row.

    This redefinition replaces any earlier ``ipf_bin`` in the kernel, so it
    must not mis-label women if it is later applied to a mixed-sex frame:
    rows whose "Sex" is "F" use the women's classes. Every other row,
    including rows without a "Sex" entry, uses the men's classes, which is
    what this cell relied on before.

    Args:
        row: mapping-like object with a "WeightClassKg" entry and an optional
            "Sex" entry (a DataFrame row or a plain dict).

    Returns:
        The label of the lightest class whose upper bound is at least the
        row's weight class, or the open-ended label ("120+" for men, "84+"
        for women) when the value exceeds every bound.
    """
    sex_key = "F" if row.get("Sex", "M") == "F" else "M"
    limits = _IPF_CLOSED_CLASS_LIMITS[sex_key]

    wc = row["WeightClassKg"]
    for limit in limits:
        if wc <= limit:
            return str(limit)
    return f"{limits[-1]}+"

# Label each row with its IPF class. assign() returns a new frame, so nothing
# is written into a filtered selection of bw_total_df (assigning a column on
# such a selection triggers pandas' SettingWithCopyWarning).
male_df = male_df.assign(IPF_WeightClass=male_df.apply(ipf_bin, axis=1))

# Class under study: one constant drives both the filter and the plot title,
# so the two cannot drift apart.
TARGET_CLASS = "105"
target_class_df = male_df[male_df["IPF_WeightClass"] == TARGET_CLASS]

# Ryan_Cotter's total
Ryan_Cotter = 712

# Compute percentile: share of lifters in the class with a strictly lower total
percentile = np.mean(target_class_df["TotalKg"] < Ryan_Cotter) * 100

# Plot histogram
plt.figure(figsize=(10, 6))
sns.histplot(target_class_df["TotalKg"], bins=30, color="skyblue", kde=True)

# Add vertical line for Ryan_Cotter, annotated near the top of the plot
plt.axvline(Ryan_Cotter, color="red", linestyle="--", linewidth=2)
plt.text(Ryan_Cotter, plt.ylim()[1]*0.9,
         f"Ryan Cotter\n{int(Ryan_Cotter)}kg\n{percentile:.1f}th percentile",
         color="red", ha="left", va="top", fontsize=10, backgroundcolor="white")

# Labels
plt.title(f"TotalKg Distribution in IPF {TARGET_CLASS}kg (Men, Raw)")
plt.xlabel("TotalKg")
plt.ylabel("Lifter Count")

plt.tight_layout()
plt.show()


# Bivariate analysis
- Somehow all lifters show up as tested

In [None]:
# Testing status and equipment per entry, collected into pandas.
# na.drop() without arguments removes a row as soon as any of the selected
# columns is null.
testing_equipment_cols = ["Name", "Sex", "WeightClassKg", "TotalKg", "Tested", "Equipment"]
testing_equipment_df = (
    silver_features
    .select(*testing_equipment_cols)
    .na.drop()
    .toPandas()
)

In [None]:
# Weight class against total, one semi-transparent point per row of df
fig, ax = plt.subplots()
ax.scatter(df['WeightClassKg'], df['TotalKg'], alpha=0.5)
ax.set_title('WeightClassKg vs TotalKg')
ax.set_xlabel('WeightClassKg')
ax.set_ylabel('TotalKg')
plt.show()

In [None]:
# Best squat, bench and deadlift per entry; rows missing any of the selected
# columns are dropped before collecting to pandas.
lift_ratio_cols = ["Name", "Sex", "WeightClassKg", "Best3SquatKg", "Best3BenchKg", "Best3DeadliftKg"]
lift_ratios = (
    lifter_analysis_df
    .select(*lift_ratio_cols)
    .na.drop()
    .toPandas()
)

In [None]:
# Age class alongside sex, equipment, weight class, total and testing status;
# rows missing any of the selected columns are dropped before collecting.
age_analysis_cols = ["Name", "Sex", "Equipment", "AgeClass", "WeightClassKg", "TotalKg", "Tested"]
age_analysis_raw = (
    lifter_analysis_df
    .select(*age_analysis_cols)
    .na.drop()
    .toPandas()
)


# Time series df
- progression of records per weightclass over time per equipment and testing status

In [None]:
# Columns for tracking totals over time; unlike the other extracts in this
# notebook, rows with nulls are kept here (no na.drop()).
records_progression_cols = ["Name", "Sex", "Equipment", "Division", "WeightClassKg", "TotalKg", "Tested", "date"]
records_progression = (
    lifter_analysis_df
    .select(*records_progression_cols)
    .toPandas()
)