In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from delta import configure_spark_with_delta_pip

# Session builder for this notebook: registers the Delta Lake SQL extension
# and makes the Delta catalog the default `spark_catalog`.
builder = (
    SparkSession.builder
    .appName("Data_visualization")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta-spark pip helper finish configuring the builder, then start
# the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()
    

# Optimize

# The directory holds a Delta table (it is loaded with format("delta") in the
# cells below), so it has to be read through the Delta reader; the CSV reader
# would try to parse the table's data files and transaction log as text.
data_to_visualize = spark.read.format("delta").load("../data/cleaned_lifts_future_delta/")

# Pull a bounded number of rows to the driver as a pandas DataFrame.
sample = data_to_visualize.limit(10000).toPandas()

from delta.tables import DeltaTable

# OPTIMIZE is exposed on a DeltaTable handle bound to the table path, not as a
# static call that takes a DataFrame: forPath(...) -> optimize() -> executeCompaction().
DeltaTable.forPath(spark, "../data/cleaned_lifts_future_delta/").optimize().executeCompaction()

In [None]:
from delta.tables import DeltaTable

# Load the current contents of the Delta table.
df_loaded = (
    spark.read
    .format("delta")
    .load("../data/cleaned_lifts_future_delta/")
)

# Write the same rows back to the same path as a single partition, replacing
# the existing table contents (and schema) with this snapshot.
(
    df_loaded
    .repartition(1)
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save("../data/cleaned_lifts_future_delta/")
)


# Test: list the files in the Delta table directory

In [None]:
import os
# Show what is physically on disk in the Delta table directory.
delta_dir_entries = os.listdir("../data/cleaned_lifts_future_delta/")
print(delta_dir_entries)


# Load sample for plotting

In [None]:
# Spark DataFrame over the Delta table.
df = (
    spark.read
    .format("delta")
    .load("../data/cleaned_lifts_future_delta/")
)

# Plotting works on pandas, so bring at most 10,000 rows to the driver.
sample_df = df.limit(10000).toPandas()

# Basic plotting with Matplotlib or Seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with a KDE overlay) of the best deadlift in the plotting sample.
# The title and x-axis label used to say "Bench" although the plotted column is
# Best3DeadliftKg; they now describe the data that is actually drawn.
# NOTE(review): if a bench-press plot was intended, change the column instead.
plt.figure(figsize=(10, 6))
sns.histplot(sample_df["Best3DeadliftKg"].dropna(), bins=10, kde=True)
plt.title("Distribution of Best Deadlift (kg)")
plt.xlabel("Best Deadlift (kg)")
plt.ylabel("Count")
plt.show()


# Trends over time

In [None]:
# Add a "year" column to the sample, taken from the parsed "date" column.
# NOTE(review): assumes the cleaned data has a lowercase "date" column that
# pandas can parse - confirm against the table schema.
sample_df["year"] = sample_df["date"].pipe(pd.to_datetime).dt.year

# One box per year showing the spread of best squat results.
sns.boxplot(
    data=sample_df,
    x="year",
    y="Best3SquatKg",
)
plt.xticks(rotation=45)  # tilt the year labels
plt.title("Squat Performance Over Years")
plt.show()


# Group analysis

In [None]:
# Spread of the competition total for each value of the Sex column.
# The figure gets a title so it can be read on its own, and plt.show() keeps
# the bare Axes repr out of the cell output.
sns.boxplot(data=sample_df, x="Sex", y="TotalKg")
plt.title("TotalKg by Sex")
plt.show()

# DuckDB simple query

In [None]:
import duckdb

# Smoke test: a constant query proves DuckDB is importable and working.
smoke_test_sql = "SELECT 42, 'hello duckdb'"
duckdb.query(smoke_test_sql)

In [None]:
import pandas as pd

# Small demo frame. DuckDB finds it by its Python variable name (`df`) when
# that name appears in the SQL text, so the name must match the FROM clause.
df = pd.DataFrame({'x': range(5), 'y': list('abcde')})

filter_sql = "SELECT * FROM df WHERE x > 2"
duckdb.query(filter_sql).to_df()

In [None]:
# Open (or create) a DuckDB database file next to the notebook.
con = duckdb.connect("my_data.duckdb")

# The database file persists between runs, so a plain CREATE TABLE fails the
# second time this cell is executed because `people` already exists.
# CREATE OR REPLACE makes the cell safe to re-run (Restart & Run All).
con.execute("CREATE OR REPLACE TABLE people AS SELECT * FROM df")

# Read the table back as a pandas DataFrame to confirm the round trip.
con.execute("SELECT * FROM people").fetchdf()

In [None]:
# Mean TotalKg per Sex, computed by DuckDB directly on the pandas sample.
avg_total_sql = "SELECT Sex, AVG(TotalKg) FROM sample_df GROUP BY Sex"
duckdb.query(avg_total_sql).df()


In [None]:
import duckdb

# Percentile levels 0.0, 0.1, ..., 1.0.
percentiles = [step / 10 for step in range(11)]

# One query per level: the continuous quantile of TotalKg within each weight
# class, tagged with the level it was computed for.
queries = [
    f"""
        SELECT 
            WeightClassKg, 
            {p} AS percentile,
            quantile_cont(TotalKg, {p}) AS value
        FROM sample_df
        GROUP BY WeightClassKg
        ORDER BY WeightClassKg
    """
    for p in percentiles
]

# Wrap every query in parentheses and stack them into a single statement.
parenthesised = ["(" + q + ")" for q in queries]
query = " UNION ALL ".join(parenthesised)
result = duckdb.query(query).df()

def parse_weight_class(wc):
    """Convert a weight-class label into a float usable as a sort key.

    Args:
        wc: Weight-class label, e.g. "82.5" or an open-ended class such as
            "140+". May also be None or another non-string value.

    Returns:
        The label as a float. The open-ended classes "140+", "90+" and "110+"
        map to the fixed placeholder values 165.0, 105.0 and 120.0. Returns
        None when the label cannot be converted: any other "N+" label,
        non-numeric text, or None.
    """
    # Placeholder sort values for the open-ended classes. All are floats so
    # the function always returns one numeric type (it used to return the
    # int 120 for "110+").
    open_ended = {"140+": 165.0, "90+": 105.0, "110+": 120.0}
    if isinstance(wc, str) and wc in open_ended:
        return open_ended[wc]
    try:
        return float(wc)
    except (TypeError, ValueError):
        # float(None) raises TypeError rather than ValueError; treat a missing
        # class like an unparseable one instead of crashing the .apply() call.
        return None


# Attach the numeric weight-class key, then order rows by class and, within a
# class, by percentile level.
result = (
    result
    .assign(WeightClassNumeric=result['WeightClassKg'].apply(parse_weight_class))
    .sort_values(['WeightClassNumeric', 'percentile'])
)

result

In [None]:
# `colors` was referenced but never defined in this notebook, so this cell
# raised NameError on a fresh kernel. Build the mapping here: one colour per
# weight class, cycling through the palette if there are more classes than
# palette entries.
weight_classes = result['WeightClassKg'].dropna().unique()
palette = plt.get_cmap('tab20')
colors = {wc: palette(i % palette.N) for i, wc in enumerate(weight_classes)}

plt.figure(figsize=(12, 6))
for wc, group in result.groupby('WeightClassKg'):
    # Sort by percentile so the line is drawn left to right and the label
    # below sits at the highest percentile, whatever order `result` is in.
    group = group.sort_values('percentile')
    plt.plot(group['percentile'], group['value'], color=colors[wc])
    # Label at the last point
    plt.text(
        group['percentile'].max() + 0.01,  # a bit right of the line
        group['value'].iloc[-1],
        wc,
        fontsize=8,
        color=colors[wc],
        verticalalignment='center'
    )

plt.xlabel('Percentile')
plt.ylabel('TotalKg')
plt.title('Total Lift by Weight Class and Percentile')
plt.tight_layout()
plt.show()

In [None]:
import streamlit as st
import duckdb
import pandas as pd

