In [None]:
import pandas as pd
import numpy as np

# Remote location of the activities CSV (raw file served from GitHub).
URL = 'https://raw.githubusercontent.com/Patrick0481/Individual-project/refs/heads/main/activities.csv'

# Fetch the CSV over HTTP and parse it into a DataFrame.
# Later cells read from `Strava_df` and never modify it in place.
Strava_df = pd.read_csv(filepath_or_buffer=URL)

# Show the first five rows as a quick sanity check of columns and dtypes.
Strava_df.head(n=5)

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Prepare Data
# Keep activities with Distance > 3.0 and a recorded (positive) average heart rate.
df_viz = Strava_df[
    (Strava_df['Distance'] > 3.0) &
    (Strava_df['Average Heart Rate'] > 0)
].copy()

# 'Activity Date' comes straight from read_csv, so it is text unless parsed.
# Convert it to real datetimes; rows whose date cannot be parsed are dropped
# because they cannot be placed on a time axis.
df_viz['Activity Date'] = pd.to_datetime(df_viz['Activity Date'], errors='coerce')
df_viz = df_viz.dropna(subset=['Activity Date'])

# Calculate Efficiency (speed per heart beat) -- used here as the "Fitness Score".
# NOTE(review): dividing 'Moving Time' by 60 only yields km/h if Distance is in km
# and Moving Time is in minutes. If Moving Time is in seconds the divisor should
# be 3600 -- TODO confirm against the CSV.
df_viz['Speed_kmh'] = df_viz['Distance'] / (df_viz['Moving Time'] / 60)
df_viz['Efficiency'] = df_viz['Speed_kmh'] / df_viz['Average Heart Rate']

# Filter outliers: keep speeds strictly between 5 and 22.
# .copy() so the column added below does not write into a view.
df_viz = df_viz[(df_viz['Speed_kmh'] < 22) & (df_viz['Speed_kmh'] > 5)].copy()

# Numeric x-values in Matplotlib's own date units. Using the SAME numeric column
# for the scatter and the regression keeps both layers on one x scale, which is
# also the scale DateFormatter expects. (datetime.toordinal values use a
# different origin and would not line up with a Matplotlib date axis.)
df_viz['Date_Num'] = mdates.date2num(df_viz['Activity Date'].to_numpy())

# 2. Plot Date vs. Efficiency to show how efficiency evolves over time.
fig, ax = plt.subplots(figsize=(12, 6))

# Scatter plot of individual runs
sns.scatterplot(
    data=df_viz,
    x='Date_Num',
    y='Efficiency',
    alpha=0.3,
    color='grey',
    label='Individual Runs',
    ax=ax,
)

# Trend line: seaborn's regression fit of Efficiency against date.
# An upward slope means efficiency is improving over time.
sns.regplot(
    data=df_viz,
    x='Date_Num',
    y='Efficiency',
    scatter=False,
    color='red',
    label='Fitness Trend (AI Model)',
    ax=ax,
)

ax.set_title('Innovation: Visualizing Fitness Evolution Over Time', fontsize=14)
ax.set_ylabel('Running Efficiency (Speed / HR)', fontsize=12)
ax.set_xlabel('Date', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)

# Render the numeric date axis as year-month tick labels.
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.show()

In [None]:
import matplotlib.pyplot as plt

# 1. Candidate activities
# Only rows with Distance above 3.0 and a positive average heart rate are considered.
long_enough = Strava_df['Distance'] > 3.0
has_heart_rate = Strava_df['Average Heart Rate'] > 0
df_exp = Strava_df[long_enough & has_heart_rate].copy()

# Speed = Distance / (Moving Time / 60).
# NOTE(review): this is km/h only if Distance is in km and Moving Time is in
# minutes -- TODO confirm the units in the CSV.
df_exp['Speed_kmh'] = df_exp['Distance'].div(df_exp['Moving Time'].div(60))

# 2. Split the candidates into three speed bands
#    noise : speed >= 22 (glitches) or speed <= 6 (walking)
#    slow  : 6 < speed < 10            (low effort / recovery)
#    model : 10 <= speed < 22          (kept as the "Model Data")
exp_speed = df_exp['Speed_kmh']
df_noise = df_exp[(exp_speed >= 22) | (exp_speed <= 6)]
df_slow = df_exp[(exp_speed > 6) & (exp_speed < 10)]
df_model = df_exp[(exp_speed >= 10) & (exp_speed < 22)]

# 3. Plot heart rate against speed, one scatter layer per band.
# Layers are drawn in this order: rejected noise, rejected slow runs, selected runs.
scatter_layers = [
    (df_noise, dict(color='red', alpha=0.3, marker='x',
                    label='Discarded (Glitches/Walking)')),
    (df_slow, dict(color='grey', alpha=0.3,
                   label='Discarded (Low Effort/Recovery)')),
    (df_model, dict(color='green', alpha=0.6,
                    label='Selected for Training (High Quality)')),
]

plt.figure(figsize=(12, 7))
for band_df, band_style in scatter_layers:
    plt.scatter(band_df['Average Heart Rate'], band_df['Speed_kmh'], **band_style)

plt.title('Exploration: Filtering Signal from Noise', fontsize=14)
plt.xlabel('Heart Rate (bpm)', fontsize=12)
plt.ylabel('Speed (km/h)', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)

plt.show()