In [None]:
import pandas as pd
import numpy as np

# Source data: an activities CSV hosted on GitHub (raw file view).
URL = 'https://raw.githubusercontent.com/Patrick0481/Individual-project/refs/heads/main/activities.csv'

# Download and parse the CSV into the frame the rest of the notebook reads.
Strava_df = pd.read_csv(filepath_or_buffer=URL)

# Preview the first five rows.
Strava_df.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# 1. Prepare the Data
# Candidate activities: 'Distance' strictly above 3.0 and a positive
# 'Average Heart Rate'. The chain below returns a new frame, so Strava_df
# itself is never modified.
is_long_enough = Strava_df['Distance'] > 3.0
has_heart_rate = Strava_df['Average Heart Rate'] > 0

# Speed = Distance / (Moving Time / 60).
# NOTE(review): this is km/h only if 'Distance' is in km and 'Moving Time'
# is in minutes -- TODO confirm the units against the CSV.
df_exp = (
    Strava_df
    .loc[is_long_enough & has_heart_rate]
    .assign(Speed_kmh=lambda acts: acts['Distance'] / (acts['Moving Time'] / 60))
)

# 2. Apply the "Experimental Logic" to find the Best Data
# Split the candidates into three speed bands (bounds exactly as coded):
#   df_noise : speed >= 22 (glitches) or speed <= 6 (walking)
#   df_slow  : 6 < speed < 10 (low effort / recovery)
#   df_model : 10 <= speed < 22 (the "Model Data")
# Rows whose speed is NaN satisfy none of the comparisons and are left out
# of all three frames.
speed = df_exp['Speed_kmh']
is_glitch = speed >= 22
is_walking = speed <= 6
is_below_model_pace = speed < 10

df_noise = df_exp[is_glitch | is_walking]
df_slow  = df_exp[is_below_model_pace & (speed > 6)]
df_model = df_exp[(speed >= 10) & (speed < 22)]

# 3. Plot the "Experimentation Result"
# Speed against average heart rate, one scatter layer per speed band.
fig, ax = plt.subplots(figsize=(12, 7))

# Layers are drawn in list order: rejected noise first, then the rejected
# slow runs, and finally the selected runs on top.
scatter_layers = [
    # Rejected data (noise)
    (df_noise, dict(color='red', alpha=0.3, marker='x',
                    label='Discarded (Glitches/Walking)')),
    # Slow data (low effort - not useful for race prediction)
    (df_slow, dict(color='grey', alpha=0.3,
                   label='Discarded (Low Effort/Recovery)')),
    # Selected data (the "gold" used for the model)
    (df_model, dict(color='green', alpha=0.6,
                    label='Selected for Training (High Quality)')),
]
for subset, style in scatter_layers:
    ax.scatter(subset['Average Heart Rate'], subset['Speed_kmh'], **style)

ax.set_title('Exploration: Filtering Signal from Noise', fontsize=14)
ax.set_xlabel('Heart Rate (bpm)', fontsize=12)
ax.set_ylabel('Speed (km/h)', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)

plt.show()