In [1]:
# Data Wrangling
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Data Transformation
from sklearn.impute import SimpleImputer

In [2]:
# URL of the CSV file

# Read the CSV file directly from the URL
df = pd.read_csv('C:/Users/himan/OneDrive/Documents/RedbackOperations/redback-fit-sports-performance/Cycling Analysis/data/activities_cleaned.csv')

# Display the first 5 rows
df.head()

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Commute,Activity Gear,...,Perceived Relative Effort,Commute.1,From Upload,Grade Adjusted Distance,Bike,Gear,Flagged,Average Elapsed Speed,Dirt Distance,Total Steps
0,2929442069,"13 Dec 2019, 01:46:07",Lunch Ride,Ride,3859,22.97,139.0,11.0,False,,...,,0.0,1.0,,,,,,,
1,2945780637,"20 Dec 2019, 23:05:01",Morning Ride,Ride,4852,29.65,133.0,12.0,False,,...,,0.0,1.0,,,,,,,
2,2948028275,"21 Dec 2019, 23:25:29",Morning Ride,Ride,5817,32.38,139.0,19.0,False,,...,,0.0,1.0,,,,,,,
3,2952462113,"24 Dec 2019, 01:19:17",Lunch Ride,Ride,3851,21.68,140.0,11.0,False,,...,,0.0,1.0,,,,,,,
4,2956494096,"26 Dec 2019, 00:09:08",Lunch Ride,Ride,5843,32.36,131.0,14.0,False,,...,,0.0,1.0,,,,,,,


In [3]:
# Isolate the running data component
df_ride = df[df['Activity Type'] == 'Ride']

df_ride.head()

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Commute,Activity Gear,...,Perceived Relative Effort,Commute.1,From Upload,Grade Adjusted Distance,Bike,Gear,Flagged,Average Elapsed Speed,Dirt Distance,Total Steps
0,2929442069,"13 Dec 2019, 01:46:07",Lunch Ride,Ride,3859,22.97,139.0,11.0,False,,...,,0.0,1.0,,,,,,,
1,2945780637,"20 Dec 2019, 23:05:01",Morning Ride,Ride,4852,29.65,133.0,12.0,False,,...,,0.0,1.0,,,,,,,
2,2948028275,"21 Dec 2019, 23:25:29",Morning Ride,Ride,5817,32.38,139.0,19.0,False,,...,,0.0,1.0,,,,,,,
3,2952462113,"24 Dec 2019, 01:19:17",Lunch Ride,Ride,3851,21.68,140.0,11.0,False,,...,,0.0,1.0,,,,,,,
4,2956494096,"26 Dec 2019, 00:09:08",Lunch Ride,Ride,5843,32.36,131.0,14.0,False,,...,,0.0,1.0,,,,,,,


In [4]:
df_ride.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 0 to 343
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Activity ID                181 non-null    int64  
 1   Activity Date              181 non-null    object 
 2   Activity Name              181 non-null    object 
 3   Activity Type              181 non-null    object 
 4   Elapsed Time               181 non-null    int64  
 5   Distance                   181 non-null    float64
 6   Max Heart Rate             181 non-null    float64
 7   Relative Effort            181 non-null    float64
 8   Commute                    181 non-null    bool   
 9   Activity Gear              162 non-null    object 
 10  Filename                   181 non-null    object 
 11  Athlete Weight             70 non-null     float64
 12  Bike Weight                162 non-null    float64
 13  Elapsed Time.1             181 non-null    float64
 14 

In [5]:
# Sum the total rows of missing values from each attribute
missing_values = df_ride.isnull().sum()

# Return only attrbutes with missing values
missing_values[missing_values > 0]

Activity Gear                 19
Athlete Weight               111
Bike Weight                   19
Elevation Gain                 1
Elevation Loss                 6
Elevation Low                  6
Elevation High                 6
Max Heart Rate.1             111
Average Temperature            6
Total Work                     1
Perceived Exertion           180
Prefer Perceived Exertion    179
Perceived Relative Effort    180
Grade Adjusted Distance      181
Bike                          19
Gear                         181
Flagged                       89
Average Elapsed Speed         89
Dirt Distance                 89
Total Steps                  181
dtype: int64

In [6]:
# Remove attributes where 50% or more data is missing
null_threshold = len(df_ride) * 0.5
df_ride_clean = df_ride.dropna(thresh=null_threshold, axis=1)

In [7]:
from sklearn.impute import SimpleImputer

# Numeric variables with missing values
numeric_cols = ['Elevation Gain', 'Elevation Loss', 'Elevation Low', 'Elevation High', 
                'Average Temperature', 'Total Work','Bike Weight' ,'Flagged' ,
                'Average Elapsed Speed', 'Dirt Distance' ]

# Categorical variables with missing values
categorical_cols = ['Activity Gear', 'Bike' ]

# Impute missing values for numeric variables with median
numeric_imputer = SimpleImputer(strategy='median')
df_ride_clean[numeric_cols] = numeric_imputer.fit_transform(df_ride_clean[numeric_cols])

# Impute missing values for categorical variables with mode
categorical_imputer = SimpleImputer(strategy='most_frequent')
df_ride_clean[categorical_cols] = categorical_imputer.fit_transform(df_ride_clean[categorical_cols])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ride_clean[numeric_cols] = numeric_imputer.fit_transform(df_ride_clean[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ride_clean[categorical_cols] = categorical_imputer.fit_transform(df_ride_clean[categorical_cols])


In [8]:
df_ride_clean['Activity Date'] = pd.to_datetime(df_ride_clean['Activity Date'], format='%d %b %Y, %H:%M:%S')

# Preparing data for time series analysis
df_ride_clean['Month'] = df_ride_clean['Activity Date'].dt.month
df_ride_clean['Weekday'] = df_ride_clean['Activity Date'].dt.weekday
df_ride_clean['Year'] = df_ride_clean['Activity Date'].dt.year


# Average metrics by month
avg_metrics_month = df_ride_clean.groupby('Month')[['Distance', 'Average Speed', 'Calories']].mean()

# Average metrics by weekday
avg_metrics_weekday = df_ride_clean.groupby('Weekday')[['Distance', 'Average Speed', 'Calories']].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ride_clean['Activity Date'] = pd.to_datetime(df_ride_clean['Activity Date'], format='%d %b %Y, %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ride_clean['Month'] = df_ride_clean['Activity Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ride_clean['Weekday'] = df_r

In [9]:
df_ride_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 0 to 343
Data columns (total 42 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Activity ID             181 non-null    int64         
 1   Activity Date           181 non-null    datetime64[ns]
 2   Activity Name           181 non-null    object        
 3   Activity Type           181 non-null    object        
 4   Elapsed Time            181 non-null    int64         
 5   Distance                181 non-null    float64       
 6   Max Heart Rate          181 non-null    float64       
 7   Relative Effort         181 non-null    float64       
 8   Commute                 181 non-null    bool          
 9   Activity Gear           181 non-null    object        
 10  Filename                181 non-null    object        
 11  Bike Weight             181 non-null    float64       
 12  Elapsed Time.1          181 non-null    float64       


In [10]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the rmse function
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

# Split the dataset into features (X) and target variable (y)
X = df_ride_clean[['Distance', 'Elevation Gain', 'Power Count']]
y = df_ride_clean['Moving Time']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM model
svm_model = SVR(kernel='rbf')  # Radial Basis Function (RBF) kernel is commonly used for SVM regression
svm_model.fit(X_train_scaled, y_train)
# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
RMSE_svr = rmse(y_test, y_pred)

print("Performance Metrics of SVR :")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared Score (R2): {r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {RMSE_svr:.2f}")

Performance Metrics of SVR :
Mean Squared Error (MSE): 15036383.63
Mean Absolute Error (MAE): 2515.47
R-squared Score (R2): -0.00
Root Mean Squared Error (RMSE): 3877.68


In [11]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust the number of trees (n_estimators) as needed
rf_model.fit(X_train_scaled, y_train)

# Define the rmse function
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
RMSE_rf = rmse(y_test, y_pred)

print("Random Forest Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"R-squared Score (R2): {r2_rf:.2f}")
print(f"Root Mean Squared Error (RMSE): {RMSE_rf:.2f}")


Random Forest Performance Metrics:
Mean Squared Error (MSE): 25012.36
Mean Absolute Error (MAE): 122.38
R-squared Score (R2): 1.00
Root Mean Squared Error (RMSE): 3877.68


In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model on the training data
gb_regressor.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred_gb = gb_regressor.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred_gb)
mae_rf = mean_absolute_error(y_test, y_pred_gb)
r2_rf = r2_score(y_test, y_pred_gb)
RMSE_rf = rmse(y_test, y_pred_gb)

print(" Gradient Bossting Regressor Performance Metrics: ")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"R-squared Score (R2): {r2_rf:.2f}")
print(f"Root Mean Squared Error (RMSE): {RMSE_rf:.2f}")



Gradient Bossting Regressor Performance Metrics:
Mean Squared Error (MSE): 25012.36
Mean Absolute Error (MAE): 149.68
R-squared Score (R2): 1.00
Root Mean Squared Error (RMSE): 185.26


In [13]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
RMSE_lr = rmse(y_test, y_pred)

print("Performance Metrics of Linear Regession:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared Score (R2): {r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {RMSE_lr:.2f}")

Performance Metrics of Linear Regession:
Mean Squared Error (MSE): 13402.32
Mean Absolute Error (MAE): 85.58
R-squared Score (R2): 1.00
Root Mean Squared Error (RMSE): 115.77
