<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [22]:
# Standard library
import time
from itertools import product

# Data manipulation and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Numerical helpers
from scipy.special import expit, softmax

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Evaluation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate

## Customizing DataFrame Output Formatting
# Show every column (no truncation) and render floats with three decimals.
display_settings = {
    'display.max_columns': None,
    'display.float_format': '{:.3f}'.format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading Gym Members Exercise Dataset</h2>

In [2]:
# Read the exercise-tracking CSV; the relative path resolves against the working directory.
dataset_path = 'gym_members_exercise_tracking.csv'
df = pd.read_csv(dataset_path)

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
# df.shape is (row count, column count); build the sentence first, then emit it.
shape_message = f'The data has {df.shape[0]} rows and {df.shape[1]} columns.'
print(shape_message)

The data has 973 rows and 15 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Column labels of the loaded frame, in file order
all_column_names = df.columns

# Pre-format one "N. name" entry per column (numbering starts at 1)
numbered_lines = [f"{idx}. {col}" for idx, col in enumerate(all_column_names, start=1)]

print("All Columns:")
for line in numbered_lines:
    print(line)


All Columns:
1. Age
2. Gender
3. Weight (kg)
4. Height (m)
5. Max_BPM
6. Avg_BPM
7. Resting_BPM
8. Session_Duration (hours)
9. Calories_Burned
10. Workout_Type
11. Fat_Percentage
12. Water_Intake (liters)
13. Workout_Frequency (days/week)
14. Experience_Level
15. BMI


<h2 style="color: purple;">2.4 Identify Numeric and Non-Numeric Features</h2>  

In [5]:
# Partition the column labels by dtype family; both Index objects are reused by later cells.
numeric_column_names = df.select_dtypes(include=['number']).columns
non_numeric_column_names = df.select_dtypes(exclude=['number']).columns

# Heading -> column labels, in the order the groups are printed
columns_dict = {
    "Numeric Columns": numeric_column_names.tolist(),
    "Non-Numeric Columns": non_numeric_column_names.tolist(),
}

# One numbered line per column, annotated with its distinct-value count
for category in columns_dict:
    print(f"{category}:")
    columns = columns_dict[category]
    for idx, col in enumerate(columns, 1):
        unique_values_count = df[col].nunique()
        print(f"  {idx}. {col} (Unique Values: {unique_values_count})")
    print()  # blank separator after each group


Numeric Columns:
  1. Age (Unique Values: 42)
  2. Weight (kg) (Unique Values: 532)
  3. Height (m) (Unique Values: 51)
  4. Max_BPM (Unique Values: 40)
  5. Avg_BPM (Unique Values: 50)
  6. Resting_BPM (Unique Values: 25)
  7. Session_Duration (hours) (Unique Values: 147)
  8. Calories_Burned (Unique Values: 621)
  9. Fat_Percentage (Unique Values: 239)
  10. Water_Intake (liters) (Unique Values: 23)
  11. Workout_Frequency (days/week) (Unique Values: 4)
  12. Experience_Level (Unique Values: 3)
  13. BMI (Unique Values: 771)

Non-Numeric Columns:
  1. Gender (Unique Values: 2)
  2. Workout_Type (Unique Values: 4)



<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
# Lift the column-display cap so the wide preview is not truncated.
pd.set_option('display.max_columns', None)
preview_rows = 5
df.head(preview_rows)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
# Per-column null counts are summed again to get a single dataset-wide total.
missing_report = f'Total number of missing values: {df.isnull().sum().sum()}'
print(missing_report)

Total number of missing values: 0


<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [8]:
# duplicated() marks repeats of an earlier row, so the sum is the number of redundant rows.
duplicate_report = f'There are {df.duplicated().sum()} duplicate rows in the dataset.'
print(duplicate_report)

There are 0 duplicate rows in the dataset.


<h2 style="color: purple;">2.8 Summary Statistics for Numeric Columns</h2> 

In [9]:
# Three-decimal float rendering keeps the summary table compact.
pd.set_option('display.float_format', '{:.3f}'.format)

# Restrict to numeric dtypes, then show count/mean/std/min/quartiles/max per column.
numeric_columns = df.select_dtypes(include=['number'])
numeric_summary = numeric_columns.describe()
numeric_summary

Unnamed: 0,Age,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
count,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0
mean,38.683,73.855,1.723,179.884,143.767,62.223,1.256,905.422,24.977,2.627,3.322,1.81,24.912
std,12.181,21.208,0.128,11.526,14.345,7.327,0.343,272.642,6.259,0.6,0.913,0.74,6.661
min,18.0,40.0,1.5,160.0,120.0,50.0,0.5,303.0,10.0,1.5,2.0,1.0,12.32
25%,28.0,58.1,1.62,170.0,131.0,56.0,1.04,720.0,21.3,2.2,3.0,1.0,20.11
50%,40.0,70.0,1.71,180.0,143.0,62.0,1.26,893.0,26.2,2.6,3.0,2.0,24.16
75%,49.0,86.0,1.8,190.0,156.0,68.0,1.46,1076.0,29.3,3.1,4.0,2.0,28.56
max,59.0,129.9,2.0,199.0,169.0,74.0,2.0,1783.0,35.0,3.7,5.0,3.0,49.84


<h2 style="color: purple;">2.9 Summary Statistics for Categorical Columns</h2> 

In [10]:
# Everything that is not a numeric dtype is treated as categorical here.
non_numeric_columns = df.select_dtypes(exclude=['number'])
print("Non-Numeric Columns Summary:")

# include='all' yields count / unique / top / freq for the object columns.
categorical_summary = non_numeric_columns.describe(include='all')
categorical_summary

Non-Numeric Columns Summary:


Unnamed: 0,Gender,Workout_Type
count,973,973
unique,2,4
top,Male,Strength
freq,511,258


<h2 style="color: purple;">2.10 Displaying Frequency of Categorical Data</h2> 

In [11]:
# Rule printed after each column's listing
separator = "-" * 50

for col in non_numeric_column_names:
    print(f"\nValue counts for {col}:")

    # value_counts() orders categories from most to least frequent
    frequencies = df[col].value_counts()
    for idx, (value, count) in enumerate(frequencies.items(), 1):
        print(f"{idx}. {value}: {count}")

    print(separator)



Value counts for Gender:
1. Male: 511
2. Female: 462
--------------------------------------------------

Value counts for Workout_Type:
1. Strength: 258
2. Cardio: 255
3. Yoga: 239
4. HIIT: 221
--------------------------------------------------


<h1 style="color: brown;">3. Preprocessing </h1> 

<h2 style="color: purple;">3.1 Preprocessing Non-Numeric Columns for Machine Learning</h2>  

In [12]:
# One encoder instance, refit for every binary column it is applied to
label_encoder = LabelEncoder()

# Encode each non-numeric column according to its cardinality:
#   exactly 2 distinct values -> integer labels in place
#   3 or more distinct values -> one indicator column per value
for col in non_numeric_column_names:
    unique_values_count = df[col].nunique()

    if unique_values_count >= 3:
        # Build the indicator columns, then swap them in for the original column
        df_encoded = pd.get_dummies(df[col], prefix=col, dtype=int)
        df = pd.concat([df.drop(columns=[col]), df_encoded], axis=1)

        print(f"\nOne-Hot Encoding applied to column '{col}':")
        print(f"  Created columns: {list(df_encoded.columns)}")
        continue

    if unique_values_count != 2:
        # Fewer than two distinct values: the column is left untouched
        continue

    # Binary column: overwrite it with the encoder's integer codes
    df[col] = label_encoder.fit_transform(df[col])

    # Show which original value maps to which code
    print(f"\nLabel Encoding for column '{col}':")
    mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print(f"  {mapping}")


Label Encoding for column 'Gender':
  {'Female': 0, 'Male': 1}

One-Hot Encoding applied to column 'Workout_Type':
  Created columns: ['Workout_Type_Cardio', 'Workout_Type_HIIT', 'Workout_Type_Strength', 'Workout_Type_Yoga']


In [13]:
# Heading first, then the first five rows of the encoded frame as the cell's rich output.
encoded_heading = "\nUpdated DataFrame with Encoded Columns:"
print(encoded_heading)
df.head(5)


Updated DataFrame with Encoded Columns:


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,Workout_Type_Cardio,Workout_Type_HIIT,Workout_Type_Strength,Workout_Type_Yoga
0,56,1,88.3,1.71,180,157,60,1.69,1313.0,12.6,3.5,4,3,30.2,0,0,0,1
1,46,0,74.9,1.53,179,151,66,1.3,883.0,33.9,2.1,4,2,32.0,0,1,0,0
2,32,0,68.1,1.66,167,122,54,1.11,677.0,33.4,2.3,4,2,24.71,1,0,0,0
3,25,1,53.2,1.7,190,164,56,0.59,532.0,28.8,2.1,3,1,18.41,0,0,1,0
4,38,1,46.1,1.79,188,158,68,0.64,556.0,29.2,2.8,3,1,14.39,0,0,1,0


<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>

In [14]:
# Weekly workout frequency is the prediction target; every other column is a feature.
target_column = 'Workout_Frequency (days/week)'
y = df[target_column]
X = df.drop(columns=[target_column])

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 


In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [16]:
# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [18]:
# Candidate classifiers, seeded where the estimator accepts a seed.
model_list = [
    LogisticRegression(max_iter=10000, random_state=42),
    DecisionTreeClassifier(ccp_alpha=0.001, random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(algorithm='SAMME', random_state=42),
    SVC(probability=True, random_state=42),  # probability=True needed for log_loss
    GaussianNB(),
    KNeighborsClassifier(),
]


def _scores_to_proba(decision_scores):
    """Convert raw ``decision_function`` scores into rows that sum to 1.

    A 1-D score vector (binary problem) is passed through a sigmoid and expanded
    to two columns ``[1 - p, p]``; a 2-D score matrix (one column per class) gets
    a row-wise softmax. The result is not a calibrated probability estimate; it
    only makes log loss computable for models without ``predict_proba``.
    """
    decision_scores = np.asarray(decision_scores, dtype=float)
    if decision_scores.ndim == 1:
        positive = expit(decision_scores)
        return np.column_stack([1.0 - positive, positive])
    return softmax(decision_scores, axis=1)


results = []

for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    # Log loss needs per-class probabilities; fall back to converted decision
    # scores when the model does not expose predict_proba.
    if hasattr(model, "predict_proba"):
        preds_proba = model.predict_proba(X_test_scaled)
    elif hasattr(model, "decision_function"):
        preds_proba = _scores_to_proba(model.decision_function(X_test_scaled))
    else:
        preds_proba = None

    # zero_division=0 yields the same score as the default behaviour but without
    # the "ill-defined" warning when a class is never predicted.
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro', zero_division=0)
    prec = precision_score(y_test, preds, average='macro', zero_division=0)
    rec = recall_score(y_test, preds, average='macro', zero_division=0)

    # labels=model.classes_ ties each probability column to its class even if
    # the test split happens to miss one of the training classes.
    if preds_proba is not None:
        ll = log_loss(y_test, preds_proba, labels=model.classes_)
    else:
        ll = np.nan  # not available

    results.append((model.__class__.__name__, acc, f1, prec, rec, ll))

# One row per model, best accuracy first
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "F1_macro", "Precision_macro", "Recall_macro", "Log_loss"])
df_results = df_results.sort_values(by="Accuracy", ascending=False)

print(df_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                        Model  Accuracy  F1_macro  Precision_macro  \
2      RandomForestClassifier     0.538     0.540            0.553   
1      DecisionTreeClassifier     0.534     0.558            0.560   
5                         SVC     0.527     0.505            0.547   
0          LogisticRegression     0.503     0.523            0.527   
6                  GaussianNB     0.500     0.515            0.493   
3  GradientBoostingClassifier     0.497     0.514            0.513   
4          AdaBoostClassifier     0.479     0.335            0.252   
7        KNeighborsClassifier     0.462     0.481            0.484   

   Recall_macro  Log_loss  
2         0.530     0.752  
1         0.555    16.787  
5         0.487     0.834  
0         0.518     0.734  
6         0.630     3.535  
3         0.515     0.885  
4         0.500     1.281  
7         0.478     3.515  


In [19]:
# Locate the index label of the top accuracy, then pull that model's full record.
top_label = df_results['Accuracy'].idxmax()
best_row = df_results.loc[top_label]

best_model_name, best_accuracy = best_row['Model'], best_row['Accuracy']

print(f"Best model based on accuracy: {best_model_name} with accuracy = {best_accuracy:.4f}")

Best model based on accuracy: RandomForestClassifier with accuracy = 0.5377


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [21]:
# Create a validation split from your training data
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Define your hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2', None]  # you can add more if you want
}

best_score = 0
best_params = None

for combo in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    model = RandomForestClassifier(**params, random_state=42)
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    score = accuracy_score(y_val, preds)
    
    if score > best_score:
        best_score = score
        best_params = params

print("Best hyperparameters found (manual grid search):")
print(best_params)
print(f"Validation Accuracy with best params: {best_score:.4f}")

Best hyperparameters found (manual grid search):
{'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2, 'max_features': 'sqrt'}
Validation Accuracy with best params: 0.5036


In [None]:
<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [23]:
# Best parameters from grid search
best_params = {
    'n_estimators': 50,
    'max_depth': 10,
    'min_samples_split': 2,
    'max_features': 'sqrt'
}

# Initialize final model
final_model = RandomForestClassifier(**best_params, random_state=42)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'log_loss': 'neg_log_loss'
}

# Perform cross-validation
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Extract and convert metrics
accuracy_scores = cv_results['test_accuracy']
f1_scores = cv_results['test_f1_macro']
precision_scores = cv_results['test_precision_macro']
recall_scores = cv_results['test_recall_macro']
log_loss_scores = -cv_results['test_log_loss']  # convert to positive

# Print individual and mean scores
print("CV Accuracy scores:", accuracy_scores)
print("Mean CV Accuracy:", np.mean(accuracy_scores))

print("CV F1 (macro) scores:", f1_scores)
print("Mean CV F1 (macro):", np.mean(f1_scores))

print("CV Precision (macro) scores:", precision_scores)
print("Mean CV Precision (macro):", np.mean(precision_scores))

print("CV Recall (macro) scores:", recall_scores)
print("Mean CV Recall (macro):", np.mean(recall_scores))

print("CV Log Loss scores:", log_loss_scores)
print("Mean CV Log Loss:", np.mean(log_loss_scores))

CV Accuracy scores: [0.44525547 0.52205882 0.5        0.51470588 0.52205882]
Mean CV Accuracy: 0.5008158007728639
CV F1 (macro) scores: [0.43833762 0.52061208 0.51918048 0.5038329  0.50862163]
Mean CV F1 (macro): 0.49811694258232153
CV Precision (macro) scores: [0.44475754 0.50927343 0.50458439 0.50960341 0.56969675]
Mean CV Precision (macro): 0.5075831030300837
CV Recall (macro) scores: [0.43859428 0.5413961  0.54218252 0.50344116 0.49294434]
Mean CV Recall (macro): 0.5037116802233081
CV Log Loss scores: [0.77617886 0.76803228 0.75456709 0.75081832 0.76802441]
Mean CV Log Loss: 0.7635241917333733


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [24]:
# Retrain on the full training data
final_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_test_preds = final_model.predict(X_test_scaled)

# Evaluate
from sklearn.metrics import classification_report

print("Final Test Set Evaluation:\n")
print(classification_report(y_test, y_test_preds, digits=4))

Final Test Set Evaluation:

              precision    recall  f1-score   support

           2     0.5000    0.4762    0.4878        63
           3     0.4474    0.4679    0.4574       109
           4     0.5111    0.5169    0.5140        89
           5     0.5357    0.4839    0.5085        31

    accuracy                         0.4863       292
   macro avg     0.4985    0.4862    0.4919       292
weighted avg     0.4875    0.4863    0.4866       292

