In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
# The path is relative to the notebook's working directory.
file_path = 'fitness_tracker_dataset.csv'
fitness_data = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Overview:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that emitted a stray "None" line).
fitness_data.info()
print("\nFirst Few Rows:")
print(fitness_data.head())

In [None]:
# Handling missing values by forward filling.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases. Rebinding instead of inplace=True keeps
# the cell safe to re-run.
fitness_data = fitness_data.ffill()

# Create activity level from a composite score: each input column is scaled by
# its own maximum and combined with the weights below.
score_weights = {
    'steps': 0.5,
    'heart_rate_avg': 0.3,
    'active_minutes': 0.2,
}
activity_score = sum(
    weight * (fitness_data[column] / fitness_data[column].max())
    for column, weight in score_weights.items()
)

# Split the score into three equal-frequency bins (terciles).
# The score is kept as a local Series, so no temporary column has to be
# added to and then dropped from fitness_data.
fitness_data['activity_level'] = pd.qcut(
    activity_score,
    q=3,
    labels=["Low", "Moderate", "High"]
)

# NOTE(review): 'activity_level' is a deterministic function of 'steps',
# 'heart_rate_avg' and 'active_minutes', and those three columns remain in
# fitness_data_encoded. Any model trained on this frame to predict
# 'activity_level' can recover the label directly from its inputs (target
# leakage) — confirm this is intended before trusting accuracy numbers.

# Drop identifier/date columns, then one-hot encode the categorical variables.
fitness_data_encoded = pd.get_dummies(
    fitness_data.drop(columns=['date', 'user_id']),
    columns=['workout_type', 'weather_conditions', 'location', 'mood']
)

print("Processed Dataset:")
print(fitness_data_encoded.head())

In [None]:
# --------- Feature Engineering (New) ---------

# Shared denominator: the +1 avoids division by zero when active_minutes is 0.
active_minutes_offset = fitness_data['active_minutes'] + 1

engineered_features = {
    # Activity Intensity Score: higher when steps and heart rate are both high
    'activity_intensity': (fitness_data['steps'] * fitness_data['heart_rate_avg']) / active_minutes_offset,
    # Sleep Efficiency Ratio: measures rest per active minute
    'sleep_efficiency': fitness_data['sleep_hours'] / active_minutes_offset,
    # Calorie Burn Rate: calories per active minute
    'calorie_burn_rate': fitness_data['calories_burned'] / active_minutes_offset,
}

for feature_name, feature_values in engineered_features.items():
    # Cap extreme values at the feature's own 99th percentile.
    capped_values = feature_values.clip(upper=feature_values.quantile(0.99))
    fitness_data[feature_name] = capped_values
    # fitness_data_encoded was built before these features existed, so they
    # have to be added to it explicitly; otherwise the engineered columns never
    # reach the frame that the modelling cells split into X and y.
    fitness_data_encoded[feature_name] = capped_values

# Print a preview to verify (head() instead of dumping the whole frame)
print("Feature engineered columns added:")
print(fitness_data[list(engineered_features)].head())

print("Processed Dataset:")
print(fitness_data_encoded.head())

In [None]:
# Visualize the distribution of activity levels
plt.figure(figsize=(8, 5))
sns.countplot(data=fitness_data, x='activity_level', palette="viridis")
plt.title("Activity Level Distribution")
plt.show()

# Correlation heatmap
# At this point 'activity_level' in fitness_data_encoded is still a categorical
# column with string labels. On pandas >= 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises instead, so restrict the frame to
# numeric and boolean (one-hot) columns before computing correlations.
numeric_features = fitness_data_encoded.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_features.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


# Assume fitness_data_encoded is already preprocessed

# Encode the target column (activity_level) into integer codes.
# The encoded values go into a separate Series instead of overwriting the
# column in fitness_data_encoded: overwriting made the cell non-idempotent
# (a second run re-fitted the encoder on integers and lost the class names).
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(fitness_data_encoded['activity_level']),
    index=fitness_data_encoded.index,
    name='activity_level',
)

# Split features (X) and target (y)
# NOTE(review): X still contains 'steps', 'heart_rate_avg' and
# 'active_minutes', the columns the preprocessing cell used to derive
# activity_level, so the scores below are likely inflated by target leakage.
X = fitness_data_encoded.drop('activity_level', axis=1)

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training set only; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Verify class distribution
print("Class Distribution After SMOTE:")
print(y_train_smote.value_counts())

# Train Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict on test set
y_pred = model.predict(X_test)

# Decode predictions back to original labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test_decoded, y_pred_decoded))
print("\nClassification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

# Confusion Matrix
# Pin the row/column order to the encoder's classes and show them on the axes,
# so the cells of the heatmap can be read without guessing the label order.
class_names = list(label_encoder.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test_decoded, y_pred_decoded, labels=class_names),
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix - Random Forest with SMOTE")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()



In [None]:
# Initialize Logistic Regression Model
# NOTE(review): no feature scaling is applied anywhere before this point;
# consider standardizing the features, since the solver may converge slowly on
# columns with very different ranges — confirm convergence.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the SMOTE-balanced training set
log_reg.fit(X_train_smote, y_train_smote)

# Predict on test set
y_pred_log_reg = log_reg.predict(X_test)

# y_test / predictions hold the integer codes produced by label_encoder;
# map them back to class names so the report and plot match the Random Forest
# cell, which reports decoded labels.
class_names = [str(name) for name in label_encoder.classes_]
class_codes = np.arange(len(class_names))

# Evaluate the model
print("Logistic Regression:")
print("Accuracy Score:", accuracy_score(y_test, y_pred_log_reg))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_log_reg, labels=class_codes, target_names=class_names),
)

# Confusion Matrix for Logistic Regression
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred_log_reg, labels=class_codes),
    annot=True,
    fmt="d",
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Logistic Regression - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()


In [None]:
# X_test is deliberately left as the DataFrame produced by train_test_split.
# The previous np.array(X_train) / np.array(X_test) conversion was removed:
# the model is fitted on X_train_smote (a DataFrame), so predicting on a bare
# array drops the feature names, and rebinding X_test changed what every later
# cell received. X_train was not used after the conversion at all.

# Initialize and train KNN
# NOTE(review): KNN is distance-based and the features are not scaled anywhere
# in this notebook, so large-magnitude columns will likely dominate the
# neighbour search — consider standardizing first.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smote, y_train_smote)

# Predict on test set
y_pred_knn = knn.predict(X_test)

# Map the integer codes back to class names for the report and the plot.
class_names = [str(name) for name in label_encoder.classes_]
class_codes = np.arange(len(class_names))

# Evaluate the model
print("\nK-Nearest Neighbors (KNN):")
print("Accuracy Score:", accuracy_score(y_test, y_pred_knn))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_knn, labels=class_codes, target_names=class_names),
)

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred_knn, labels=class_codes),
    annot=True,
    fmt="d",
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("KNN - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from xgboost import XGBClassifier

# Initialize XGBoost Classifier
# - 'mlogloss' is the multi-class log-loss metric; 'logloss' is the binary one
#   and does not match the three-class activity_level target.
# - use_label_encoder is no longer passed: it is deprecated in XGBoost and only
#   produces an "unused parameter" warning on current releases. The target is
#   already integer-encoded by label_encoder.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Train the model on the SMOTE-balanced training set
xgb.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_xgb = xgb.predict(X_test)

# Map the integer codes back to class names for the report and the plot.
class_names = [str(name) for name in label_encoder.classes_]
class_codes = np.arange(len(class_names))

# Evaluate the model
print("XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb))
print(
    "\nXGBoost Classification Report:\n",
    classification_report(y_test, y_pred_xgb, labels=class_codes, target_names=class_names),
)

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred_xgb, labels=class_codes),
    annot=True,
    fmt="d",
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
# Title previously read "xbg", a typo for the model name.
plt.title("XGBoost - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()