In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load the raw fitness-tracker export into a DataFrame.
file_path = 'fitness_tracker_dataset.csv'
fitness_data = pd.read_csv(file_path)

# Display basic information about the dataset.
# `DataFrame.info()` writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
print("Dataset Overview:")
fitness_data.info()
print("\nFirst Few Rows:")
print(fitness_data.head())


In [None]:
# Preprocessing: fill gaps, derive the target label, one-hot encode categoricals.

# Forward-fill missing values. `fillna(method='ffill')` is deprecated in recent
# pandas releases; `ffill()` is the supported spelling. Rebinding the name
# (instead of inplace=True) keeps this cell safe to re-run.
fitness_data = fitness_data.ffill()

# Feature engineering: bucket the step count into an "activity level" label.
step_counts = fitness_data['steps']
conditions = [
    step_counts < 5000,
    (step_counts >= 5000) & (step_counts < 10000),
    step_counts >= 10000,
]
choices = ['Low', 'Moderate', 'High']
# An explicit string default is required: np.select's implicit default is the
# integer 0, which cannot be combined with string choices (TypeError on recent
# NumPy, a bogus '0' label on older ones). Rows matching no condition (e.g. a
# step count that is still NaN after forward-filling) are labelled 'Unknown'.
fitness_data['activity_level'] = np.select(conditions, choices, default='Unknown')

# One-hot encode the categorical columns, then drop identifier columns that
# are not used as model features.
fitness_data_encoded = pd.get_dummies(
    fitness_data,
    columns=['workout_type', 'weather_conditions', 'location', 'mood'],
).drop(columns=['date', 'user_id'])

print("Processed Dataset:")
print(fitness_data_encoded.head())

In [None]:
# Exploratory plots: class balance of the derived label and feature correlations.

# Distribution of activity levels.
plt.figure(figsize=(8, 5))
sns.countplot(data=fitness_data, x='activity_level', palette="viridis")
plt.title("Activity Level Distribution")
plt.show()

# Correlation heatmap.
# 'activity_level' is still a string column at this point, and pandas >= 2.0
# raises on non-numeric columns unless numeric_only=True is passed, so the
# correlation is restricted to the numeric/boolean features.
feature_correlations = fitness_data_encoded.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(feature_correlations, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Time series of the total number of steps recorded on each date.

# Ensure 'date' is in datetime format so the x-axis is ordered chronologically.
fitness_data['date'] = pd.to_datetime(fitness_data['date'])

# Sum the steps of all records that share a date.
daily_steps = (
    fitness_data.groupby('date')['steps']
    .sum()
    .reset_index()
    .rename(columns={'date': 'Date', 'steps': 'TotalSteps'})
)

# Plot the total steps per day.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_steps['Date'], daily_steps['TotalSteps'], label='TotalSteps', color='blue', linewidth=1.5)

# Title and axis labels. The series is a per-day sum, not a mean, so the
# y-axis is labelled accordingly.
ax.set_title("Total Steps per Day", fontsize=16)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Total Steps", fontsize=12)

# Gridlines for readability.
ax.grid(alpha=0.5)

# Rotate x-axis labels so the dates do not overlap.
plt.xticks(rotation=45, fontsize=10)

ax.legend()
fig.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


# Random Forest baseline on the SMOTE-balanced training set.
# Assumes fitness_data_encoded is already preprocessed.

# Encode the target column (activity_level).
# The encoder is fit on the untouched string labels in `fitness_data` rather
# than on the column being overwritten, so re-running this cell never fits the
# encoder on already-encoded integers (which would make inverse_transform
# return integers instead of class names).
label_encoder = LabelEncoder()
fitness_data_encoded['activity_level'] = label_encoder.fit_transform(
    fitness_data.loc[fitness_data_encoded.index, 'activity_level']
)

# Split features (X) and target (y).
# NOTE(review): 'steps' remains in X while activity_level is defined purely by
# thresholds on 'steps', so scores from models trained on X mostly reflect that
# the label can be recovered directly from this one feature.
X = fitness_data_encoded.drop('activity_level', axis=1)
y = fitness_data_encoded['activity_level']

# Split into training and testing datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training set only; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Verify class distribution.
print("Class Distribution After SMOTE:")
print(y_train_smote.value_counts())

# Train Random Forest Classifier.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict on test set.
y_pred = model.predict(X_test)

# Decode predictions back to original labels.
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Evaluate the model.
print("Accuracy Score:", accuracy_score(y_test_decoded, y_pred_decoded))
print("\nClassification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

# Confusion matrix. Passing `labels` fixes the row/column order to the
# encoder's class order so the tick labels are guaranteed to line up.
class_names = list(label_encoder.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test_decoded, y_pred_decoded, labels=class_names),
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix - Random Forest with SMOTE")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()



In [None]:
# Logistic Regression baseline: fit on the SMOTE-balanced training data and
# score against the untouched test split.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_smote, y_train_smote)

# Predictions for the held-out rows.
y_pred_log_reg = log_reg.predict(X_test)

# Compute the metrics first, then report them.
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_report = classification_report(y_test, y_pred_log_reg)
print("Logistic Regression:")
print("Accuracy Score:", log_reg_accuracy)
print("\nClassification Report:\n", log_reg_report)

# Heatmap of the confusion matrix (rows: true class, columns: predicted class).
log_reg_confusion = confusion_matrix(y_test, y_pred_log_reg)
plt.figure(figsize=(8, 6))
log_reg_ax = sns.heatmap(log_reg_confusion, annot=True, fmt="d", cmap='Greens')
log_reg_ax.set_title("Logistic Regression - Confusion Matrix")
log_reg_ax.set_xlabel("Predicted Label")
log_reg_ax.set_ylabel("True Label")
plt.show()


In [None]:
# K-Nearest Neighbors baseline trained on the SMOTE-balanced training set.
#
# X_train / X_test are intentionally left as DataFrames: scikit-learn accepts
# them directly, the model is fit on the DataFrame `X_train_smote` (so
# predicting on a DataFrame keeps feature names consistent), and later cells
# still call DataFrame methods such as `.drop(columns=...)` on X_train/X_test.
# NOTE(review): KNN is distance-based and the features are not scaled here —
# consider standardising them before fitting.

# Initialize and train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smote, y_train_smote)

# Predict on test set
y_pred_knn = knn.predict(X_test)

# Evaluate the model
print("\nK-Nearest Neighbors (KNN):")
print("Accuracy Score:", accuracy_score(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_knn), annot=True, fmt="d", cmap='Greens')
plt.title("KNN - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from xgboost import XGBClassifier

# XGBoost baseline trained on the SMOTE-balanced training set.

# Initialize XGBoost Classifier.
# - The target has more than two classes, so the multi-class log loss
#   ('mlogloss') is the matching evaluation metric rather than binary 'logloss'.
# - `use_label_encoder` is obsolete in current XGBoost releases (labels are
#   already integer-encoded here), so it is no longer passed.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Train the model
xgb.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_xgb = xgb.predict(X_test)

# Evaluate the model
print("XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt="d", cmap='Greens')
plt.title("XGBoost - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 5-fold cross-validation of every classifier trained so far.
#
# SMOTE is applied INSIDE each fold through an imbalanced-learn Pipeline, and
# the folds are drawn from the original (un-resampled) training split.
# Cross-validating on data that was already oversampled lets synthetic points
# interpolated from validation-fold rows end up in the training folds, which
# inflates the reported accuracy.
cv_candidates = {
    "Random Forest": model,
    "Logistic Regression": log_reg,
    "KNN": knn,
    "XGBoost": xgb,
}

cv_scores_by_model = {}
for display_name, estimator in cv_candidates.items():
    # cross_val_score clones the pipeline for every fold, so the already
    # fitted estimators referenced above are not modified.
    resample_then_fit = ImbPipeline([
        ("smote", SMOTE(random_state=42)),
        ("classifier", estimator),
    ])
    fold_scores = cross_val_score(resample_then_fit, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores_by_model[display_name] = fold_scores

    print(f"{display_name} Cross-Validation Scores:", fold_scores)
    print(f"{display_name} Mean Accuracy:", fold_scores.mean())
    print(f"{display_name} Standard Deviation:", fold_scores.std())

# Keep the per-model result names available for later cells.
rf_cv_scores = cv_scores_by_model["Random Forest"]
log_reg_cv_scores = cv_scores_by_model["Logistic Regression"]
knn_cv_scores = cv_scores_by_model["KNN"]
xgb_cv_scores = cv_scores_by_model["XGBoost"]



In [None]:
# Random Forest re-trained without the 'steps' feature.

# Feature matrix with the 'steps' column removed.
X_no_steps = X.drop(columns='steps')

# Same 80/20 split parameters as the other experiments.
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_no_steps, y, test_size=0.2, random_state=42
)

# Oversample the training portion only.
smote = SMOTE(random_state=42)
X_train_smote_ns, y_train_smote_ns = smote.fit_resample(X_train_ns, y_train_ns)

# Fit the forest (class-weighted) on the resampled training data.
rf_no_steps_smote = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_no_steps_smote.fit(X_train_smote_ns, y_train_smote_ns)

# Score on the held-out rows.
y_pred_no_steps_smote = rf_no_steps_smote.predict(X_test_ns)
rf_ns_accuracy = accuracy_score(y_test_ns, y_pred_no_steps_smote)
rf_ns_report = classification_report(y_test_ns, y_pred_no_steps_smote)
print("Accuracy without 'steps' after SMOTE:", rf_ns_accuracy)
print("\nClassification Report without 'steps' after SMOTE:\n", rf_ns_report)

# Confusion matrix for the Random Forest without 'steps'.
rf_ns_confusion = confusion_matrix(y_test_ns, y_pred_no_steps_smote)
plt.figure(figsize=(8, 6))
rf_ns_ax = sns.heatmap(rf_ns_confusion, annot=True, fmt="d", cmap='Purples')
rf_ns_ax.set_title("Random Forest without steps - Confusion Matrix")
rf_ns_ax.set_xlabel("Predicted Label")
rf_ns_ax.set_ylabel("True Label")
plt.show()


In [None]:
# Logistic Regression re-trained without the 'steps' feature.

# Feature matrix with the 'steps' column removed.
X_no_steps = X.drop(columns='steps')

# Same 80/20 split parameters as the other experiments.
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_no_steps, y, test_size=0.2, random_state=42
)

# Oversample the training portion only.
smote = SMOTE(random_state=42)
X_train_smote_ns, y_train_smote_ns = smote.fit_resample(X_train_ns, y_train_ns)

# Fit the model on the resampled training data.
log_reg_no_steps_smote = LogisticRegression(max_iter=1000, random_state=42)
log_reg_no_steps_smote.fit(X_train_smote_ns, y_train_smote_ns)

# Score on the held-out rows.
y_pred_log_reg_no_steps_smote = log_reg_no_steps_smote.predict(X_test_ns)
log_reg_ns_accuracy = accuracy_score(y_test_ns, y_pred_log_reg_no_steps_smote)
log_reg_ns_report = classification_report(y_test_ns, y_pred_log_reg_no_steps_smote)
print("Accuracy without 'steps' after SMOTE (Logistic Regression):", log_reg_ns_accuracy)
print("\nClassification Report without 'steps' after SMOTE (Logistic Regression):\n", log_reg_ns_report)

# Confusion matrix for the Logistic Regression without 'steps'.
log_reg_ns_confusion = confusion_matrix(y_test_ns, y_pred_log_reg_no_steps_smote)
plt.figure(figsize=(8, 6))
log_reg_ns_ax = sns.heatmap(log_reg_ns_confusion, annot=True, fmt="d", cmap='Purples')
log_reg_ns_ax.set_title("Logistic Regression without steps - Confusion Matrix")
log_reg_ns_ax.set_xlabel("Predicted Label")
log_reg_ns_ax.set_ylabel("True Label")
plt.show()

In [None]:
# Distance-weighted KNN re-trained without the 'steps' feature.

# Drop 'steps' from features
X_no_steps = X.drop(['steps'], axis=1)

# Split the data into training and testing sets
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(X_no_steps, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote_ns, y_train_smote_ns = smote.fit_resample(X_train_ns, y_train_ns)

# The splits are passed to scikit-learn as DataFrames; converting them to
# NumPy arrays is unnecessary and would leave the *_ns variables with a
# different type than in the sibling "without steps" cells.

# Initialize and train KNN after SMOTE.
# The model gets its own name (matching rf_no_steps_smote /
# log_reg_no_steps_smote) so the `knn` model trained with 'steps' is not
# silently replaced for any cell that is run or re-run afterwards.
knn_no_steps_smote = KNeighborsClassifier(n_neighbors=5, weights='distance')  # Distance-weighted KNN
knn_no_steps_smote.fit(X_train_smote_ns, y_train_smote_ns)

# Predict on test set
y_pred_knn_smote = knn_no_steps_smote.predict(X_test_ns)

# Evaluate the model
print("\nK-Nearest Neighbors (KNN) Without 'steps' After SMOTE:")
print("Accuracy Score:", accuracy_score(y_test_ns, y_pred_knn_smote))
print("\nClassification Report:\n", classification_report(y_test_ns, y_pred_knn_smote))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test_ns, y_pred_knn_smote), annot=True, fmt="d", cmap='Purples')
plt.title("KNN Without 'steps' After SMOTE - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# XGBoost re-trained without the 'steps' feature.

# Drop the 'steps' attribute and split.
# The split is rebuilt from the DataFrame `X` instead of calling `.drop` on
# X_train / X_test, because an earlier cell may have replaced those with NumPy
# arrays (which have no `.drop`). Using the same test_size and random_state on
# the same rows selects the same train/test rows as the original split.
X_train_no_steps, X_test_no_steps, y_train_no_steps, y_test_no_steps = train_test_split(
    X.drop(columns=['steps']), y, test_size=0.2, random_state=42
)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_smote_no_steps, y_train_smote_no_steps = smote.fit_resample(X_train_no_steps, y_train_no_steps)

# Initialize XGBoost Classifier.
# - 'mlogloss' is the multi-class metric matching the multi-class target.
# - `use_label_encoder` is obsolete in current XGBoost releases and is omitted.
# - A dedicated name keeps the `xgb` model trained with 'steps' intact.
xgb_no_steps = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Train the model on the dataset without 'steps'
xgb_no_steps.fit(X_train_smote_no_steps, y_train_smote_no_steps)

# Predict on the test set
y_pred_xgb_no_steps = xgb_no_steps.predict(X_test_no_steps)

# Evaluate the model
print("XGBoost Accuracy Score Without 'steps':", accuracy_score(y_test_no_steps, y_pred_xgb_no_steps))
print("\nXGBoost Classification Report Without 'steps':\n", classification_report(y_test_no_steps, y_pred_xgb_no_steps))

# Confusion Matrix.
# Tick labels come from the fitted LabelEncoder: it assigns codes in sorted
# order of the class names, so a hand-written ['Low', 'Moderate', 'High'] list
# would put the wrong names on the rows/columns. `labels` pins the matrix to
# exactly one row/column per encoded class, in code order.
class_names = [str(name) for name in label_encoder.classes_]
class_codes = np.arange(len(class_names))
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test_no_steps, y_pred_xgb_no_steps, labels=class_codes),
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("XGBoost - Confusion Matrix Without 'steps'")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
