In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('fitness_tracker_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365000 entries, 0 to 364999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             365000 non-null  int64  
 1   date                365000 non-null  object 
 2   steps               365000 non-null  int64  
 3   calories_burned     365000 non-null  float64
 4   distance_km         365000 non-null  float64
 5   active_minutes      365000 non-null  int64  
 6   sleep_hours         365000 non-null  float64
 7   heart_rate_avg      365000 non-null  int64  
 8   workout_type        312847 non-null  object 
 9   weather_conditions  365000 non-null  object 
 10  location            365000 non-null  object 
 11  mood                365000 non-null  object 
dtypes: float64(3), int64(4), object(5)
memory usage: 33.4+ MB


In [4]:
df.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,2023-01-01,4530,2543.02,16.1,613,1.5,176,Walking,Clear,Park,Tired
1,879,2023-01-01,11613,1720.76,8.1,352,6.3,128,Cycling,Fog,Park,Happy
2,152,2023-01-01,27335,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral
3,311,2023-01-01,13459,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired
4,759,2023-01-01,15378,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral


In [5]:
from ipywidgets import interact
@interact
def filter_unique(column = list(df.columns)):
    return print(df[column].unique())

interactive(children=(Dropdown(description='column', options=('user_id', 'date', 'steps', 'calories_burned', '…

In [6]:
df['date'] = pd.to_datetime(df['date'])

In [7]:
df['active_hrs'] = df['active_minutes'] / 60
df.drop(columns=['active_minutes'], inplace=True)
df.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,active_hrs
0,468,2023-01-01,4530,2543.02,16.1,1.5,176,Walking,Clear,Park,Tired,10.216667
1,879,2023-01-01,11613,1720.76,8.1,6.3,128,Cycling,Fog,Park,Happy,5.866667
2,152,2023-01-01,27335,1706.35,3.57,6.7,134,Yoga,Snow,Park,Neutral,3.933333
3,311,2023-01-01,13459,2912.38,6.41,11.6,116,Swimming,Rain,Office,Tired,22.15
4,759,2023-01-01,15378,3344.51,17.88,7.4,84,Swimming,Rain,Office,Neutral,0.866667


In [8]:
df['total_hours'] = df['sleep_hours'] + df['active_hrs']
df = df[df['total_hours'] <= 24]
df.drop(columns=['total_hours'], inplace=True)
df.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,active_hrs
0,468,2023-01-01,4530,2543.02,16.1,1.5,176,Walking,Clear,Park,Tired,10.216667
1,879,2023-01-01,11613,1720.76,8.1,6.3,128,Cycling,Fog,Park,Happy,5.866667
2,152,2023-01-01,27335,1706.35,3.57,6.7,134,Yoga,Snow,Park,Neutral,3.933333
4,759,2023-01-01,15378,3344.51,17.88,7.4,84,Swimming,Rain,Office,Neutral,0.866667
6,245,2023-01-01,498,3553.37,3.31,7.4,154,Swimming,Snow,Gym,Stressed,8.9


In [9]:
imputer = SimpleImputer(strategy='most_frequent')
df['workout_type'] = imputer.fit_transform(df[['workout_type']]).ravel()

In [10]:
aggregation_rules = {
    'steps': 'mean',
    'calories_burned': 'mean',
    'distance_km': 'mean',
    'active_hrs': 'mean',
    'sleep_hours': 'mean',
    'heart_rate_avg': 'mean',
    'workout_type': lambda x: x.mode()[0], 
    'weather_conditions': lambda x: x.mode()[0],
    'location': lambda x: x.mode()[0],
    'mood': lambda x: x.mode()[0] 
}
df_ag = df.groupby(['user_id', 'date']).agg(aggregation_rules).reset_index()

In [11]:
df_ag['workout_type'] = df_ag['workout_type'].map({'Walking':0, 'Cycling':1, 'Yoga':2, 'Swimming':3, 'Gym Workout':4, 'Running':5})

In [12]:
df_ag['weather_conditions'] = df_ag['weather_conditions'].map({'Clear':0, 'Fog':1, 'Snow':2, 'Rain':3})

In [13]:
df_ag['location'] = df_ag['location'].map({'Park':0, 'Office':1, 'Home':2, 'Gym':3, 'Other':4})

In [14]:
df_ag = df_ag.sort_values(by=['user_id', 'date'])

In [15]:
df_ag['days_since_last_workout'] = df_ag.groupby('user_id')['date'].diff().dt.days.fillna(0).astype(int)
df_ag.loc[df_ag.groupby('user_id').head(1).index, 'days_since_last_workout'] = 0

In [16]:
df_ag.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,active_hrs,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,days_since_last_workout
0,1,2023-01-03,1762.0,3606.03,9.77,9.45,5.4,76.0,1,0,3,Neutral,0
1,1,2023-01-06,4195.0,3364.94,6.76,15.533333,3.1,122.0,3,3,1,Tired,3
2,1,2023-01-08,29530.0,1988.14,17.9,11.566667,10.0,60.0,1,0,1,Neutral,2
3,1,2023-01-09,9728.666667,2963.53,14.52,5.638889,6.766667,133.333333,4,0,0,Happy,1
4,1,2023-01-10,24103.0,3859.28,18.32,8.416667,4.1,76.0,4,1,1,Neutral,1


In [17]:
user_models = {}  
users_group = df_ag.groupby('user_id')

In [18]:
users_group.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,active_hrs,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,days_since_last_workout
0,1,2023-01-03,1762.000000,3606.030,9.770,9.450000,5.400000,76.000000,1,0,3,Neutral,0
1,1,2023-01-06,4195.000000,3364.940,6.760,15.533333,3.100000,122.000000,3,3,1,Tired,3
2,1,2023-01-08,29530.000000,1988.140,17.900,11.566667,10.000000,60.000000,1,0,1,Neutral,2
3,1,2023-01-09,9728.666667,2963.530,14.520,5.638889,6.766667,133.333333,4,0,0,Happy,1
4,1,2023-01-10,24103.000000,3859.280,18.320,8.416667,4.100000,76.000000,4,1,1,Neutral,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
192629,999,2023-01-02,22364.000000,3573.940,9.715,10.566667,6.600000,156.000000,4,3,3,Happy,0
192630,999,2023-01-03,5414.000000,2116.785,6.570,11.175000,3.850000,83.000000,4,1,1,Neutral,1
192631,999,2023-01-04,5513.500000,2655.785,15.680,2.991667,4.950000,143.000000,4,1,4,Stressed,1
192632,999,2023-01-05,13253.000000,2949.820,6.420,2.116667,5.100000,138.000000,3,3,3,Stressed,1


In [19]:
model_performance = {}
user_models = {}

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0]
}

label_encoder = LabelEncoder()

models = {
    'RandomForest': (RandomForestClassifier(random_state=110), param_grid_rf),
    'XGBoost': (XGBClassifier(random_state=110, eval_metric='logloss'), param_grid_xgb)
}

for user_id, user_data in users_group:
    user_data_balanced = resample(user_data, replace=True, n_samples=len(user_data), stratify=user_data['mood'], random_state=110)
    user_data_balanced['mood'] = label_encoder.fit_transform(user_data_balanced['mood'])
    
    X = user_data_balanced.drop(columns=['user_id', 'date', 'mood'])
    y = user_data_balanced['mood']

    best_accuracy = 0
    best_model = None
    best_model_name = None

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X, y)
        best_estimator = grid_search.best_estimator_

        y_pred = cross_val_predict(best_estimator, X, y, cv=5)
        accuracy = cross_val_score(best_estimator, X, y, cv=5).mean()

        print(f'User ID: {user_id} | Model: {model_name}')
        print(f'Best Parameters: {grid_search.best_params_}')
        print(f'Cross-Validation Accuracy: {accuracy * 100:.2f}%')
        print('Classification Report:')
        print(classification_report(y, y_pred))
        print('\n')

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = best_estimator
            best_model_name = model_name

    best_model.fit(X, y)
    user_models[user_id] = best_model
    model_performance[user_id] = (best_model_name, best_accuracy)

sorted_models = sorted(model_performance.items(), key=lambda item: item[1][1], reverse=True)
print("Sorted Models by Performance:")
for user_id, (model_name, accuracy) in sorted_models:
    print(f'User ID: {user_id} | Best Model: {model_name} | Accuracy: {accuracy * 100:.2f}%')

User ID: 1 | Model: RandomForest
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Cross-Validation Accuracy: 72.43%
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.75      0.72        64
           1       0.75      0.75      0.75        53
           2       0.70      0.68      0.69        38
           3       0.77      0.67      0.71        30

    accuracy                           0.72       185
   macro avg       0.73      0.71      0.72       185
weighted avg       0.73      0.72      0.72       185



User ID: 1 | Model: XGBoost
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Cross-Validation Accuracy: 73.51%
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.74        64
           1       0.81      0.79      0.80        53
           2       0.63      0.71      0.67      

In [25]:
top_accuracies = [accuracy for user_id, (model_name, accuracy) in sorted_models[:20]]
mean_accuracy = np.mean(top_accuracies)
print(f'Overall Accuracy: {mean_accuracy:.4f}')

Overall Accuracy: 0.8028


In [21]:
top_models = [model for user_id, accuracy in sorted_models[:20]]

In [34]:
def preprocess_data(data, df_ag):
    data['date'] = pd.to_datetime(data['date'])
    data['active_hrs'] = data['active_minutes'] / 60
    data.drop(columns=['active_minutes'], inplace=True)
    data['location'] = data['location'].map({'Park':0, 'Office':1, 'Home':2, 'Gym':3, 'Other':4})
    data['weather_conditions'] = data['weather_conditions'].map({'Clear':0, 'Fog':1, 'Snow':2, 'Rain':3})
    data['workout_type'] = data['workout_type'].map({'Walking':0, 'Cycling':1, 'Yoga':2, 'Swimming':3, 'Gym Workout':4, 'Running':5})
    user_id = data['user_id'].iloc[0]
    if user_id in df_ag['user_id'].unique():
        last_entry_date = df_ag[df_ag['user_id'] == user_id]['date'].max()
        data['days_since_last_workout'] = (data['date'] - last_entry_date).dt.days
    else:
        data['days_since_last_workout'] = 0
    return data

In [35]:
def predict_mood(new_data, df_ag):
    new_data = preprocess_data(new_data, df_ag)
    user_id = new_data['user_id'].iloc[0]
    X = new_data.drop(columns=['user_id', 'date'])
    required_columns = ['steps', 'calories_burned', 'distance_km', 'active_hrs',
                    'sleep_hours', 'heart_rate_avg', 'workout_type','weather_conditions', 'location', 'days_since_last_workout']
    X = X[required_columns]
    
    if user_id in user_models:
        model = user_models[user_id]
        mood_pred = model.predict(X)
    else:
        predictions = [model.predict(X)[0] for model in top_models]
        mood_pred = mode(predictions)
    
    mood_pred = label_encoder.inverse_transform([mood_pred])
    return mood_pred[0]

In [36]:
new_data = pd.DataFrame({
    'user_id': [999],
    'date': ['2024-02-02'],
    'steps': [1010],
    'calories_burned': [2750.02],
    'distance_km': [12.10],
    'active_minutes': [513],
    'sleep_hours': [8.5],
    'heart_rate_avg': [176],
    'workout_type': ['Walking'],
    'weather_conditions': ['Snow'],
    'location': ['Office']
})
predicted_mood = predict_mood(new_data, df_ag)
print(f'Predicted Mood: {predicted_mood}')


Predicted Mood: Neutral


  y = column_or_1d(y, warn=True)


In [26]:
import sklearn
import numpy as np
import joblib

print("scikit-learn version:", sklearn.__version__)
print("NumPy version:", np.__version__)
print("joblib version:", joblib.__version__)


scikit-learn version: 1.2.2
NumPy version: 1.26.4
joblib version: 1.2.0


In [38]:
import os
folder_path = 'C:\\Users\\lenovo\\OneDrive\\Desktop\\Final project\\aggregated_data'
file_path = os.path.join(folder_path, 'aggregated_data.csv')
df_ag.to_csv(file_path, index=False)
print(f'DataFrame successfully saved to {file_path}')

DataFrame successfully saved to C:\Users\lenovo\OneDrive\Desktop\Final project\aggregated_data\aggregated_data.csv


In [39]:
import joblib
import os

# Define the path for the models directory
models_dir = 'C:\\Users\\lenovo\\OneDrive\\Desktop\\Final project\\models'

# Create the directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

# Save the Label Encoder
label_encoder_path = os.path.join(models_dir, 'label_encoder.pkl')
joblib.dump(label_encoder, label_encoder_path)
print(f'Label Encoder saved at {label_encoder_path}')

# Save each model based on the sorted performance
for index, (user_id, (model_name, accuracy)) in enumerate(sorted_models):
    model = user_models[user_id]
    # Save the model using joblib
    file_path = os.path.join(models_dir, f'{user_id}_{model_name}_model.pkl')
    joblib.dump(model, file_path)
    print(f'Model for User ID {user_id} saved at {file_path}')


Label Encoder saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\label_encoder.pkl
Model for User ID 668 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\668_XGBoost_model.pkl
Model for User ID 452 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\452_RandomForest_model.pkl
Model for User ID 465 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\465_XGBoost_model.pkl
Model for User ID 530 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\530_XGBoost_model.pkl
Model for User ID 257 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\257_RandomForest_model.pkl
Model for User ID 278 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\278_XGBoost_model.pkl
Model for User ID 791 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\791_XGBoost_model.pkl
Model for User ID 61 saved at C:\Users\lenovo\OneDrive\Desktop\Final project\models\61_XGBoost_model.pkl
Model for User ID 409 saved at C:\Users\l