In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, silhouette_score
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [111]:
df_new = pd.read_csv('eda_data.csv')

In [112]:
df_new

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,01-01-2023,4530,2543.02,16.10,613,1.5,176,Walking,Clear,Park,Tired
1,879,01-01-2023,11613,1720.76,8.10,352,6.3,128,Cycling,Fog,Park,Happy
2,152,01-01-2023,27335,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral
3,311,01-01-2023,13459,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired
4,759,01-01-2023,15378,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
364995,251,31-12-2023,14298,3333.78,5.57,942,11.4,136,Cycling,Fog,Home,Happy
364996,314,31-12-2023,14751,2465.31,17.93,590,5.4,116,Running,Snow,Other,Stressed
364997,58,31-12-2023,3340,2078.38,13.87,1137,0.2,99,Walking,Fog,Park,Happy
364998,606,31-12-2023,16148,3279.38,6.06,1107,1.2,65,Cycling,Fog,Home,Stressed


In [113]:
steps_per_calorie = df_new['steps'] / df_new['calories_burned']

In [114]:
df_new.insert(3, 'steps_per_calorie', steps_per_calorie)


In [115]:
df_new

Unnamed: 0,user_id,date,steps,steps_per_calorie,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,01-01-2023,4530,1.781347,2543.02,16.10,613,1.5,176,Walking,Clear,Park,Tired
1,879,01-01-2023,11613,6.748762,1720.76,8.10,352,6.3,128,Cycling,Fog,Park,Happy
2,152,01-01-2023,27335,16.019574,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral
3,311,01-01-2023,13459,4.621306,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired
4,759,01-01-2023,15378,4.597983,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
364995,251,31-12-2023,14298,4.288825,3333.78,5.57,942,11.4,136,Cycling,Fog,Home,Happy
364996,314,31-12-2023,14751,5.983426,2465.31,17.93,590,5.4,116,Running,Snow,Other,Stressed
364997,58,31-12-2023,3340,1.607021,2078.38,13.87,1137,0.2,99,Walking,Fog,Park,Happy
364998,606,31-12-2023,16148,4.924102,3279.38,6.06,1107,1.2,65,Cycling,Fog,Home,Stressed


In [116]:
df_new = df_new.drop(columns=['calories_burned','steps'])

In [117]:
df_new['date'] = pd.to_datetime(df_new['date'], format='%d-%m-%Y')

In [118]:
df_new['active_minutes'] = df_new['active_minutes']//60

In [119]:
df_new

Unnamed: 0,user_id,date,steps_per_calorie,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,2023-01-01,1.781347,16.10,10,1.5,176,Walking,Clear,Park,Tired
1,879,2023-01-01,6.748762,8.10,5,6.3,128,Cycling,Fog,Park,Happy
2,152,2023-01-01,16.019574,3.57,3,6.7,134,Yoga,Snow,Park,Neutral
3,311,2023-01-01,4.621306,6.41,22,11.6,116,Swimming,Rain,Office,Tired
4,759,2023-01-01,4.597983,17.88,0,7.4,84,Swimming,Rain,Office,Neutral
...,...,...,...,...,...,...,...,...,...,...,...
364995,251,2023-12-31,4.288825,5.57,15,11.4,136,Cycling,Fog,Home,Happy
364996,314,2023-12-31,5.983426,17.93,9,5.4,116,Running,Snow,Other,Stressed
364997,58,2023-12-31,1.607021,13.87,18,0.2,99,Walking,Fog,Park,Happy
364998,606,2023-12-31,4.924102,6.06,18,1.2,65,Cycling,Fog,Home,Stressed


In [120]:
df_new = df_new.sort_values(by=['date'])

In [121]:
df_new

Unnamed: 0,user_id,date,steps_per_calorie,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,2023-01-01,1.781347,16.10,10,1.5,176,Walking,Clear,Park,Tired
658,487,2023-01-01,0.462605,16.41,1,4.7,129,Walking,Snow,Park,Happy
659,990,2023-01-01,12.645253,0.99,3,9.2,115,Cycling,Fog,Other,Tired
660,694,2023-01-01,8.291206,14.23,8,4.2,65,Walking,Fog,Gym,Happy
661,939,2023-01-01,10.275483,3.93,4,5.7,158,Walking,Clear,Home,Neutral
...,...,...,...,...,...,...,...,...,...,...,...
364338,859,2023-12-31,1.928598,7.04,11,10.8,63,Yoga,Clear,Other,Happy
364339,795,2023-12-31,4.516547,15.86,22,11.6,60,Gym Workout,Fog,Home,Tired
364340,589,2023-12-31,10.417495,14.71,0,9.2,139,Cycling,Rain,Home,Stressed
364327,791,2023-12-31,0.645785,6.07,14,5.4,127,Gym Workout,Rain,Gym,Happy


In [122]:
def capped_cumsum(series, cap=20):
    cumsum = 0
    result = []
    for value in series:
         if cumsum + value <= cap:
             cumsum += value
             result.append(value)
    print(sum(result))

In [123]:
aggregation_functions = {
    'steps_per_calorie': 'sum',  # Sum numeric columns
    'distance_km': 'sum',
    'active_minutes': lambda x: capped_cumsum(x),  # Use the capped cumulative sum function
    'sleep_hours': 'mean',       # Average sleep
    'heart_rate_avg': 'mean',    # Average heart rate
    'workout_type': lambda x: x.iloc[0],  # Take the first occurrence
    'weather_conditions': lambda x: x.mode()[0],  # Most frequent weather condition
    'location': lambda x: x.mode()[0],     # Most frequent location
    'mood': lambda x: x.mode()[0]          # Most frequent mood
}


In [124]:
df_new = df_new.groupby(['date','user_id']).agg(aggregation_functions).reset_index()

In [125]:
df_new

Unnamed: 0,date,steps_per_calorie,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,2023-01-01,5816.530248,9908.35,20,6.0266,118.872,Walking,Clear,Home,Happy
1,2023-01-02,5863.920715,10001.94,20,6.1984,120.748,Gym Workout,Rain,Office,Stressed
2,2023-01-03,5939.816642,9956.44,20,5.9822,118.861,Swimming,Fog,Office,Stressed
3,2023-01-04,6128.948205,9955.26,20,6.1319,117.230,Gym Workout,Rain,Home,Neutral
4,2023-01-05,6045.792401,10179.08,20,5.8541,119.498,Gym Workout,Rain,Park,Stressed
...,...,...,...,...,...,...,...,...,...,...
360,2023-12-27,5887.917610,10119.00,20,5.9958,119.081,Cycling,Clear,Office,Happy
361,2023-12-28,5894.388567,9733.02,20,5.7732,119.155,Gym Workout,Rain,Gym,Happy
362,2023-12-29,5663.540429,10210.00,20,5.9871,120.528,Gym Workout,Fog,Home,Tired
363,2023-12-30,5854.204911,9889.94,20,6.0804,118.778,Cycling,Fog,Park,Stressed


In [126]:
data_new = pd.to_datetime(df_new['date']).dt.day_name()
df_new.insert(0, 'day_of_week', data_new)

In [127]:
df_new = df_new.drop(columns = ['date'])

In [128]:
dicts = {'Stressed':1,'Happy':2,'Tired':3,'Neutral':4}

In [129]:
label_encoder = LabelEncoder()
cols = ['day_of_week','workout_type', 'weather_conditions', 'location']
for col in cols:
    df_new[col] = label_encoder.fit_transform(df_new[col])

In [133]:
x = df_new.iloc[:,:-1]
y = df_new.iloc[:,-1]

In [134]:
y = y.map(dicts)

In [135]:
y.value_counts()

mood
1    98
2    96
3    87
4    84
Name: count, dtype: int64

In [136]:
smote=SMOTE(sampling_strategy='minority') 
x,y=smote.fit_resample(x,y)

In [137]:
scaler = StandardScaler()

columsn_scale = ['steps_per_calorie','distance_km','active_minutes','sleep_hours','heart_rate_avg']
# Step 4: Fit and transform the data
x[columsn_scale] = scaler.fit_transform(x[columsn_scale])

In [143]:
x.values.reshape(-1,1)
y.values.reshape(-1,1)

array([[2],
       [1],
       [1],
       [4],
       [1],
       [3],
       [4],
       [4],
       [3],
       [1],
       [1],
       [3],
       [2],
       [3],
       [3],
       [2],
       [2],
       [2],
       [1],
       [1],
       [3],
       [3],
       [2],
       [3],
       [4],
       [1],
       [1],
       [2],
       [1],
       [3],
       [1],
       [1],
       [1],
       [3],
       [4],
       [4],
       [4],
       [1],
       [2],
       [3],
       [4],
       [1],
       [1],
       [4],
       [2],
       [2],
       [1],
       [3],
       [3],
       [2],
       [2],
       [1],
       [1],
       [2],
       [4],
       [1],
       [4],
       [1],
       [4],
       [2],
       [1],
       [1],
       [2],
       [1],
       [3],
       [4],
       [1],
       [4],
       [1],
       [1],
       [2],
       [4],
       [2],
       [4],
       [2],
       [2],
       [1],
       [1],
       [4],
       [1],
       [2],
       [3],
       [3],
    

In [144]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [148]:
models = {
    
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'NaiveBayes': GaussianNB(),
}

In [149]:
for i,j in models.items():
    model_out = j.fit(x_train,y_train)
    mode_pre = model_out.predict(y_test)
    print("Accuracy : ", accuracy_score(y_train, y_pred_svc1))



ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [None]:
user_models = {}

In [None]:
# def filter_by_duration(group):
#     group['cumulative_duration'] = group['active_minutes'].cumsum()
#     #Filter the rows where cumulative duration is less than or equal to 20 hours
#     return group[group['cumulative_duration'] <= 20]

In [None]:
# df_new = df_new.groupby(['date'], group_keys=False).apply(filter_by_duration)

In [None]:
df_new = df_grouped

In [None]:
f