In [1]:
import pymongo
import json
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from datetime import time
from tqdm import tqdm
import datetime
import statistics

# Client for the MongoDB server at localhost:27017, and a handle to its
# "studentlife" database; every query in this notebook goes through `mydb`.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["studentlife"]

In [2]:
# Collection queried by the cells below; the fields they read from each
# document are "uid", "level" and "timestamp".
collection_stress = mydb["ema_Stress"]

In [3]:
# Placeholder for the combined per-student results; it is populated by the
# driver loop further down.
df_data = DataFrame()

In [4]:
# Student identifiers to process. The list is not contiguous (e.g. there is no
# 'u06' or 'u11'), so it must be iterated as-is rather than generated by range.
student_uid = ['u00', 'u01', 'u02', 'u03', 'u04', 'u05', 'u07', 'u08', 'u09', 'u10', 'u12', 'u13', 'u14', 'u15', 'u16',
                'u17', 'u18', 'u19', 'u20', 'u22', 'u23', 'u24', 'u25', 'u27', 'u30', 'u31', 'u32', 'u33', 'u34', 'u35',
                'u36', 'u39', 'u41', 'u42', 'u43', 'u44', 'u45', 'u46', 'u47', 'u49', 'u50', 'u51', 'u52', 'u53', 'u54',
                'u56', 'u57', 'u58', 'u59']

In [5]:
# Named datetime.time values marking the start of each part of the day.
# NOTE(review): none of these is referenced by any cell visible in this
# notebook — confirm whether they are still needed.
morning_start_time = time(6, 0, 0)    # 6:00 AM
afternoon_start_time = time(12, 0, 0) # 12:00 PM
evening_start_time = time(18, 0, 0)   # 6:00 PM
night_start_time = time(0, 0, 0)      # 12:00 AM (midnight)

In [6]:
def classify_stress(mean_stress, median_stress):
    """Label a mean stress value relative to a reference median.

    Args:
        mean_stress: the value being classified.
        median_stress: the reference value it is compared against.

    Returns:
        "medium stress" when the two are equal, "high stress" when
        ``mean_stress`` is greater, "low stress" when it is smaller.
        If none of the comparisons holds (e.g. one argument is NaN) the
        function falls through and returns None.
    """
    if mean_stress == median_stress:
        return "medium stress"
    if mean_stress > median_stress:
        return "high stress"
    if mean_stress < median_stress:
        return "low stress"

In [7]:
# Per-student median of the re-coded stress level, keyed by uid.
# Raw codes are re-ordered through the table below before the median is taken.
# NOTE(review): presumably this puts the raw EMA codes on an ascending stress
# scale — confirm against the dataset codebook.
remap_level = {5: 1, 4: 2, 1: 3, 2: 4, 3: 5}
median_levels = {}

for uid in student_uid:
    cursor = collection_stress.find({"uid": uid}, {"_id": 0, "level": 1})

    level_values = []
    for record in cursor:
        raw = record.get("level")
        if raw is None:
            continue
        # Codes outside the table are ignored, not passed through.
        recoded = remap_level.get(int(raw))
        if recoded is not None:
            level_values.append(recoded)

    # Students with no usable responses get no entry at all.
    if level_values:
        median_levels[uid] = statistics.median(level_values)

print(median_levels)

{'u00': 3.0, 'u01': 3.0, 'u02': 3.0, 'u03': 3, 'u04': 3.5, 'u05': 2.0, 'u07': 2, 'u08': 4, 'u09': 2, 'u10': 3.0, 'u12': 3.0, 'u14': 4, 'u15': 3.0, 'u16': 4, 'u17': 5.0, 'u18': 3.0, 'u19': 3, 'u20': 3, 'u22': 3, 'u23': 3.0, 'u24': 3, 'u25': 3.0, 'u27': 3, 'u30': 2.0, 'u31': 3.0, 'u32': 3.0, 'u33': 3.0, 'u34': 3.0, 'u35': 3, 'u36': 3.0, 'u39': 3.0, 'u41': 4, 'u42': 2, 'u43': 3, 'u44': 3, 'u45': 2, 'u46': 4, 'u47': 3, 'u49': 2, 'u50': 4.0, 'u51': 3.0, 'u52': 3, 'u53': 3.0, 'u54': 3.0, 'u56': 2, 'u57': 3, 'u58': 3, 'u59': 3}


In [8]:
def process_stress_data_for_uid(uid):
    """Classify each day of one student's stress responses.

    Fetches every document for ``uid`` from ``collection_stress``, re-codes
    the raw ``level``, groups the re-coded values by UTC calendar date of
    ``timestamp``, and labels each day by comparing that day's mean against
    the student's overall median from ``median_levels``.

    Documents are skipped when ``timestamp`` or ``level`` is missing or None,
    or when ``level`` is not one of the codes in the re-coding table. This
    mirrors the filtering used when ``median_levels`` was computed, so the
    daily mean and the median are taken over the same set of values.

    Args:
        uid: student identifier used as the query filter.

    Returns:
        pandas.DataFrame with columns ``uid``, ``date`` (``YYYY-MM-DD``
        string) and ``stress_ratings`` (label from ``classify_stress``), one
        row per day with at least one usable response, in first-seen date
        order. The frame is empty, with the same columns, if there are none.

    Raises:
        ValueError, TypeError: if ``level`` or ``timestamp`` cannot be
            converted with ``int()``.
    """
    # Kept local so the function does not depend on any other cell's helper.
    level_remap = {5: 1, 4: 2, 1: 3, 2: 4, 3: 5}
    daily_stress_data = {}

    for doc in collection_stress.find({"uid": uid}):
        timestamp = doc.get("timestamp")
        raw_level = doc.get("level")
        if timestamp is None or raw_level is None:
            continue

        level = level_remap.get(int(raw_level))
        if level is None:
            # Unknown code: leave it out rather than mixing a raw code into
            # the mean of re-coded values.
            continue

        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # it produces the same UTC date string.
        current_date = datetime.datetime.fromtimestamp(
            int(timestamp), tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")
        daily_stress_data.setdefault(current_date, []).append(level)

    # Loop-invariant; 0 is the fallback the original used for a uid that has
    # no entry in median_levels.
    uid_median = median_levels.get(uid, 0)

    # Build all rows first and construct the frame once, instead of growing it
    # with one pd.concat per day.
    records = [
        {
            "uid": uid,
            "date": date,
            "stress_ratings": classify_stress(statistics.mean(stress_levels), uid_median),
        }
        for date, stress_levels in daily_stress_data.items()
    ]
    return pd.DataFrame(records, columns=["uid", "date", "stress_ratings"])

In [9]:
# Build the combined table from scratch on every run. Frames are collected
# in a list and concatenated once, so re-running this cell replaces df_data
# instead of appending a second copy of every row to it.
per_student_frames = []
for uid in tqdm(student_uid):
    print(uid)
    per_student_frames.append(process_stress_data_for_uid(str(uid)))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there is nothing to combine.
df_data = (
    pd.concat(per_student_frames, ignore_index=True)
    if per_student_frames
    else DataFrame(columns=["uid", "date", "stress_ratings"])
)

 55%|███████████████████████▏                  | 27/49 [00:00<00:00, 267.88it/s]

u00
u01
u02
u03
u04
u05
u07
u08
u09
u10
u12
u13
u14
u15
u16
u17
u18
u19
u20
u22
u23
u24
u25
u27
u30
u31
u32
u33
u34
u35
u36
u39
u41
u42
u43
u44
u45
u46
u47
u49
u50
u51
u52
u53
u54
u56
u57
u58
u59


100%|██████████████████████████████████████████| 49/49 [00:00<00:00, 261.05it/s]


In [10]:
# Sanity check: column names of the combined table.
df_data.columns

Index(['uid', 'date', 'stress_ratings'], dtype='object')

In [11]:
# Display the combined table (one row per student per day).
df_data

Unnamed: 0,uid,date,stress_ratings
0,u00,2013-03-25,high stress
1,u00,2013-03-26,high stress
2,u00,2013-03-27,medium stress
3,u00,2013-03-28,high stress
4,u00,2013-03-29,low stress
...,...,...,...
1239,u59,2013-06-03,low stress
1240,u59,2013-06-04,low stress
1241,u59,2013-06-05,low stress
1242,u59,2013-06-06,low stress


In [12]:
# Persist the combined table to the working directory. With the default
# index=True the row index is also written, as an unnamed first column.
df_data.to_csv('recreating_dailystress_features.csv')

In [13]:
# Count of missing values in each column.
df_data.isnull().sum()

uid               0
date              0
stress_ratings    0
dtype: int64

In [14]:
# Total number of rows in the combined table.
len(df_data)

1244

In [15]:
# Number of rows carrying each stress label.
df_data['stress_ratings'].value_counts()

stress_ratings
medium stress    493
high stress      378
low stress       373
Name: count, dtype: int64

In [16]:
# Preview the first three rows.
df_data.head(3)

Unnamed: 0,uid,date,stress_ratings
0,u00,2013-03-25,high stress
1,u00,2013-03-26,high stress
2,u00,2013-03-27,medium stress


In [17]:
# Number of distinct students present in the table, then rows per student.
# The count can be lower than len(student_uid): a student whose query yields
# no usable responses contributes no rows.
print(len(df_data['uid'].unique()))
df_data['uid'].value_counts()

48


uid
u59    70
u19    55
u16    54
u44    47
u00    46
u57    45
u49    44
u58    40
u10    40
u08    39
u36    36
u32    36
u33    35
u17    35
u43    34
u51    32
u22    31
u52    30
u35    29
u53    28
u04    27
u14    27
u02    27
u01    26
u23    24
u24    23
u12    22
u46    21
u45    20
u07    19
u30    19
u42    19
u03    19
u27    17
u54    16
u56    16
u25    14
u18    13
u31    12
u47    11
u41    10
u20     8
u15     7
u39     6
u34     5
u05     4
u50     3
u09     3
Name: count, dtype: int64