# Soldier Statistics 

In [188]:
import pandas as pd

In [2]:
## data loading ##
# Loads every input used by this notebook. All paths are relative to the
# directory the notebook is executed from.
#
# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this project's own pipeline.
#
# pd.read_pickle accepts a path and opens/closes the file itself, so no manual
# open() / file-handle management is needed.

# WHICH PEOPLE TO RETAIN #
df_watches_data = pd.read_excel('Processed_data/additional_data.xlsx')
people_to_work_with = set(df_watches_data['userId'])

#######################################################################################################################################
### UNPROCESSED FILES ###
entire_unprocessed_dataset = pd.read_pickle('Processed_data/unprocessed_all_data.pickle')

# Author's note: states are light sleep, awake_sleep, deep_sleep, awake (no '?' marker).
user_sleeping_unprocessed = pd.read_pickle('Processed_data/user_sleeping_unprocessed.pkl')

# Author's note: per soldier, unprocessed heart rate (15 secs) and
# ['sleep']['sleep'][week] = sleep data with Light Sleep, Deep Sleep, ?, Awake
# ('?' means there was no data for that minute); epochs contain duplicates and
# use a 15 minute interval.
# NOTE(review): the pre-processing statistics read ['sleep']['sleep_by_minutes'],
# so the key path in the note above may be out of date - confirm.
soldiers_unprocessed = pd.read_pickle('Processed_data/soldiers_unprocessed.pkl')

#######################################################################################################################################
### PROCESSED FILES ###
heart_rate_one_minute = pd.read_pickle('Processed_data/hr_one_minute.pkl')

input_file_sleep_weekly_012 = 'Processed_data/sleep_readable.pickle'  # File path of the pickle file
user_sleep_readable = pd.read_pickle(input_file_sleep_weekly_012)

steps_hr_file = 'Processed_data/merge_dict_hr_steps_distance.pickle'  # File path of the pickle file
# NOTE(review): the variable is called `sleep_hr_dict` although the file name
# suggests heart-rate / steps / distance data. The name is kept unchanged in
# case other cells rely on it.
sleep_hr_dict = pd.read_pickle(steps_hr_file)

## Pre Processing Stats

In [195]:
# Per-soldier coverage statistics for the UNPROCESSED minute-level sleep data.
# Produces `soldier_stats_preprocessing`: one row per soldier (indexed by
# 'Soldier') plus a final 'average' row.
columns = [
    'Soldier', 
    'Amount of data in minutes', 
    'Amount of filled sleep data', 
    'Amount of filled awake data', 
    'Amount of missing data', 
    'Percent of data filled', 
    'Percent of data missing', 
    'Amount of filled sleep during the day', 
    'Percent of day filled in as sleep'
]

# A minute counts as "day" when day_start_hour <= hour < day_end_hour.
day_start_hour = 6
day_end_hour = 19

# Collect plain records and build the DataFrame once, instead of creating and
# concatenating one single-row DataFrame per soldier.
records = []
for soldier_id in soldiers_unprocessed.keys():
    # Count each (Date, Hour, Minute) only once; drop_duplicates returns a new
    # frame, so the loaded data is not modified.
    sleep_df = soldiers_unprocessed[soldier_id]['sleep']['sleep_by_minutes'].drop_duplicates(
        subset=['Date', 'Hour', 'Minute']
    )

    total_data_minutes = len(sleep_df)

    # Boolean masks are computed once and reused for the whole-period and
    # daytime counts.
    is_sleep = sleep_df['SleepState'].isin(['Deep Sleep', 'Light Sleep'])
    is_awake = sleep_df['SleepState'] == 'Awake'
    is_missing = sleep_df['SleepState'] == '?'

    filled_sleep_data = int(is_sleep.sum())
    filled_awake_data = int(is_awake.sum())
    missing_data = int(is_missing.sum())

    # Soldiers without any rows get 100 % filled / 0 % missing; they are
    # excluded from the averages below.
    if total_data_minutes > 0:
        percent_filled = round(100 * (filled_sleep_data + filled_awake_data) / total_data_minutes, 2)
        percent_missing = round(100 * missing_data / total_data_minutes, 2)
    else:
        percent_filled = 100
        percent_missing = 0

    hour = sleep_df['Hour'].astype(int)
    is_day = (hour >= day_start_hour) & (hour < day_end_hour)
    day_minutes = int(is_day.sum())

    filled_sleep_during_day = int((is_sleep & is_day).sum())
    percent_day_filled_as_sleep = round(100 * filled_sleep_during_day / day_minutes, 2) if day_minutes > 0 else 0

    records.append([soldier_id, total_data_minutes, filled_sleep_data, filled_awake_data,
                    missing_data, percent_filled, percent_missing,
                    filled_sleep_during_day, percent_day_filled_as_sleep])

soldier_stats_preprocessing = pd.DataFrame(records, columns=columns).set_index('Soldier')

# Don't take into account soldiers with no data in the averages
no_zero_stats = soldier_stats_preprocessing[soldier_stats_preprocessing['Amount of data in minutes'] != 0]
average_row = no_zero_stats.mean().round(2)

# Append the averages as a row labelled 'average'. Assigning the numeric Series
# by label keeps every column numeric (no string is mixed into the values).
soldier_stats_preprocessing.loc['average'] = average_row

In [196]:
# Show the last rows of the table; the 'average' row is appended last, so it appears here.
soldier_stats_preprocessing.tail()

Unnamed: 0_level_0,Amount of data in minutes,Amount of filled sleep data,Amount of filled awake data,Amount of missing data,Percent of data filled,Percent of data missing,Amount of filled sleep during the day,Percent of day filled in as sleep
Soldier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fbfa150b-4c62-412d-9e84-ef06e8495fc5,64800.0,13423.0,492.0,50885.0,21.47,78.53,101.0,0.29
fd486705-4873-4001-95f5-6f5bbf0d620f,12960.0,2450.0,142.0,10368.0,20.0,80.0,91.0,1.3
fd949638-b68e-4e68-b660-f31133ed2a19,17280.0,2463.0,13.0,14804.0,14.33,85.67,0.0,0.0
ff28a7f9-ebf2-43de-a8fb-5cc213ec5dea,36000.0,6006.0,182.0,29812.0,17.19,82.81,977.0,5.01
average,78598.03,15034.03,610.66,62953.34,18.44,81.56,977.43,2.55


In [209]:
# Persist the pre-processing statistics table (path is relative to the working directory).
soldier_stats_preprocessing.to_pickle("soldier_stats_preprocessing.pkl")

## Post Processing Stats

In [204]:
# Per-soldier coverage statistics for the PROCESSED weekly sleep data.
# Produces `soldier_stats_postprocessing`: one row per soldier (indexed by
# 'Soldier') plus a final 'average' row.
# As used below, SleepState is coded 1 = sleep, 0 = awake, 2 = missing.
columns = [
    'Soldier', 
    'Amount of data in minutes', 
    'Amount of filled sleep data', 
    'Amount of filled awake data', 
    'Amount of missing data', 
    'Percent of data filled', 
    'Percent of data missing', 
    'Amount of filled sleep during the day', 
    'Percent of day filled in as sleep'
]

# A minute counts as "day" when day_start_hour <= hour < day_end_hour.
day_start_hour = 6
day_end_hour = 19

# Collect plain records and build the DataFrame once, instead of creating and
# concatenating one single-row DataFrame per soldier.
records = []
for soldier_id in user_sleep_readable.keys():
    weeks_by_key = user_sleep_readable[soldier_id]['weeks']
    weekly_frames = [weeks_by_key[w] for w in weeks_by_key.keys()]

    if len(weekly_frames) > 0:
        # Concatenate all weeks in one call (no repeated copying), then count
        # each timestamp only once. ignore_index gives the result a unique index.
        sleep_df = pd.concat(weekly_frames, ignore_index=True).drop_duplicates(subset=['datetime'])
    else:
        sleep_df = pd.DataFrame(columns=['SleepState', 'datetime'])

    total_data_minutes = len(sleep_df)

    # Boolean masks are computed once and reused for the whole-period and
    # daytime counts.
    is_sleep = sleep_df['SleepState'] == 1
    is_awake = sleep_df['SleepState'] == 0
    is_missing = sleep_df['SleepState'] == 2

    filled_sleep_data = int(is_sleep.sum())
    filled_awake_data = int(is_awake.sum())
    missing_data = int(is_missing.sum())

    if total_data_minutes > 0:
        percent_filled = round(100 * (filled_sleep_data + filled_awake_data) / total_data_minutes, 2)
        percent_missing = round(100 * missing_data / total_data_minutes, 2)

        hour = sleep_df['datetime'].dt.hour.astype(int)
        is_day = (hour >= day_start_hour) & (hour < day_end_hour)
        day_minutes = int(is_day.sum())
        filled_sleep_during_day = int((is_sleep & is_day).sum())
    else:
        # Soldiers without any rows get 100 % filled / 0 % missing; they are
        # excluded from the averages below. The `.dt` accessor is skipped here
        # because the empty placeholder frame has no datetime-typed column.
        percent_filled = 100
        percent_missing = 0
        day_minutes = 0
        filled_sleep_during_day = 0

    percent_day_filled_as_sleep = round(100 * filled_sleep_during_day / day_minutes, 2) if day_minutes > 0 else 0

    records.append([soldier_id, total_data_minutes, filled_sleep_data, filled_awake_data,
                    missing_data, percent_filled, percent_missing,
                    filled_sleep_during_day, percent_day_filled_as_sleep])

soldier_stats_postprocessing = pd.DataFrame(records, columns=columns).set_index('Soldier')

# Don't take into account soldiers with no data in the averages
no_zero_stats = soldier_stats_postprocessing[soldier_stats_postprocessing['Amount of data in minutes'] != 0]
average_row = no_zero_stats.mean().round(2)

# Append the averages as a row labelled 'average'. Assigning the numeric Series
# by label keeps every column numeric (no string is mixed into the values).
soldier_stats_postprocessing.loc['average'] = average_row

In [207]:
# Show the last rows of the table; the 'average' row is appended last, so it appears here.
soldier_stats_postprocessing.tail()

Unnamed: 0_level_0,Amount of data in minutes,Amount of filled sleep data,Amount of filled awake data,Amount of missing data,Percent of data filled,Percent of data missing,Amount of filled sleep during the day,Percent of day filled in as sleep
Soldier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fbfa150b-4c62-412d-9e84-ef06e8495fc5,64800.0,51492.0,11522.0,1786.0,97.24,2.76,26237.0,74.75
fd486705-4873-4001-95f5-6f5bbf0d620f,12960.0,4148.0,7995.0,817.0,93.7,6.3,751.0,10.7
fd949638-b68e-4e68-b660-f31133ed2a19,17280.0,9785.0,1908.0,5587.0,67.67,32.33,4728.0,50.51
ff28a7f9-ebf2-43de-a8fb-5cc213ec5dea,36000.0,21406.0,13307.0,1287.0,96.42,3.58,9437.0,48.39
average,78598.03,41789.06,35771.93,1037.05,95.11,4.89,17422.36,37.71


In [208]:
# Persist the post-processing statistics table (path is relative to the working directory).
soldier_stats_postprocessing.to_pickle("soldier_stats_postprocessing.pkl")

## Averages comparison 

In [197]:
# Pull the 'average' row out of each statistics table and label it at
# extraction time, so each becomes one named column of the comparison.
a = soldier_stats_preprocessing.loc['average'].rename('pre processing averages')
b = soldier_stats_postprocessing.loc['average'].rename('post processing averages')

# Stack the two labelled rows, then transpose: statistics become the rows and
# the pre/post averages become the two columns.
stacked_averages = pd.DataFrame([a, b])
df_combined = stacked_averages.T

In [198]:
# Display the pre- vs post-processing averages side by side.
df_combined

Unnamed: 0,pre processing averages,post processing averages
Amount of data in minutes,78598.03,78598.03
Amount of filled sleep data,15034.03,41789.06
Amount of filled awake data,610.66,35771.93
Amount of missing data,62953.34,1037.05
Percent of data filled,18.44,95.11
Percent of data missing,81.56,4.89
Amount of filled sleep during the day,977.43,17422.36
Percent of day filled in as sleep,2.55,37.71
