In [1]:
# ! pip install imbalanced-learn
# ! pip install --force-reinstall 'scikit-learn==0.19.2' 
# ! pip install pickle-mixin
# ! pip install --force-reinstall 'scikit-learn==0.19.2' 

In [2]:
import math
import json
import pickle
import numpy as np
import pandas as pd
import coremltools as ct
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from collections import defaultdict

from sklearn.tree import _tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [3]:
STAGE = {
    'In Bed': 0,
    'Awake': 1,
    'Asleep': 2,
    'REM': 3,
    'Core': 4,
    'Deep': 5,
    'Unknown': 6
}

In [32]:
def load_json(path: str) -> dict:
    with open(path, 'r') as file:
        return json.load(file)

def parse_time(time: str) -> datetime:
    return datetime.strptime(time, '%Y-%m-%d %H:%M:%S')

def to_min(cur_time: datetime, start_time: datetime) -> int:
    return int((cur_time - start_time).total_seconds()//60)

def process_sleep_data(sleep_data: dict) -> dict:
    stages_by_day = defaultdict(dict)

    for data in sleep_data:
        if data['stage'] in ['Awake', 'REM', 'Core', 'Deep']:
            start_time = parse_time(data['start_time'])
            end_time = parse_time(data['end_time'])
            day = end_time.strftime('%Y-%m-%d')

            if not day in stages_by_day:
                stages_by_day[day]['stages'] = list()
                
            stages_by_day[day]['stages'].append({
                'start_time': start_time,
                'end_time': end_time,
                'stage': STAGE[data['stage']]
            })

    for date, data in stages_by_day.items():
        stages_by_day[date]['stages'] = sorted(data['stages'], key=lambda x: x['start_time'])
        stages_by_day[date]['start_time'] = stages_by_day[date]['stages'][0]['start_time']
        stages_by_day[date]['end_time'] = stages_by_day[date]['stages'][-1]['end_time']
        stages_by_day[date]['day_of_week'] = stages_by_day[date]['start_time'].weekday()

    return stages_by_day

# def post_process(stage_data: dict) -> pd.DataFrame:
#     ml_data = list()
#     for date, data in stage_data.items():
#         for item in data['stages']:

#             start_time = to_min(item['start_time'], data['start_time'])
#             end_time = to_min(item['end_time'], data['start_time'])
            
            
#             for interval in range(start_time, end_time):
                
#                 time_of_day = (data['start_time'] + timedelta(minutes=interval)).time()
#                 time_in_min = time_of_day.hour*60 + time_of_day.minute
                
#                 ml_data.append((
#                     date,
#                     time_in_min,
#                     data['day_of_week'],
#                     interval,
#                     item['stage']
#                 ))
#     return pd.DataFrame(ml_data, columns=['date', 'start_time', 'day_of_week', 'interval', 'stage'])

def post_process(stage_data: dict) -> pd.DataFrame:
    ml_data = list()
    for date, data in stage_data.items():
        
        day_end_time = data['end_time'].time()
        day_end_time_in_min = day_end_time.hour*60 + day_end_time.minute

        for item in data['stages']:

            start_interval = to_min(item['start_time'], data['start_time'])
            end_time = to_min(item['end_time'], item['start_time'])
            
            
#             for interval in range(start_time, end_time):
            for interval in range(0, end_time):
                
                time_of_day = (item['start_time'] + timedelta(minutes=interval)).time()
                time_in_min = time_of_day.hour*60 + time_of_day.minute
                
                
                
                ml_data.append((
                    time_in_min,
                    day_end_time_in_min,
                    data['day_of_week'],
                    (start_interval + interval),
                    item['stage']
#                     [1 if i == item['stage'] else 0 for i in range(7)]
                ))
    return pd.DataFrame(ml_data, columns=['start_time', 'end_of_sleep', 'day_of_week', 'interval', 'stage'])

In [33]:
data = load_json('./sleepData.json')['sleep_data']

stage_data = process_sleep_data(data)
ml_data = post_process(stage_data)

In [34]:
print(ml_data.head())
ml_data.info()

   start_time  end_of_sleep  day_of_week  interval  stage
0         113           649            1         0      4
1         114           649            1         1      4
2         115           649            1         2      4
3         116           649            1         3      4
4         117           649            1         4      4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192297 entries, 0 to 192296
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   start_time    192297 non-null  int64
 1   end_of_sleep  192297 non-null  int64
 2   day_of_week   192297 non-null  int64
 3   interval      192297 non-null  int64
 4   stage         192297 non-null  int64
dtypes: int64(5)
memory usage: 7.3 MB


In [35]:
# X = np.array(ml_data[['start_time', 'interval', 'day_of_week']].values)
# Y = np.array(ml_data['stage'].values)

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
ml_data.iloc[0]

start_time      113
end_of_sleep    649
day_of_week       1
interval          0
stage             4
Name: 0, dtype: int64

In [36]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization, Dropout
import numpy as np

X = ml_data[['start_time', 'end_of_sleep', 'day_of_week', 'interval']].values.astype('int32')
Y = np.vstack(ml_data['stage'].values).astype('int32')

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Reshape data for LSTM
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

print(X.shape, X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Model definition
model = Sequential([
    LSTM(256, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])),
    BatchNormalization(),
    Dropout(0.3),
    Dense(7, activation='softmax')  # Output layer with 7 neurons for 7 classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, min_lr=1e-7)

# Model training with early stopping
history = model.fit(X_train, Y_train, epochs=5, batch_size=256, validation_split=0.2, callbacks=[early_stopping, reduce_lr])

(192297, 4) (153837, 4, 1) (38460, 4, 1) (153837, 1) (38460, 1)
Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [39]:
loss = model.evaluate(X_test, Y_test)
print("Test Loss:", loss)
model.predict(X_test)

Test Loss: [0.9240830540657043, 0.5881955027580261]


array([[5.01867326e-04, 2.91264746e-02, 3.44320986e-04, ...,
        6.18578255e-01, 3.00282314e-02, 5.75222017e-04],
       [5.87659364e-04, 2.32168697e-02, 3.84483777e-04, ...,
        6.24584615e-01, 4.80643138e-02, 5.86389680e-04],
       [6.58627367e-04, 1.65541489e-02, 4.41503100e-04, ...,
        5.88640511e-01, 1.01210013e-01, 6.81311765e-04],
       ...,
       [6.12482312e-04, 2.28086840e-02, 3.99608834e-04, ...,
        6.17098927e-01, 5.44496067e-02, 6.20560779e-04],
       [5.40389912e-04, 2.63481718e-02, 3.57064535e-04, ...,
        6.22748017e-01, 3.60679850e-02, 5.75152924e-04],
       [6.79741905e-04, 2.00957600e-02, 4.13071859e-04, ...,
        5.94866872e-01, 7.51293674e-02, 6.76346768e-04]])

In [46]:
import random
import tensorflow as tf

r = random.randint(0, X_test.shape[0]-1)
temp = X_test[r:r+1]

t = ml_data[ (ml_data['start_time'] == int(temp[0][0][0])) & (ml_data['end_of_sleep'] == int(temp[0][1][0]))].iloc[0]
print(t)

max_idx = 50

truev = []
for i in range(max_idx):
    
    z = ml_data[ (ml_data['start_time'] == int(temp[0][0][0]+i)) & (ml_data['end_of_sleep'] == int(temp[0][1][0]))].iloc[0]
    truev.append(z['stage'])

# print(temp)
y = []
for i in range(50):
    predictions = model.predict(temp)
#     print(temp, predictions, tf.argmax(predictions, axis=1))
    y.append(tf.argmax(predictions, axis=1))
    temp[0][-1][0] += 1
    
# X_sample = [[i+r,i+r, dow] for i in range(11*60)]
# Y_sample = model.predict(X_sample)
# # print(Y_sample)

# [f'{x[0]/60}:{x[0]%60}'  for x in X_sample]

# # print(pd.Series(Y_sample).value_counts())
plt.plot(y, label='predicted')
plt.plot(truev, label='true')
plt.legend()

start_time       614
end_of_sleep    1016
day_of_week        0
interval          15
stage              4
Name: 74338, dtype: int64


IndexError: single positional indexer is out-of-bounds

In [11]:
# coreml_model = ct.convert(
#     model,
#     inputs = [ct.TensorType(name="lstm_input", shape=(1,3,1))]
# )
# coreml_model.save("LSTMSleepStage.mlmodel")

In [12]:
# model.save('LSTMSleepStageModel')