In [1]:
# Approach adapted from: https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

In [164]:
import modules
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


#### Load the data and convert it into sequences usable by the LSTM

In [172]:
# Load the per-day records and split each one into its class value (Y) and its
# activity sequence (X).
# NOTE(review): get_and_dayitise_data() presumably reads the CSV and returns one
# row per day shaped like [class_value, activity, activity, ...] — confirm in modules.
day_records = modules.get_and_dayitise_data()
Y = np.array([record[0] for record in day_records])
# The sequences differ in length, so keep them in a plain list: building an
# ndarray from ragged rows raises ValueError on NumPy >= 1.24.
X = [record[1:] for record in day_records]

no user_tags: 520
no user_tags: 532
no user_tags: 503
no user_tags: 503
no user_tags: 523
no user_tags: 544
no user_tags: 529
no user_tags: 661
no user_tags: 658
no user_tags: 664
no user_tags: 634
no user_tags: 507
no user_tags: 547
no user_tags: 501
no user_tags: 668
no user_tags: 662


In [173]:
# Keep only the days whose activity sequence has more than
# `lowest_num_activities` entries; Y is filtered first, while X is still unfiltered.
lowest_num_activities = 3
Y = np.array([label for label, seq in zip(Y, X) if len(seq) > lowest_num_activities])
X = [seq for seq in X if len(seq) > lowest_num_activities]

In [174]:
# Binarise the sleep values around their median.
# NOTE: despite its name, average_sleep holds the median, not the mean.
average_sleep = np.median(Y)
# One vectorised comparison against the original values:
#   0 = not a lot of sleep (below the median), 1 = lots of sleep (at or above it).
# Assigning the two classes in place one after the other would evaluate the
# second mask on values the first assignment had already rewritten, which
# collapses every label to 1 whenever the threshold is <= 0.
Y = (Y >= average_sleep).astype(Y.dtype)

# One-hot encode the binary label and convert the encoder output to a dense array.
onehot_encoder = OneHotEncoder(categories='auto')
Y = onehot_encoder.fit_transform(Y.reshape(-1,1)).toarray()


In [175]:
# Display the one-hot encoded label matrix (one row per sample, one column per class).
Y

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [176]:
# Display the threshold used to split the labels (computed with np.median above).
average_sleep

12.772058823529413

In [177]:
# Encode the activity labels as integers (the rows do not all have the same length).
le = LabelEncoder()
# Fit on the flattened sequences so the encoder sees every label that occurs in
# any row; fit() is sufficient because the transformed flat array is never used.
le.fit(np.hstack(X))
# Encode row by row and shift every code by 1 so that 0 stays free for padding.
# The rows are kept in a list because they are ragged: np.array() on ragged rows
# raises ValueError on NumPy >= 1.24.
X = [le.transform(x) + 1 for x in X]


In [178]:
# Bring every sequence to one common length: the median sequence length
# (kept under the name average_length).
average_length = int(np.median([len(seq) for seq in X]))
# Truncate first, then right-pad with zeroes: longer rows are cut down to
# average_length (their pad width is 0), shorter rows keep all their values and
# receive the missing trailing zeroes.
X = np.array([
    np.pad(seq[:average_length], (0, max(average_length - len(seq), 0)))
    for seq in X
])


In [None]:
# Sanity check: X should now be 2-D, (number of samples, average_length).
X.shape

In [179]:
# Hold out 10% of the samples as the test set; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=68,
)

#### Train The Model

In [183]:
# Size of each learned embedding vector.
embedding = 100
# Vocabulary size: every encoded label plus index 0, which is reserved for padding.
num_unique_labels = len(le.classes_)+1

# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
model = Sequential()
# NOTE(review): mask_zero is left at its default, so the padding index 0 is
# embedded like any other token — confirm this is intended.
model.add(Embedding(num_unique_labels, embedding, input_length=average_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Two output units, matching the two one-hot label columns.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 10, 100)           1200      
                                                                 
 spatial_dropout1d_6 (Spatia  (None, 10, 100)          0         
 lDropout1D)                                                     
                                                                 
 lstm_6 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 2)                 202       
                                                                 
Total params: 81,802
Trainable params: 81,802
Non-trainable params: 0
_________________________________________________________________
None


In [181]:
# Sanity check: (number of training samples, sequence length) fed to the model.
X_train.shape

(1339, 10)

In [184]:
# Training hyper-parameters.
epochs = 5
batch_size = 64
# Hold out 10% of the training rows for validation and stop early once val_loss
# has gone 3 epochs without improving by at least 0.0001.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001),
    ],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [185]:
# Score the trained model on the held-out test set; the first two returned values
# are the loss and the accuracy metric configured in compile().
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

Test set
  Loss: 0.382
  Accuracy: 0.839
