# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from Sensor import Stress_Forecasting

# Time series plot

In [2]:
filepath = "https://raw.githubusercontent.com/italha-d/Stress-Predict-Dataset/main/Raw_data/"

In [3]:
# Function to read the dataset for each sensor
def wrangle(filepath, s, types):
  """Load one single-column sensor recording for one volunteer.

  Parameters
  ----------
  filepath : str
    Base location (the GitHub raw URL used in this notebook) that contains
    the "Sxx" folders; must end with "/".
  s : int
    Volunteer number (1 to 35); zero-padded to two digits for the folder name.
  types : str
    Sensor file stem such as "BVP", "HR", "EDA" or "TEMP"; also used as the
    name of the single column.

  Returns
  -------
  pandas.DataFrame
    One column named `types`. The first two rows of the file (initial time
    and sample rate) are removed; the remaining rows keep their original
    index labels, so the index starts at 2.
  """
  # Load data from Github
  x_path = str(filepath + "S" + str(s).zfill(2) + "/" + types + ".csv")
  x = pd.read_csv(x_path, names=[types])

  # Drop the first two rows (initial time and sample rate) from the dataset.
  # They used to be read into unused locals through `x.iloc[0][0]`, i.e. an
  # integer key on a Series labelled by column name, which pandas deprecates;
  # that lookup is removed. `drop` without inplace returns a new frame.
  return x.drop(index=[0, 1])

In [4]:
# Function to plot time series data
def plottwrangle(types, s):
  """Draw one sensor recording of one volunteer as a line chart.

  The recording is loaded through `wrangle` with the notebook-level
  `filepath`, and the first (only) column is plotted against its row index.

  Parameters
  ----------
  types : str
    Sensor name handed to `wrangle` (e.g. "BVP", "HR"); also shown in the
    y-axis label.
  s : int
    Volunteer number (1 to 35); shown in the title.
  """
  ax = wrangle(filepath, s, types).iloc[:, 0].plot()
  ax.set_xlabel("Temporal range")
  ax.set_ylabel("The changes in the " + types)
  ax.set_title("Person " + str(s))
  plt.show()

In [5]:
# Load the BVP recording of person 1 into a DataFrame
BVP_P1 = wrangle(filepath=filepath, s=1, types="BVP")

# Line chart of the BVP recording of person 1
# (plottwrangle loads the file itself; BVP_P1 is not passed to it)
plottwrangle(types="BVP", s=1)

In [6]:
# Load the HR recording of person 1 into a DataFrame
HR_P1 = wrangle(filepath=filepath, s=1, types="HR")

# Line chart of the HR recording of person 1
# (plottwrangle loads the file itself; HR_P1 is not passed to it)
plottwrangle(types="HR", s=1)

In [7]:
# Load the EDA recording of person 1 into a DataFrame
EDA_P1 = wrangle(filepath=filepath, s=1, types="EDA")

# Line chart of the EDA recording of person 1
# (plottwrangle loads the file itself; EDA_P1 is not passed to it)
plottwrangle(types="EDA", s=1)

In [8]:
# Load the TEMP recording of person 1 into a DataFrame
TEMP_P1 = wrangle(filepath=filepath, s=1, types="TEMP")

# Line chart of the TEMP recording of person 1
# (plottwrangle loads the file itself; TEMP_P1 is not passed to it)
plottwrangle(types="TEMP", s=1)

In [9]:
# Read dataset of ACC for person 1
# ACC.csv has three columns (x, y, z), so it is read directly instead of
# through wrangle(). The URL is built from the notebook-level `filepath`
# rather than repeating the base URL as a second literal.
ACC_1 = pd.read_csv(filepath + "S01/ACC.csv",
                 names=["ACC(x)","ACC(y)","ACC(z)"])
# Skip the first two rows, as wrangle() does for the single-column files
ACC_1 = ACC_1.iloc[2:]

In [10]:
# Line chart of the ACC(x) column for person 1
ax = ACC_1["ACC(x)"].plot()
ax.set_xlabel("Temporal range")
ax.set_ylabel("The changes in the ACC(x)")
ax.set_title("Person 1")
plt.show()

In [11]:
# Line chart of the ACC(y) column for person 1
ax = ACC_1["ACC(y)"].plot()
ax.set_xlabel("Temporal range")
ax.set_ylabel("The changes in the ACC(y)")
ax.set_title("Person 1")
plt.show()

In [12]:
# Line chart of the ACC(z) column for person 1
ax = ACC_1["ACC(z)"].plot()
ax.set_xlabel("Temporal range")
ax.set_ylabel("The changes in the ACC(z)")
ax.set_title("Person 1")
plt.show()

# Timestamp tags exploration

Comparing the research paper with the "Time_logs" file in Processed_data shows when each stress-inducing task took place. For each person, the tasks and the corresponding row indexes in the "tags" file are:

**Person 1** 
1. Stroop Test: between 09:29 and 09:34 (Index: 0 & 1)
2. Interview: between 09:44 and 09:54 (Index: 3 & 4)
3. Hyperventilation: between 10:00 and 10:02 (Index: 5 & 6)

**Person 2** 
1. Stroop Test: between 10:03 and 10:08 (Index: 0 & 1)
2. Interview: between 10:13 and 10:24 (Index: 2 & 3)
3. Hyperventilation: between 10:29 and 10:31 (Index: 4 & 5)

**Person 3** 
1. Stroop Test: between 11:04 and 11:09 (Index: 0 & 1)
2. Interview: between 11:14 and 11:23 (Index: 2 & 3)
3. Hyperventilation: between 11:28 and 11:30 (Index: 4 & 5)

**Person 6** The start and end times of the tasks in the "tags" file are wrong, so no indexes are listed. 
1. Stroop Test: between 09:55 and 10:01 
2. Interview: between 10:07 and 10:18 
3. Hyperventilation: between 10:23 and 10:25 

**Person 7** 
1. Stroop Test: between 10:43 and 10:48 (Index: 0 & 1)
2. Interview: between 10:53 and 11:03 (Index: 2 & 3)
3. Hyperventilation: between 11:09 and 11:11 (Index: 4 & 5)

# Conclusion

For ***Person 2 to Person 5*** and ***Person 7 to Person 35***, the indexes in the "timestamp tags" files are the same:
1. Stroop Test - Index: 0 & 1
2. Interview - Index: 2 & 3
3. Hyperventilation - Index: 4 & 5


Only the indexes of ***Person 1*** are different:
1. Stroop Test - Index: 0 & 1
2. Interview - Index: 3 & 4
3. Hyperventilation - Index: 5 & 6


Finally, the timestamp data for ***Person 6*** contains many mistakes, so it will not be used to build the models later.


***The "Timestamp tags" exploration will be used in the defined function to label the whole dataset (1 for stress and 0 for no stress).***


In [13]:
# Function to read "tags.csv" dataset for 35 users:
filepath = "https://raw.githubusercontent.com/italha-d/Stress-Predict-Dataset/main/Raw_data/"
def time_tags(filepath, s):
  """Read the event-tag timestamps of one volunteer.

  Parameters
  ----------
  filepath : str
    Base location that contains the "Sxx" folders; must end with "/".
  s : int
    Volunteer number (1 to 35); zero-padded to two digits to build both the
    folder name and the "tags_Sxx.csv" file name.

  Returns
  -------
  pandas.DataFrame
    Column "Time_stamp" with the raw values from the file and column "Date"
    with the same values converted by `pd.to_datetime(..., unit="s")`.
    The converted values are timezone-naive and counted from the Unix epoch
    (UTC); no shift to a local time zone is applied.
  """
  folder = "S" + str(s).zfill(2)
  tags = pd.read_csv(filepath + folder + "/tags_" + folder + ".csv", names=["Time_stamp"])

  # Add the epoch-seconds values as datetimes next to the raw numbers
  return tags.assign(Date=pd.to_datetime(tags["Time_stamp"], unit="s"))

# Read "tags.csv" dataset for 35 users
# list_time[0] is a placeholder so that list_time[i] holds the tags of person i.
list_time = [0]
for i in range(1,36):
  print("\nPerson %d"%(i))
  # Fetch each file once and reuse the frame for both printing and storing
  # (previously time_tags was called twice per person, downloading the file twice).
  tags = time_tags(filepath, i)
  print(tags)
  list_time.append(tags)

In [14]:
m = Stress_Forecasting()

# NOTE(review): list_label is never filled in this cell — confirm whether it is still needed.
list_label = []
# NOTE(review): only persons 1-15 are summarised here, and that range includes
# Person 6, whose timestamp tags are described above as unreliable — confirm intent.
for i in range(1,16):
  print("\nPerson %d"%(i))
  # Build the person's labelled frame once and reuse it; the previous version
  # called m.wrangle three times per person.
  # Assumes m.wrangle returns the same frame on repeated calls — TODO confirm.
  labels = m.wrangle(filepath, i)["Label"]
  n_label_0 = labels.value_counts()[0]
  print("Label = 0 ", n_label_0)
  # Everything that is not label 0 is counted as label 1
  print("Label = 1 ", labels.count() - n_label_0)

In [15]:
# Bar chart of the label counts for person 1, showing the class imbalance
label_counts = m.wrangle(filepath, 1)["Label"].value_counts()
ax = label_counts.plot(kind='bar')
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.xticks(rotation=360);

# Model for each person

1. Three models are built including Gaussian Naive Bayes (NB), Logistic Regression (LM), and Linear Support Vector Classification (SVM).
2. Two preprocessing methods are applied: Standardization (to rescale the features) and Principal Component Analysis (PCA, to reduce the dimensionality).
3. Confusion matrix, precision, and recall are used to evaluate the prediction on validation and test set.

In [16]:
# New Stress_Forecasting instance; the modelling cells below call its
# create_data / get_results methods.
m = Stress_Forecasting()

# Filepath to use for functions
filepath = "https://raw.githubusercontent.com/italha-d/Stress-Predict-Dataset/main/Raw_data/"

# Read the dataset for person 25
person = 25
# Last expression of the cell, so the frame returned by m.wrangle is shown as the cell output
m.wrangle(filepath, person)

Unnamed: 0_level_0,ACC(x)_mean,ACC(y)_mean,ACC(z)_mean,ACC(x)_std,ACC(y)_std,ACC(z)_std,Sum_ACC_mean,Sum_ACC_std,BVP_mean,BVP_std,EDA_mean,EDA_std,EDA_max,EDA_min,HR,TEMP_mean,Label,Participant
Time (sec),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1.646061e+09,-14.65625,-3.00000,61.25000,1.752590,2.488684,1.459120,43.59375,5.700395,1.596406,9.933133,0.042937,0.002454,0.046142,0.041015,109.00,32.93,0,25
1.646061e+09,-14.09375,-5.09375,62.15625,1.328579,1.889860,2.315577,42.96875,5.534016,5.327656,133.680783,0.041015,0.000000,0.041015,0.041015,83.50,32.89,0,25
1.646061e+09,-14.21875,-5.00000,61.71875,1.453236,2.063508,4.327216,42.50000,7.843960,-13.246406,21.476374,0.041015,0.001047,0.042297,0.039733,88.33,32.93,0,25
1.646061e+09,-0.68750,5.15625,65.25000,6.301242,2.437733,7.466052,69.71875,16.205026,-12.830156,54.253558,0.041976,0.001227,0.043578,0.041015,87.00,32.93,0,25
1.646061e+09,-0.25000,5.00000,64.12500,0.567962,0.254000,0.336011,68.87500,1.157973,11.414062,261.698018,0.044539,0.000641,0.044860,0.043578,89.20,32.91,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.646064e+09,-23.37500,-2.09375,60.00000,0.553581,0.296145,0.000000,34.53125,0.849725,-0.793281,7.033645,0.883049,0.001227,0.884652,0.882088,70.22,29.21,0,25
1.646064e+09,-23.62500,-2.06250,60.03125,0.491869,0.245935,0.176777,34.34375,0.914581,2.743281,5.486074,0.881768,0.000640,0.882088,0.880807,70.03,29.21,0,25
1.646064e+09,-23.84375,-2.00000,60.00000,0.368902,0.000000,0.000000,34.15625,0.368902,0.482969,4.727653,0.881768,0.001227,0.883370,0.880807,69.85,29.19,0,25
1.646064e+09,-23.71875,-2.90625,61.25000,3.050483,1.837940,2.016065,34.62500,6.904487,-1.222344,6.000226,0.879845,0.000641,0.880807,0.879525,69.73,29.23,0,25


In [17]:
# NB model, "reduce_dim", lag_length = 2
m.create_data(2)
m.get_results("validation","NB","reduce_dim") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81
Accuracy: 0.81
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [18]:
# NB model, "reduce_dim", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","NB","reduce_dim") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81
Accuracy: 0.80
Recall: 0.13


In [19]:
# NB model, "standardize", lag_length = 2
m.create_data(2)
m.get_results("validation","NB","standardize") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Accuracy: 0.59
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [20]:
# NB model, "standardize", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","NB","standardize") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Accuracy: 0.79
Recall: 0.49


In [21]:
# LM model, "reduce_dim", lag_length = 2
m.create_data(2)
m.get_results("validation","LM","reduce_dim") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81
Accuracy: 0.48
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [22]:
# LM model, "reduce_dim", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","LM","reduce_dim") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81
Accuracy: 0.82
Recall: 0.29


In [23]:
# LM model, "standardize", lag_length = 2
m.create_data(2)
m.get_results("validation","LM","standardize") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Accuracy: 0.21
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [24]:
# LM model, "standardize", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","LM","standardize") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Accuracy: 0.55
Recall: 0.81


In [25]:
# SVM model, "reduce_dim", lag_length = 2
m.create_data(2)
m.get_results("validation","SVM","reduce_dim") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81
















Accuracy: 0.42
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [26]:
# SVM model, "reduce_dim", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","SVM","reduce_dim") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

Total explained variance: 0.81


























































Accuracy: 0.83
Recall: 0.21




In [27]:
# SVM model, "standardize", lag_length = 2
m.create_data(2)
m.get_results("validation","SVM","standardize") # Evaluate on Validation data
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
# NOTE(review): the saved output shows "Recall: nan" with a warning on
# true_positive/(true_positive+false_negative); that looks like a validation
# split without any Label = 1 rows — confirm before relying on this recall.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)















Accuracy: 0.23
Recall: nan


  self.recall = true_positive/(true_positive+false_negative)


In [28]:
# SVM model, "standardize", lag_length = 2 — scored on the test set
m.create_data(2)
m.get_results("test","SVM","standardize") # Evaluate on Test set
# m.accuracy / m.recall are read right after get_results — presumably set by that call.
print("Accuracy: %.2f"%m.accuracy)
print("Recall: %.2f"%m.recall)

























































Accuracy: 0.60
Recall: 0.86


