In [1]:
import pandas as pd
import numpy as np
import os
#import altair as alt
from IPython.display import Image
import nbconvert
import time
from datetime import datetime

## Importing Activities Data

In [2]:
# Load the activities reference table from CSV, keeping the default integer index
dsActivities = pd.read_csv(
    'S1Activities.csv',
    index_col=None,
)

In [3]:
# Preview the first ten rows of the activities table as a sanity check on the import
dsActivities.head(n=10)

Unnamed: 0,Heading,Category,Subcategory,Code
0,Employment related,Employment work at home,Work at home,1
1,Employment related,Travel employment,Going out to work,5
2,Personal needs,Eating,Eating,10
3,Personal needs,Personal hygiene,Toileting,15
4,Personal needs,Personal hygiene,Bathing,20
5,Personal needs,Personal hygiene,Grooming,25
6,Personal needs,Personal hygiene,Dressing,30
7,Personal needs,Personal hygiene,Washing hands,35
8,Personal needs,Personal medical,Taking medication,40
9,Personal needs,Sleeping,Sleeping,45


## Importing Sensor Data

In [4]:
# Load the sensor table; the file is read without a header row, so the
# columns are labelled with integers
dsS1Sensors = pd.read_csv(
    'S1sensors.csv',
    header=None,
    index_col=None,
)

In [5]:
# Preview the first ten rows of the sensor table as a sanity check on the import
dsS1Sensors.head(n=10)

Unnamed: 0,0,1,2
0,100,Bathroom,Toilet Flush
1,101,Bathroom,Light switch
2,104,Foyer,Light switch
3,105,Kitchen,Light switch
4,106,Kitchen,Burner
5,107,Living room,Light switch
6,108,Bedroom,Light switch
7,109,Porch,Light switch
8,118,Kitchen,Burner
9,119,Kitchen,Coffee machine


## The Activities Data Set
The activities data set `S1activities_data.csv` will be imported, evaluated and cleaned. The goal of the pre-processing is
to restructure the dataset into a 'tidy' format, that is, one where the attributes are columns, the rows are instances, and each cell contains only one value. Given that the data is a time series, timestamps will be used as indexes. The data will also be cast into continuous 24-hour segments, with timestamps in the form `YYYY-MM-DD hh:mm:ss` (using the datetime data type).

We aim to perform the preprocessing in such a way that is
* minimally computationally intensive
* reproducible / traceable code
* reasonable checks (validation)

### Importing the Data
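The activities data was imported into an indexed dataframe containing only one column and 1475 rows, with each row holding all of its values as a single comma-separated string. This style of import had to be used because the number of comma-separated elements varies from row to row (as seen in figure X, above); passing a separator string that does not occur in the file (`sep = 'delimiter'`) makes pandas read each line as a single field.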

In [7]:
# Read every line of the file as one string in a single column. The
# separator 'delimiter' is a placeholder that is not expected to occur in the
# data, so no line is split into fields.
# A separator longer than one character is only supported by the python
# parser; naming the engine explicitly avoids the ParserWarning that pandas
# raises when it silently falls back from the C engine.
dsS1 = pd.read_csv(
    'S1activities_data.csv',
    sep = 'delimiter',
    header = None,
    engine = 'python',
)

  """Entry point for launching an IPython kernel.


In [8]:
# Preview the first ten rows; each row is expected to hold one comma-separated string
dsS1.head(n=10)

Unnamed: 0,0
0,"Bathing,4/1/2003,20:41:35,21:32:50"
1,"100,68,81,101,93,137,93,58,57,67,93,58,68,88,5..."
2,"Toilet Flush,Sink faucet - hot,Closet,Light sw..."
3,"20:51:52,20:51:58,20:53:36,20:53:49,20:53:52,2..."
4,"21:5:20,20:52:5,20:53:43,21:21:43,20:58:42,20:..."
5,"Toileting,4/1/2003,17:30:36,17:46:41"
6,10068
7,"Toilet Flush,Sink faucet - hot"
8,"17:39:37,17:39:46"
9,"18:10:57,17:39:52"


In [9]:
# Confirm the number of rows in the dataframe
len(dsS1)

1475

### Converting the Dataframe to an Array
As mentioned above, in order to work with these data, they need to be in a 'tidy' format [ref], that is, the attributes are columns, the rows are instances, and each cell contains only one value. 
* Note: `np.array(dsS1)` produces an array of arrays, that is, an array where each element is itself an array
* Within each group of 5 rows (indices 0 - 4), the strings held by the sub-arrays contain different numbers of comma-separated values

Table [ref], below, contains a summary of the data structure after the operation `np.array(dsS1)` is performed. In order to continue pre-processing, the array had to be flattened from a 2D structure to a 1D structure, using `flatten()`.

In [10]:
# Convert the single-column dataframe into a NumPy array (one sub-array per row)
a = np.array(dsS1)

In [11]:
# Show the first five rows of the 2D array
for row_idx in range(5):
    print(a[row_idx])

['Bathing,4/1/2003,20:41:35,21:32:50']
['100,68,81,101,93,137,93,58,57,67,93,58,68,88,57,67,100,68,67,76']
['Toilet Flush,Sink faucet - hot,Closet,Light switch,Shower faucet,Freezer,Shower faucet,Medicine cabinet,Medicine cabinet,Cabinet,Shower faucet,Medicine cabinet,Sink faucet - hot,Sink faucet - cold,Medicine cabinet,Cabinet,Toilet Flush,Sink faucet - hot,Cabinet,Lamp']
['20:51:52,20:51:58,20:53:36,20:53:49,20:53:52,20:58:22,20:58:43,21:5:23,21:5:46,21:5:47,21:18:34,21:18:55,21:19:41,21:20:4,21:20:38,21:20:39,21:21:13,21:21:16,21:21:37,21:22:8']
['21:5:20,20:52:5,20:53:43,21:21:43,20:58:42,20:58:32,21:6:9,21:5:45,21:18:55,21:5:49,21:18:35,21:20:37,21:20:5,21:20:34,21:21:41,21:20:42,23:10:23,21:21:23,21:21:38,23:11:8']


* The dataframe has been converted into a 2D array, where each element is an array with 1 element
* Each of these one-element arrays holds a single comma-separated string

Index (a[i]) | Type                           | Description            | Desired Type
----------   | ----------------               | ---------------------- | ---------------------- 
0            | An array of comma-sep strings  | Activity information, date, start time, end time | ...
1            | An array of comma-sep strings  | Sub-activity reference value    | Levels
2            | An array of comma-sep strings  | Sub-activity descriptive value  | Levels
3            | An array of comma-sep strings  | Sub-activity start time   | Datetime including the date extracted from the index 3 rows above
4            | An array of comma-sep strings  | Sub-activity end time     | Datetime including the date extracted from the index 4 rows above

In [12]:
# Flatten the 2D array to a 1D array, so that a[i] is the row's string itself
# rather than a one-element array wrapping it
a = a.flatten()

In [13]:
# Show the first five rows of the flattened array
for row_idx in range(5):
    print(a[row_idx])

Bathing,4/1/2003,20:41:35,21:32:50
100,68,81,101,93,137,93,58,57,67,93,58,68,88,57,67,100,68,67,76
Toilet Flush,Sink faucet - hot,Closet,Light switch,Shower faucet,Freezer,Shower faucet,Medicine cabinet,Medicine cabinet,Cabinet,Shower faucet,Medicine cabinet,Sink faucet - hot,Sink faucet - cold,Medicine cabinet,Cabinet,Toilet Flush,Sink faucet - hot,Cabinet,Lamp
20:51:52,20:51:58,20:53:36,20:53:49,20:53:52,20:58:22,20:58:43,21:5:23,21:5:46,21:5:47,21:18:34,21:18:55,21:19:41,21:20:4,21:20:38,21:20:39,21:21:13,21:21:16,21:21:37,21:22:8
21:5:20,20:52:5,20:53:43,21:21:43,20:58:42,20:58:32,21:6:9,21:5:45,21:18:55,21:5:49,21:18:35,21:20:37,21:20:5,21:20:34,21:21:41,21:20:42,23:10:23,21:21:23,21:21:38,23:11:8


In [14]:
# Each row is now a plain string, so a second index selects a single character
a[0][0]

'B'

In [15]:
# Print the first ten characters of the first row, one per line
for char_idx in range(10):
    print(a[0][char_idx])

B
a
t
h
i
n
g
,
4
/


* The result is an array of strings, instead of an array of arrays

Analysis of the original dataset, and exploration during pre-processing to this point, shows us that the original dataset follows a structure in which each group of 5 rows forms one discrete set of data. In this structure,
* Row 1 = Activity, Date, Start Time, End Time
* Row 2 = Sub-activity (an action that can be executed as part of performing the activity) code-values
* Row 3 = Sub-activity descriptive values
* Row 4 = Sub-activity start time
* Row 5 = Sub-activity end time
In order to access the values programmatically, we will now turn the 1D array list back into a 2D array list, where each element array[i] contains the 5 rows of information, as described above. 

#### Chunking the flattened array
Chunking the 1D array back into a 2D array allows us to more easily use loops to extract the required values.

In [16]:
from more_itertools import chunked

# Regroup the flat array into a list of records of five consecutive rows each.
# NOTE(review): this rebinds `a`, so running the cell twice chunks the
# already-chunked list again; restart the kernel before re-running.
a = [record for record in chunked(a, 5)]

#### Looping on a[i]
* We will continue to perform sanity checks during the pre-processing
* Here we loop through the values a[i] from i = 0 to i = 4
* We can see that we have successfully structured the data such that each element a[i] is a list containing the 5 rows of one record

In [17]:
# Show the first five records (each a list of five row strings)
for record_idx in range(5):
    print(a[record_idx])

['Bathing,4/1/2003,20:41:35,21:32:50', '100,68,81,101,93,137,93,58,57,67,93,58,68,88,57,67,100,68,67,76', 'Toilet Flush,Sink faucet - hot,Closet,Light switch,Shower faucet,Freezer,Shower faucet,Medicine cabinet,Medicine cabinet,Cabinet,Shower faucet,Medicine cabinet,Sink faucet - hot,Sink faucet - cold,Medicine cabinet,Cabinet,Toilet Flush,Sink faucet - hot,Cabinet,Lamp', '20:51:52,20:51:58,20:53:36,20:53:49,20:53:52,20:58:22,20:58:43,21:5:23,21:5:46,21:5:47,21:18:34,21:18:55,21:19:41,21:20:4,21:20:38,21:20:39,21:21:13,21:21:16,21:21:37,21:22:8', '21:5:20,20:52:5,20:53:43,21:21:43,20:58:42,20:58:32,21:6:9,21:5:45,21:18:55,21:5:49,21:18:35,21:20:37,21:20:5,21:20:34,21:21:41,21:20:42,23:10:23,21:21:23,21:21:38,23:11:8']
['Toileting,4/1/2003,17:30:36,17:46:41', '100,68', 'Toilet Flush,Sink faucet - hot', '17:39:37,17:39:46', '18:10:57,17:39:52']
['Toileting,4/1/2003,18:4:43,18:18:2', '68,107', 'Sink faucet - hot,Light switch', '18:11:2,18:12:28', '18:11:13,21:21:53']
['Toileting,4/1/2003,

#### Looping on a[0][i]
* This gives us access to each of the five rows of a single record (here the first record, a[0])

In [18]:
# Print the five rows that make up the first record
for row_idx in range(5):
    print(a[0][row_idx])

Bathing,4/1/2003,20:41:35,21:32:50
100,68,81,101,93,137,93,58,57,67,93,58,68,88,57,67,100,68,67,76
Toilet Flush,Sink faucet - hot,Closet,Light switch,Shower faucet,Freezer,Shower faucet,Medicine cabinet,Medicine cabinet,Cabinet,Shower faucet,Medicine cabinet,Sink faucet - hot,Sink faucet - cold,Medicine cabinet,Cabinet,Toilet Flush,Sink faucet - hot,Cabinet,Lamp
20:51:52,20:51:58,20:53:36,20:53:49,20:53:52,20:58:22,20:58:43,21:5:23,21:5:46,21:5:47,21:18:34,21:18:55,21:19:41,21:20:4,21:20:38,21:20:39,21:21:13,21:21:16,21:21:37,21:22:8
21:5:20,20:52:5,20:53:43,21:21:43,20:58:42,20:58:32,21:6:9,21:5:45,21:18:55,21:5:49,21:18:35,21:20:37,21:20:5,21:20:34,21:21:41,21:20:42,23:10:23,21:21:23,21:21:38,23:11:8


#### Looping on a[i][0]
* This gives us access to the first row of record i, which holds the activity, date, start time and end time

In [19]:
# Print the header row (index 0) of records 0, 5, 10, 15 and 20.
# NOTE(review): after chunking, a[i] is already one record, so the stride of 5
# samples every fifth record rather than the first five - confirm this is intended.
for record_idx in range(0, 25, 5):
    print(a[record_idx][0])

Bathing,4/1/2003,20:41:35,21:32:50
Preparing lunch,4/1/2003,11:21:17,11:38:22
Going out to work,4/1/2003,7:0:55,7:2:30
Preparing a snack,4/2/2003,17:6:21,17:12:1
Preparing breakfast,4/2/2003,7:21:12,8:10:40


#### Confirming the outer loop elements

In [20]:
## Length
# Collect the number of rows held by each record in `a`
dimLength = [len(record) for record in a]

In [21]:
# Number of records in `a` (one length entry was appended per record)
len(dimLength)

295

In [99]:
# Turn off pandas' chained-assignment check for the whole session.
# NOTE(review): this silences the warning rather than fixing its cause;
# prefer explicit .loc-based assignment and leave the check enabled.
pd.options.mode.chained_assignment = None

#new
https://codereview.stackexchange.com/questions/200277/find-the-repeated-elements-in-a-list
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
https://stackoverflow.com/questions/27263805/pandas-when-cell-contents-are-lists-create-a-row-for-each-element-in-the-list
https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
https://www.w3schools.com/python/python_for_loops.asp
https://stackoverflow.com/questions/2632677/python-integer-incrementing-with
https://stackoverflow.com/questions/41925808/combing-columns-after-transposing-columns-pandas-dataframes
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html
https://stackoverflow.com/questions/17315737/split-a-large-pandas-dataframe
https://stackoverflow.com/questions/54334674/how-to-use-pandas-to-pivot-a-table-while-changing-values-to-boolean
http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.to_pydatetime.html#pandas-datetimeindex-to-pydatetime
https://pandas.pydata.org/pandas-docs/stable/install.html
https://stackoverflow.com/questions/39597553/from-datetimeindex-to-list-of-times
https://stackoverflow.com/questions/32271474/datetime-index-in-python-dataframe
https://stackoverflow.com/questions/28133018/convert-pandas-series-to-datetime-in-a-dataframe
https://stackoverflow.com/questions/28990256/python-pandas-time-series-year-extraction?rq=1
https://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DatetimeIndex.strftime.html
https://www.dataquest.io/blog/settingwithcopywarning/
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html




In [179]:
# De-duplicate the index labels of `new`, keeping the first occurrence of each
# label in its original order (dict keys preserve insertion order)
mylist = list(dict.fromkeys(new.index))

In [182]:
# Number of distinct index labels
len(mylist)

830697