# CASAS Dataset [Unannotated]

### Activity Recognition in Pervasive Smart Home Environments

In [1]:
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas' SettingWithCopyWarning and deprecation notices raised by later cells.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Default size applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [10, 5]

In [2]:
# Where the event log lives; `dataDir + fileName` is also reused as the prefix
# of the feature file written at the end of the notebook.
dataDir = '../Data/twor.2009/'
# dataDir = '../Data/twor.summer.2009/'
fileName = "raw"

# The file has no header row: fields are separated by single spaces and are
# given these names in order.
columns = ["Date", "Time", "SensorID", "SensorValue", "ActivityLabel", "Status"]
raw = pd.read_csv(
    dataDir + fileName,
    sep=" ",
    names=columns,
    header=None,
)

In [3]:
# Preview the first rows of the freshly loaded event log.
raw.head()

Unnamed: 0,Date,Time,SensorID,SensorValue,ActivityLabel,Status
0,2009-02-01,01:03:57.032147,P001,1529.0,,
1,2009-02-01,01:04:27.362479,P001,1512.0,,
2,2009-02-01,01:05:37.044209,P001,1502.0,,
3,2009-02-01,01:08:37.066213,P001,1492.0,,
4,2009-02-01,01:09:21.757159,P001,1006.1,,


In [4]:
# Map raw sensor ids (as they appear in the log) to the short names used in
# the rest of the notebook. Insertion order matters: it later defines the
# feature-column order via `list(sensornames.values())`.
sensornames = {
    "A001": "AD1-A", "A002": "AD1-B", "A003": "AD1-C",
    "D003": "D03", "D005": "D05", "D007": "D07", "D008": "D08", "D009": "D09",
    "D010": "D10", "D012": "D12", "D014": "D14", "D015": "D15",
    "I012": "I03",
    "L004": "L04", "L006": "L06", "L009": "L09", "L010": "L10", "L011": "L11",
    "L012": "L12", "L013": "L13",
}
# M001..M051 are contiguous and map to M01..M51.
sensornames.update(("M%03d" % n, "M%02d" % n) for n in range(1, 52))


def change_sensorname(x):
    """Return the short name for row ``x``'s ``SensorID``.

    Ids that are not keys of ``sensornames`` yield the sentinel ``"unknow"``.
    """
    return sensornames.get(x["SensorID"], "unknow")

# Vectorized lookup of the short sensor name: ids missing from `sensornames`
# come back as NaN from `map` and are replaced by the "unknow" sentinel, which
# is the same value `change_sensorname` returns for them.
raw["NewSensorID"] = raw["SensorID"].map(sensornames).fillna("unknow")

In [6]:
# Keep only events from sensors that have a short name, and make the short
# name the canonical SensorID. `assign` returns a new frame, so the column is
# not written onto a slice of the original (no chained assignment).
raw = raw[raw["NewSensorID"] != "unknow"].assign(
    SensorID=lambda frame: frame["NewSensorID"]
)

In [7]:
# Preview the log after dropping unmapped sensors and renaming SensorID.
raw.head()

Unnamed: 0,Date,Time,SensorID,SensorValue,ActivityLabel,Status,NewSensorID
498,2009-02-01,07:21:12.003703,D12,OPEN,,,D12
501,2009-02-01,07:21:50.066499,D12,OPEN,,,D12
503,2009-02-01,07:21:56.849709,D12,OPEN,,,D12
506,2009-02-01,07:22:14.109659,D12,OPEN,,,D12
516,2009-02-01,07:24:06.069741,D12,OPEN,,,D12


In [8]:
# Distinct values of the ActivityLabel column, keeping only real strings
# (missing labels show up as float NaN and are masked out).
activity_labels = raw["ActivityLabel"].unique()
notna = [type(label) == str for label in activity_labels]
activity_labels = activity_labels[notna]
activity_labels

array([], dtype=float64)

In [9]:
# Activity label -> integer class id, numbered in order of appearance.
classmap = dict(zip(activity_labels, range(len(activity_labels))))

In [20]:
# sensors = np.sort(raw["SensorID"].unique())
# Fixed sensor order (taken from the name mapping) and the position of each
# sensor in that order.
sensors = [name for name in sensornames.values()]
sensormap = dict(zip(sensors, range(len(sensors))))

In [21]:
analog_sensors = ["AD1-A", "AD1-B", "AD1-C"]

# Min-max scale the readings of each analog sensor to [0, 1]; every other row
# keeps its original SensorValue. The result column is seeded once, before the
# loop, so that the scaling of each sensor is preserved rather than being
# overwritten by the next iteration.
raw["NormSensorValue"] = raw["SensorValue"].copy()
for s in analog_sensors:
    is_sensor = raw["SensorID"] == s
    readings = raw.loc[is_sensor, "SensorValue"].astype(float)

    print("Sensor: ", s)
    mn = readings.min()
    mx = readings.max()
    print("Min", mn, "Max", mx)

    span = mx - mn
    if span != 0:
        scaled = (readings - mn) / span
    else:
        # A sensor that always reports the same value has no range to scale
        # over; map it to 0 instead of producing NaN from 0/0.
        scaled = readings * 0.0

    # `.values` bypasses index alignment; the mask already selects the rows.
    raw.loc[is_sensor, "NormSensorValue"] = scaled.values
    

Sensor:  AD1-A
Max 1.88167 Min 3.37138
Sensor:  AD1-B
Max 0.0268363 Min 0.593915
Sensor:  AD1-C
Max 0.00757824 Min 0.632432


In [22]:
# Replace the original readings with the normalised column; from here on
# SensorValue holds the NormSensorValue contents.
raw["SensorValue"] = raw["NormSensorValue"]

In [23]:
# Add time stamp to data
data = raw
# data = raw[(raw["ActivityLabel"].notna()) | (raw["SensorValue"] != "OFF")]
data['DateTime'] =  data['Date'] + " "+data['Time']
data['Timestamp'] =  pd.to_datetime(data['DateTime'], format='%Y-%m-%d %H:%M:%S.%f')

In [34]:
# Split the event log into 100 consecutive, near-equal blocks of rows.
# The row positions are split (not the DataFrame itself), because
# np.array_split on a DataFrame goes through the deprecated
# DataFrame.swapaxes; the resulting groups contain the same rows.
df_grouped = [
    data.iloc[positions]
    for positions in np.array_split(np.arange(len(data)), 100)
]

In [38]:
import tqdm

def sensorvalue(row):
    """Convert one event row into a ``(sensor_id, numeric_value)`` pair.

    Parameters
    ----------
    row : mapping with ``"SensorID"`` and ``"SensorValue"`` entries.

    Returns
    -------
    tuple
        ``(sensorId, 1)`` for the "on" keywords, ``(sensorId, 0)`` for the
        "off" keywords, otherwise ``(sensorId, float(value))``.

    Raises
    ------
    Exception
        If the value is neither a known keyword nor a finite number.

    Notes
    -----
    The sensor id is printed when a numeric reading comes from a sensor that
    is not listed in the module-level ``analog_sensors``.
    """
    sensorId = row["SensorID"]
    reading = row["SensorValue"]

    if reading in ["O", "ON", "ONF", "OPEN", "PRESENT"]:
        return (sensorId, 1)
    if reading in ["OFFF", "OF", "OFF", "CLOSE", "ABSENT"]:
        return (sensorId, 0)

    # Parse with float() rather than a digit check so that every valid float
    # spelling is accepted, e.g. scientific notation such as "1e-05".
    try:
        value = float(reading)
    except (TypeError, ValueError):
        value = float("nan")

    # NaN / inf are not usable readings; str() keeps the message buildable
    # even when the offending value is not a string.
    if not np.isfinite(value):
        raise Exception('Unknown keyword: ' + str(reading))

    if sensorId not in analog_sensors:
        print(sensorId)
    return (sensorId, value)
            
def toTimeSeries(df, width):
    """Bin the events of ``df`` into fixed-width time windows per sensor.

    Parameters
    ----------
    df : DataFrame with a ``"Timestamp"`` column plus whatever
        ``sensorvalue`` reads from each row.
    width : window length in seconds.

    Returns
    -------
    (sensor_values, timestamp)
        ``sensor_values`` has one list per entry of the module-level
        ``sensors`` (row position given by ``sensormap``); element ``k`` is
        the sum of that sensor's values in the ``k``-th window, counted from
        the earliest window that contains an event. ``timestamp`` is the
        epoch time (seconds) of the first row of ``df``. For an empty ``df``
        the lists are empty and ``timestamp`` is ``None``.

    Raises
    ------
    KeyError
        If an event refers to a sensor that is not in ``sensormap``.
    """
    if len(df) == 0:
        return [[] for _ in sensors], None

    start = df["Timestamp"].iloc[0].timestamp()
    timestamp = start

    # Window number (relative to the first row) -> events in that window.
    eventsinwindow = {}
    for _, row in df.iterrows():
        interval = int((row["Timestamp"].timestamp() - start) // width)
        eventsinwindow.setdefault(interval, []).append(sensorvalue(row))

    first = min(eventsinwindow)
    last = max(eventsinwindow)
    n_windows = last - first + 1

    sensor_values = [[0] * n_windows for _ in sensors]
    for interval, events in eventsinwindow.items():
        # Rows earlier than the first row give negative window numbers;
        # offsetting by `first` keeps the list index non-negative so those
        # events land in their own slot instead of wrapping to the end.
        slot = interval - first
        for sensorId, value in events:
            sensor_values[sensormap[sensorId]][slot] += value

    return sensor_values, timestamp
    



In [50]:
import tqdm

def to_vector_sliding_window(ts, label, window_size=2, timestamp=None, normalise=True):
    """Sum each sensor series over every full sliding window of time slots.

    ``ts`` is a list of per-sensor value lists of equal length. One vector is
    produced per window start; it holds one total per entry of the
    module-level ``sensors`` followed by ``label``. With ``normalise`` the
    totals are divided by their sum, and windows whose sum is 0 are dropped.
    ``timestamp`` is accepted but not used.
    """
    n_slots = len(ts[0])
    windows = []

    for begin in range(n_slots):
        end = begin + window_size
        if end > n_slots:
            # Only complete windows are emitted.
            break

        totals = [0] * len(sensors)
        for step in range(begin, end):
            for row_idx, series in enumerate(ts):
                totals[row_idx] += series[step] * 1.0

        if normalise:
            grand_total = sum(totals)
            if grand_total == 0:
                continue
            totals = [x * 1.0 / grand_total for x in totals]

        windows.append(totals + [label])

    return windows

def to_vector_chunk(ts, label, chunk_size = 15, timestamp=None, normalise=True):
    """Sum each sensor series over consecutive, non-overlapping chunks.

    ``ts`` is a list of per-sensor value lists of equal length. Each chunk of
    ``chunk_size`` time slots (the last one may be shorter) yields one vector:
    one total per entry of the module-level ``sensors`` followed by the
    literal ``"unknown"``. Chunks whose totals sum to 0 are skipped; with
    ``normalise`` the totals are divided by that sum.

    ``label`` is not stored in the vectors: it is only used as a progress
    counter, printing a line when it is a multiple of 10. ``timestamp`` is
    accepted but not used.
    """
    n_slots = len(ts[0])
    chunks = []

    for begin in range(0, n_slots, chunk_size):
        end = min(begin + chunk_size, n_slots)

        totals = [0] * len(sensors)
        for step in range(begin, end):
            for row_idx, series in enumerate(ts):
                totals[row_idx] += series[step] * 1.0

        grand_total = sum(totals)
        if grand_total == 0:
            continue

        if normalise:
            totals = [x * 1.0 / grand_total for x in totals]

        chunks.append(totals + ["unknown"])

    if label % 10 == 0:
        print("DONE", label, "%")

    return chunks


# One feature column per sensor, plus the label column.
cols = list(sensors)
cols.append("ActivityLabel")

# For every group of rows: bin events into 60-second windows, then sum those
# windows in chunks of 15. The rows are collected in a plain list and the
# DataFrame is built once at the end, which avoids DataFrame.append (removed
# in pandas 2.0) and the repeated copying it caused inside the loop.
feature_rows = []
for ind, d in enumerate(df_grouped):
    ts, timestamp = toTimeSeries(d, 60)
    # `ind` is passed in the label slot, where to_vector_chunk uses it as a
    # progress counter.
    feature_rows.extend(
        to_vector_chunk(ts, ind, chunk_size=15, timestamp=timestamp, normalise=True)
    )

vectors = pd.DataFrame(feature_rows, columns=cols)

print("length:", len(vectors))
vectors.head()






DONE 0 %
DONE 10 %
DONE 20 %
DONE 30 %
DONE 40 %
DONE 50 %
DONE 60 %
DONE 70 %
DONE 80 %
DONE 90 %
length: 7775


Unnamed: 0,AD1-A,AD1-B,AD1-C,D03,D05,D07,D08,D09,D10,D12,...,M43,M44,M45,M46,M47,M48,M49,M50,M51,ActivityLabel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.916667,...,0.0,0.0,0.0,0.0,0.0625,0.020833,0.0,0.0,0.0,unknown
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.756098,...,0.0,0.0,0.0,0.0,0.121951,0.097561,0.0,0.0,0.0,unknown
2,0.0,0.0,0.001853,0.0,0.0,0.0,0.0,0.016776,0.0,0.268409,...,0.058715,0.058715,0.041939,0.016776,0.008388,0.008388,0.025163,0.050327,0.0,unknown
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310345,...,0.034483,0.034483,0.0,0.0,0.137931,0.310345,0.068966,0.034483,0.0,unknown
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5625,...,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,unknown


In [51]:
# Write the feature vectors next to the input file as "<fileName>.feat.ch15",
# without the DataFrame index.
vectors.to_csv(dataDir+fileName+".feat.ch15", index=False)