### Load libraries 




In [1]:
import pandas as pd 
import pickle
from IPython import display
import datetime
import numpy as np
from skimage import util
import seaborn as sns
import matplotlib.pyplot as plt

import os 
import sys

In [2]:
pd.__version__

'1.1.4'

In [3]:
#!pwd
print(os.getcwd())

D:\Documents\Education\University\4th Year\ELEC6200 - Group Design Project\gdp-wild-dogs


### Checking OS system 
- slight difference in file path for different OS systems:
    - linux: '/home/user/Data'
    - windows: 'C:\\user\\Data'

In [4]:
os_system = sys.platform
if os_system.startswith('win'):
    os_system = 'windows'
print('OS System: ', os_system)

OS System:  windows


### List Directory Tree

In [5]:
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

In [6]:
list_files(os.path.join(os.getcwd(), 'Data'))

Data/
    2020-11-05/
        201105_807d3a2a0fe8_xyz.pkl
        cut.pkl
        cutoff.txt
        dataFeatured.pkl
        labelled.pkl
        labels.csv
        raw.pkl
    2020-11-16/
        201116_807d3a2a0fe8_xyz.pkl
        cut.pkl
        cutoff.txt
        dataFeatured.pkl
        labelled.pkl
        labels.csv
        raw.pkl
    2020-11-23/
        201123_807d3a2a0fe8_xyz.pkl
        cut.pkl
        cutoff.txt
        dataFeatured.pkl
        labelled.pkl
        labels.csv
        raw.pkl
    2020-11-26/
        201126_807d3a2a0fe8_xyz.pkl
        cut.pkl
        cutoff.txt
        dataFeatured.pkl
        labelled.pkl
        labels.csv
        raw.pkl
    2020-11-27/
        201127_807d3a2a0fe8_xyz.pkl
        cut.pkl
        cutoff.txt
        dataFeatured.pkl
        labelled.pkl
        labels.csv
        raw.pkl
    2020-11-28/
        201128_807d3a2a0fe8_xyz.pkl
        raw.pkl
    2020-12-03/
        201203_807d3a2a0fe8.pkl
        201203_807d3a2a0fe8_xyz.pkl
  

 ### Functions to Trim, Load, and Label data

In [7]:
os_system = sys.platform
if os_system.startswith('win'):
    os_system = 'windows'
print('OS System: ', os_system)

if os_system == 'linux':
    filepath = './Data'
    pathstyle = '/'
elif os_system == 'windows':
    filepath = 'Data'
    pathstyle = '\\'

def trimData(folderPath):
        
    cutoffPresent = 0
    
    for filename in os.listdir(folderPath):
        if filename == "cutoff.txt":
            cutoffPresent = 1
            f = open('./'+folderPath+'/cutoff.txt', "r")
            start = f.readline()
            start = datetime.time(int(start.split(",")[0]), int(start.split(",")[1]), int(start.split(",")[2]))
            end = f.readline()
            end = datetime.time(int(end.split(",")[0]), int(end.split(",")[1]), int(end.split(",")[2]))
    for filename in os.listdir(folderPath):
        if cutoffPresent == 1:
            if filename == "raw.pkl":
                df = pd.read_pickle(subdir+pathstyle+ filename)
                df.reset_index(inplace=True)
                df = df.drop(['mag_x', 'mag_y', 'mag_z', 'pressure'], axis=1)
                display.display(df)
                df = df.loc[(df['dt'].dt.time > start) & (df['dt'].dt.time < end)]
                df.to_pickle("./"+folderPath+"/cut.pkl")
                cutoffPresent = 0
    return 0

def loadData(folderPath):
    for filename in os.listdir(folderPath):
        if filename == "cut.pkl":
            data = pd.read_pickle(folderPath+pathstyle+filename)
            data.reset_index(inplace=True)
            return data

def loadLabels(folderPath):
    for filename in os.listdir(folderPath):
        if filename == "labels.csv":
            labels = pd.read_csv(folderPath+pathstyle+filename)
            labels.reset_index(inplace=True)
            return labels

def labelData(data, labels):
    if('activity' in data.columns):
                    data.drop('activity', 1, inplace=True)

    labelsList = []
    intervalIndex = 0
    intervalBegin = pd.to_datetime(labels['start_time'][intervalIndex], utc=True)
    intervalEnd = pd.to_datetime(labels['end_time'][intervalIndex], utc=True)
    for i in range(0, len(data)):
        time = pd.to_datetime(data['dt'][i])
        while(time > intervalEnd):
            if(intervalIndex + 1 >= len(labels)):
                break
            intervalIndex += 1
            intervalBegin = pd.to_datetime(labels['start_time'][intervalIndex], utc=True)
            intervalEnd = pd.to_datetime(labels['end_time'][intervalIndex], utc=True)
        if(time > intervalEnd or time < intervalBegin):
            labelsList.append("None")
        else:
            labelsList.append(labels["activity"][intervalIndex])
    data["activity"] = labelsList

def loadAndLabel():
        
    labelledDfList = []

    for subdir, dirs, files in os.walk(filepath):
        if subdir != "Data":
            data = loadData(subdir)
            labels = loadLabels(subdir)
            if (data is not None) & (labels is not None):
                print("Labelled Data from {}".format((str(pd.to_datetime(data['dt'][0], utc=True))[0:10])))
                labelData(data, labels)
                labelledDfList.append(data)
                
    return labelledDfList

OS System:  windows


In [8]:
dfList = loadAndLabel()
print(len(dfList[0]))

Labelled Data from 2020-11-05
Labelled Data from 2020-11-16
Labelled Data from 2020-11-23
Labelled Data from 2020-11-26
Labelled Data from 2020-11-27
51603


In [9]:
query_variables = ["acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z"]
composite_variables = []
windowSize = 10 #Maximum Size of window on either side of point
dfIndex = 0;
show_progress=False
for df in dfList:
    print("Starting DF " + str(dfIndex))
    values = {}
    for variable in query_variables:
            values[variable + "_MEAN"] = []
            values[variable + "_VAR"] = []
    for i in range(0, len(df.index)):
        minIndex = 0
        maxIndex = len(df.index)
        if(i >= windowSize):
            minIndex = i - windowSize
        if(i <= len(df.index)-1-windowSize):
            maxIndex = i+windowSize
        slice = df.iloc[minIndex:maxIndex+1]
        for variable in query_variables:
            values[variable + "_MEAN"].append(slice[variable].mean())
            values[variable + "_VAR"].append(slice[variable].var())
        if(show_progress):
            display.clear_output()
            print("Processing Dataframe: " + str(dfIndex))
            print("Percentage Finished: " + str(float(i)*100/len(df.index)) + "%")
    for variable in query_variables:
            df[variable + "_MEAN"] = values[variable + "_MEAN"]
            df[variable + "_VAR"] = values[variable + "_VAR"]
            if(not (variable + "_MEAN") in composite_variables):
                composite_variables.append((variable + "_MEAN"))
            if(not (variable + "_VAR") in composite_variables):
                composite_variables.append((variable + "_VAR"))
    dfIndex += 1

Starting DF 0
Starting DF 1
Starting DF 2
Starting DF 3
Starting DF 4


In [10]:
def add_reciprocal_column(df, column1, column2, dfIndex, log=False, verbose=False):
    count = 0
    if(column1 in df.columns and column2 in df.columns):
        print("Forming reciprocal of " + column1 + "/" + column2)
        reciprocal_vals = []
        if((column1 + "/" + column2) in df.columns):
            df.drop(column1 + "/" + column2, 1, inplace=True)
        
        new_df = df
        for i in range(len(df)):
            x = df[column1][i]
            y = df[column2][i]
            if(x == 0):
                x = 0.00001
            if(y == 0):
                y = 0.00001
            
            result = x/y
            if(np.isnan(result) or np.isinf(result)):
                count += 1
                result = 0.00001
            if(log):
                result = np.log10(np.abs(result))
            reciprocal_vals.append(result)
            
            if(verbose):
                display.clear_output()
                print("Forming reciprocal of " + column1 + "/" + column2 + " for Dataframe " + str(dfIndex))
                print("Percentage Finished: " + str(float(i)*100/len(df)) + "%")
                print("NaN or Inf results: " + str(count))
        new_df[column1 + "/" + column2] = reciprocal_vals
        
        return new_df
    else:
        print("Columns not present in Dataframe. Cannot form Reciprocal!")

In [11]:
for i in range(0,len(dfList)):
    dfList[i] = add_reciprocal_column(dfList[i], "acc_x", "acc_y", i, log=True)
for i in range(0,len(dfList)):
    dfList[i] = add_reciprocal_column(dfList[i], "acc_x", "acc_z", i, log=True)
for i in range(0,len(dfList)):
    dfList[i] = add_reciprocal_column(dfList[i], "acc_y", "acc_z", i, log=True)

Forming reciprocal of acc_x/acc_y
Forming reciprocal of acc_x/acc_y
Forming reciprocal of acc_x/acc_y
Forming reciprocal of acc_x/acc_y
Forming reciprocal of acc_x/acc_y
Forming reciprocal of acc_x/acc_z
Forming reciprocal of acc_x/acc_z
Forming reciprocal of acc_x/acc_z
Forming reciprocal of acc_x/acc_z
Forming reciprocal of acc_x/acc_z
Forming reciprocal of acc_y/acc_z
Forming reciprocal of acc_y/acc_z
Forming reciprocal of acc_y/acc_z
Forming reciprocal of acc_y/acc_z
Forming reciprocal of acc_y/acc_z


In [12]:
def calculate_fourier(column, df, rate, label, use_label=False, M = 64, freq_limit=-1, show=False):
    if(column in df.columns):
        data = np.array(df[column])
        if(use_label):
            data = np.array(df.loc[df['activity'] == label][column])
        N = data.shape[0]
        if(N <= M):
            if(use_label):
                print("Insufficient Data for activity: " + label)
            else:
                print("Insufficient Complete Data")
            return []
        L = N / rate
        slices = util.view_as_windows(data, window_shape=(M,), step=1)
        slices = slices * np.hanning(M + 1)[:-1]
        slices = slices.T
        spectrum = np.fft.fft(slices, axis=0)[:M // 2 + 1:-1]
        spectrum = np.abs(spectrum)
        

        S = np.abs(spectrum)
        #S = 20 * np.log10(S / np.max(S))
        S = S / np.max(S)

        if(show):
            f, ax = plt.subplots(figsize=(20, 10))
            if(freq_limit != -1):
                ax.imshow(S[0:freq_limit], origin='lower', cmap='viridis', extent=(0, L, 0, freq_limit))
            else:
                ax.imshow(S, origin='lower', cmap='viridis', extent=(0, L, 0, np.max(np.fft.fftfreq(M, d=1/rate))))
            ax.axis('tight')
            y_label = 'Frequency [Hz] of value: '
            y_label = y_label + column
            ax.set_ylabel(y_label)
            ax.set_xlabel(label + ' Time [s]');
        #S has first index referencing frequency in Hz, second index is window index
        #print(L)
        return S
    
def add_fourier_variables(variable_frequencies, labelledDFList, window_size):
    fourier_variables = []
    for df in labelledDFList:
        for variable in variable_frequencies:
            frequency_indices = variable_frequencies[variable]
            S = calculate_fourier(variable, df, 50, '', use_label=False, M=window_size, show=False)
            for frequency_index in frequency_indices:
                new_column = []
                new_column.extend(np.zeros(window_size-1))
                new_column.extend(S[frequency_index])
                print(variable + " at frequency index: " + str(frequency_index))
                new_var_name = variable + "_freq_" + str(frequency_index)
                if(new_var_name in df.columns):
                    df.drop(new_var_name, 1, inplace=True)
                df[new_var_name] = new_column
                if(not new_var_name in fourier_variables):
                    fourier_variables.append(new_var_name)
    return fourier_variables

In [13]:
from imblearn.under_sampling import RandomUnderSampler
def split_test_train(data, training_variables, excluded_activities=[], test_ratio=0.25, perform_one_hot=False, undersample=True, oversample=False):
    excluded_activities.append('None')
    new_data = data[~data['activity'].isin(excluded_activities)]
    activities = new_data['activity'].unique()
    print("Activity Data Count:")
    print(new_data['activity'].value_counts())
    print()
    if(undersample):
        print("Undersampling to: " + str(np.min(new_data['activity'].value_counts())) + " data points per activity" )
    elif(oversample):
        print("Oversampling to: " + str(np.max(new_data['activity'].value_counts())) + " data points per activity" )
    independent_data = new_data[training_variables]
    
    one_hot=new_data['activity'].to_numpy()
    print(one_hot)
    if(perform_one_hot):
        one_hot = pd.get_dummies(new_data['activity'])
        if(undersample):
            rus = RandomUnderSampler()
            X_resampled, y_resampled = rus.fit_resample(independent_data.to_numpy(), one_hot.to_numpy())
            #Returns X_train,X_test,y_train,y_test, independent_data.columns, one_hot.columns
            return (train_test_split(X_resampled,y_resampled,test_size=test_ratio), independent_data.columns, one_hot.columns)
        elif(oversample):
            ros = RandomOverSampler()
            X_resampled, y_resampled = ros.fit_resample(independent_data.to_numpy(), one_hot)
            #Returns X_train,X_test,y_train,y_test, independent_data.columns, one_hot.columns
            return (train_test_split(X_resampled,y_resampled,test_size=test_ratio), independent_data.columns, one_hot.columns)
        else:
            return (train_test_split(independent_data.to_numpy(),one_hot.to_numpy(),test_size=test_ratio), independent_data.columns, one_hot.columns)
    else:
        one_hot=new_data['activity'].to_numpy()
        if(undersample):
            rus = RandomUnderSampler()
            X_resampled, y_resampled = rus.fit_resample(independent_data.to_numpy(), one_hot)
            #Returns X_train,X_test,y_train,y_test, independent_data.columns, one_hot.columns
            return (train_test_split(X_resampled,y_resampled,test_size=test_ratio), independent_data.columns, [])
        elif(oversample):
            ros = RandomOverSampler()
            X_resampled, y_resampled = ros.fit_resample(independent_data.to_numpy(), one_hot)
            #Returns X_train,X_test,y_train,y_test, independent_data.columns, one_hot.columns
            return (train_test_split(X_resampled,y_resampled,test_size=test_ratio), independent_data.columns, [])
        else:
            return (train_test_split(independent_data.to_numpy(),one_hot,test_size=test_ratio), independent_data.columns, [])

In [14]:
useful_frequencies = {'acc_x': [0, 2, 3, 4, 5, 7], 'acc_y': [0, 4, 5], 'acc_z': [0, 2, 3, 4, 5], 'gyro_x': [5], 'gyro_y': [3, 4], 'acc_x/acc_y': [0, 2, 3, 4], 'acc_x/acc_z': [5], 'acc_y/acc_z': [0, 2, 3]}
fourier_vars = add_fourier_variables(useful_frequencies, dfList, 80)

acc_x at frequency index: 0
acc_x at frequency index: 2
acc_x at frequency index: 3
acc_x at frequency index: 4
acc_x at frequency index: 5
acc_x at frequency index: 7
acc_y at frequency index: 0
acc_y at frequency index: 4
acc_y at frequency index: 5
acc_z at frequency index: 0
acc_z at frequency index: 2
acc_z at frequency index: 3
acc_z at frequency index: 4
acc_z at frequency index: 5
gyro_x at frequency index: 5
gyro_y at frequency index: 3
gyro_y at frequency index: 4
acc_x/acc_y at frequency index: 0
acc_x/acc_y at frequency index: 2
acc_x/acc_y at frequency index: 3
acc_x/acc_y at frequency index: 4
acc_x/acc_z at frequency index: 5
acc_y/acc_z at frequency index: 0
acc_y/acc_z at frequency index: 2
acc_y/acc_z at frequency index: 3
acc_x at frequency index: 0
acc_x at frequency index: 2
acc_x at frequency index: 3
acc_x at frequency index: 4
acc_x at frequency index: 5
acc_x at frequency index: 7
acc_y at frequency index: 0
acc_y at frequency index: 4
acc_y at frequency index:

In [15]:
from sklearn.model_selection import train_test_split


useful_values = ["acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z", "acc_x/acc_y", "acc_x/acc_z", "acc_y/acc_z"] + composite_variables + fourier_vars
activities_to_exclude = ['barking', 'jumping', 'None']#, 'eating', 'jumping', 'playing', 'standing']
dfconcat = pd.concat(dfList)
dfconcat = dfconcat[~dfconcat['activity'].isin(activities_to_exclude)]
activities = dfconcat['activity'].unique()


independent_data = dfconcat[useful_values].to_numpy()
labels = dfconcat['activity'].to_numpy()
    

train = pd.concat(dfList[0:3])
train = train[~train['activity'].isin(activities_to_exclude)]
trainx = train[useful_values].to_numpy()
trainy = train['activity'].to_numpy()

test_temp = dfList[4]
test_temp = test_temp[~test_temp['activity'].isin(activities_to_exclude)]
test_tempx = test_temp[useful_values].to_numpy()
test_tempy = test_temp['activity'].to_numpy()

pseudo_trainx, testx, pseudo_trainy, testy = train_test_split(test_tempx, test_tempy, test_size=0.25)

In [16]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#trees_classifier = ExtraTreesClassifier(n_estimators=100, criterion='entropy')
#trees_classifier.fit(trainx,trainy)
#predictions = trees_classifier.predict(pseudo_trainx)

adaboost_classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10), n_estimators = 100)
adaboost_classifier.fit(trainx,trainy)
predictions = adaboost_classifier.predict(pseudo_trainx)

correct = 0

for x in range(0, len(predictions)):
    if predictions[x] == pseudo_trainy[x]:
        correct += 1
        
print(correct/len(predictions)*100)

74.46754692163708


In [17]:
train2x = np.append(trainx, pseudo_trainx, axis=0)
train2y = np.append(trainy, predictions)

In [18]:
adaboost_classifier.fit(train2x, train2y)
predictions2 = adaboost_classifier.predict(testx)

correct = 0

for x in range(0, len(predictions2)):
    if predictions2[x] == testy[x]:
        correct += 1
        
print(correct/len(predictions2)*100)

74.09318561238427
