# Testing Functions for Calculating Activity Onset

### In this notebook:
- I will be testing necessary functions for calculating activity onset

In [21]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta

#Load the spider activity CSV and use its 'Date-Time' column as the index
df = pd.read_csv('Metazygia wittfeldae Monitor 1 Updated_LD.csv').set_index('Date-Time')

#Parse the index labels into datetime values
df.index = pd.to_datetime(df.index)

#Trim the data so it is more manageable to test on:
#drop every column from position 2 onward, then every row from position 4320 onward
df = df.drop(columns=list(df.columns[2:]))
df = df.drop(index=df.index[4320:])

df

Unnamed: 0_level_0,Light,Monitor 1 Spider 1
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-21 00:00:00,0,0
2017-04-21 00:01:00,0,0
2017-04-21 00:02:00,0,0
2017-04-21 00:03:00,0,0
2017-04-21 00:04:00,0,0
...,...,...
2017-04-23 23:55:00,0,0
2017-04-23 23:56:00,0,0
2017-04-23 23:57:00,0,0
2017-04-23 23:58:00,0,0


###### I tried hard to get the following function to work with arrays but was not able to, so I found much simpler code for flattening lists instead.

In [23]:
#This function adds a 'Total Minutes' column holding each row's minute of the day, counted from 1.
#For example, the 12:00 AM row gets 1 and the 12:25 AM row gets 26.
def add_minutes_column(df):
    """Add a 1-based minute-of-day column named 'Total Minutes' to ``df``.

    The value is computed from each row's own timestamp
    (``hour * 60 + minute + 1``), so the frame does not need to start at
    midnight or contain exactly 1440 rows per calendar date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    timestamps = df.index

    #00:00 -> 1, 00:01 -> 2, ..., 23:59 -> 1440
    minute_of_day = timestamps.hour * 60 + timestamps.minute + 1

    #store plain int64 values so the column does not depend on the index dtype
    df['Total Minutes'] = np.asarray(minute_of_day, dtype=np.int64)
    return df

In [24]:
#Testing add minutes to dataframe function
#(the call adds the 'Total Minutes' column to df itself and returns that same df, which is displayed below)
add_minutes_column(df)

Unnamed: 0_level_0,Light,Monitor 1 Spider 1,Total Minutes
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-21 00:00:00,0,0,1
2017-04-21 00:01:00,0,0,2
2017-04-21 00:02:00,0,0,3
2017-04-21 00:03:00,0,0,4
2017-04-21 00:04:00,0,0,5
...,...,...,...
2017-04-23 23:55:00,0,0,1436
2017-04-23 23:56:00,0,0,1437
2017-04-23 23:57:00,0,0,1438
2017-04-23 23:58:00,0,0,1439


### These functions are used throughout my code to separate the dataframe into multiple days

In [25]:
#This function is used to create a list of index labels to be used to separate the days
def find_light_switch_index(df, transition_time='07:00:00'):
    """Return the index labels of ``df`` whose clock time equals ``transition_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex``.
    transition_time : str, optional
        Clock time formatted as ``HH:MM:SS``. Defaults to ``'07:00:00'``.

    Returns
    -------
    list
        The matching index labels, in index order. Empty when no row has
        that clock time.
    """
    #format every index label as HH:MM:SS so it can be compared with the requested time
    clock_times = df.index.strftime("%H:%M:%S")
    return df.index[clock_times == transition_time].to_list()

In [26]:
#Test this function
#(with the test data it returns one Timestamp per day, each at 07:00:00)
find_light_switch_index(df)

[Timestamp('2017-04-21 07:00:00'),
 Timestamp('2017-04-22 07:00:00'),
 Timestamp('2017-04-23 07:00:00')]

### The following function creates a new dataframe for every "day". Each day starts when the lights turn on in the morning and ends when the lights turn on the next morning

In [27]:
#This function is used in my bigger functions to separate days. It creates a list of dataframes, with each
#dataframe containing the rows from one transition timestamp up to (but not including) the next one.
def create_list_of_df(transition_indexes_list, source_df=None):
    """Split a dataframe into one dataframe per pair of consecutive transitions.

    Parameters
    ----------
    transition_indexes_list : list
        Index labels marking the boundaries, in ascending order.
    source_df : pandas.DataFrame, optional
        Frame to split. When omitted, the notebook-level ``df`` is used,
        which is what the function did before this parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        ``len(transition_indexes_list) - 1`` frames. Frame ``k`` holds the
        rows with ``transition[k] <= index < transition[k + 1]``. The list
        is empty when fewer than two transitions are given.
    """
    if source_df is None:
        #fall back to the notebook-level dataframe so existing one-argument calls keep working
        source_df = df

    #pair every boundary with the one that follows it
    boundaries = zip(transition_indexes_list, transition_indexes_list[1:])
    return [
        source_df[(source_df.index >= start) & (source_df.index < stop)]
        for start, stop in boundaries
    ]

In [28]:
#Testing create list of dataframe function
#Step 1: collect the timestamps that mark the boundaries between days
first_step = find_light_switch_index(df)

#Step 2: cut df into one dataframe per pair of consecutive boundaries
second_step = create_list_of_df(first_step)

print(second_step)

[                     Light  Monitor 1 Spider 1  Total Minutes
Date-Time                                                    
2017-04-21 07:00:00      0                   0            421
2017-04-21 07:01:00      1                   1            422
2017-04-21 07:02:00      1                  10            423
2017-04-21 07:03:00      1                   9            424
2017-04-21 07:04:00      1                   6            425
...                    ...                 ...            ...
2017-04-22 06:55:00      0                   0            416
2017-04-22 06:56:00      0                   0            417
2017-04-22 06:57:00      0                   0            418
2017-04-22 06:58:00      0                   0            419
2017-04-22 06:59:00      0                   0            420

[1440 rows x 3 columns],                      Light  Monitor 1 Spider 1  Total Minutes
Date-Time                                                    
2017-04-22 07:00:00      0                 

### The following function will not be necessary when using the multiple spider dataframe, only for the test data

In [29]:
#Moves the spider activity column to the front of the dataframe, because the activity onset code
#reads the activity from the first column
def move_activity_column_to_0_index(df):
    """Move the 'Monitor 1 Spider 1' column of ``df`` to position 0.

    The frame is modified in place and the same object is returned.
    """
    activity_name = 'Monitor 1 Spider 1'
    #pop() removes the column and hands back its values, which are re-inserted at the front
    df.insert(0, activity_name, df.pop(activity_name))
    return df

In [30]:
# Test column switch function
# (the function reorders the columns of df itself, so 'Monitor 1 Spider 1' is the first column from here on)
move_activity_column_to_0_index(df)

Unnamed: 0_level_0,Monitor 1 Spider 1,Light,Total Minutes
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-21 00:00:00,0,0,1
2017-04-21 00:01:00,0,0,2
2017-04-21 00:02:00,0,0,3
2017-04-21 00:03:00,0,0,4
2017-04-21 00:04:00,0,0,5
...,...,...,...
2017-04-23 23:55:00,0,0,1436
2017-04-23 23:56:00,0,0,1437
2017-04-23 23:57:00,0,0,1438
2017-04-23 23:58:00,0,0,1439


### The following function is used to add a rolling average column over 30 bins to the original dataframe

##### *** This is why the spider activity must be in the first column! When more spiders are introduced to the dataframe, the for loop used to calculate activity onset places the spider name in column 1

In [31]:
#Create add rolling average to dataframe function
def add_rolling_average_column(df, window=30):
    """Add a 'Rolling' column holding the rolling mean of the first column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (position 0) holds the activity values.
    window : int, optional
        Number of rows in each rolling window. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place. Rows that do not yet
        have a full window behind them hold NaN in 'Rolling'.
    """
    #the activity is read by position, so it must be the first column
    activity = df.iloc[:, 0]

    #the rolling mean shares df's index, so it can be assigned directly;
    #dropping the leading NaN rows first would only force a realignment that puts them back
    df['Rolling'] = activity.rolling(window).mean()
    return df

In [32]:
#Test add rolling function
#(adds the 'Rolling' column to df; the first rows are NaN until a full 30-row window is available)
add_rolling_average_column(df)

Unnamed: 0_level_0,Monitor 1 Spider 1,Light,Total Minutes,Rolling
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-04-21 00:00:00,0,0,1,
2017-04-21 00:01:00,0,0,2,
2017-04-21 00:02:00,0,0,3,
2017-04-21 00:03:00,0,0,4,
2017-04-21 00:04:00,0,0,5,
...,...,...,...,...
2017-04-23 23:55:00,0,0,1436,0.0
2017-04-23 23:56:00,0,0,1437,0.0
2017-04-23 23:57:00,0,0,1438,0.0
2017-04-23 23:58:00,0,0,1439,0.0


In [39]:
#This function calculates the threshold value for each day and appends the first position where the
#threshold was crossed after lights off to a list (one entry per day)
def calculate_threshold(list_of_df, lights_off=1142):
    """Find, for each day, the first upward threshold crossing after ``lights_off``.

    For every frame the threshold is the mean of its 'Rolling' column plus
    one standard deviation of that column.

    Parameters
    ----------
    list_of_df : list of pandas.DataFrame
        One frame per day; each must have a 'Rolling' column.
    lights_off : int, optional
        Only crossings at a row position strictly greater than this value
        are considered. Defaults to 1142.

    Returns
    -------
    list
        One entry per frame: the row position (0-based, counted from the
        first row of that frame) of the first qualifying crossing, or the
        string ``'NaN'`` when there is none.
    """
    # NOTE(review): the crossing positions are row offsets within each day's frame, not
    # values of the 'Total Minutes' column. If the frames start at the light transition
    # rather than at midnight, confirm that `lights_off` is meant as an offset from the
    # first row of the frame.
    activity_onset_values = []
    for day_df in list_of_df:
        #calculate threshold: mean plus one standard deviation of the rolling average
        rolling = day_df['Rolling']
        threshold = rolling.mean() + rolling.std()

        #True wherever the above/below-threshold state changes from the previous row;
        #prepend=False makes a day that starts above the threshold count as a crossing at row 0
        above_threshold = (rolling > threshold).to_numpy()
        state_changes = np.diff(above_threshold, prepend=False)

        #state changes alternate upward, downward, upward, ... so every other one is an upward crossing
        upward_crossings = np.flatnonzero(state_changes)[::2]

        #keep only the crossings after lights off, as this is considered the start of the night
        night_crossings = upward_crossings[upward_crossings > lights_off]

        #append first threshold crossing to the activity onset list every day
        if night_crossings.size > 0:
            activity_onset_values.append(night_crossings[0])
        else:
            #string sentinel kept as-is because callers compare against 'NaN'
            activity_onset_values.append('NaN')

    return activity_onset_values
    

In [40]:
#Test calculate_threshold function using the find_light_switch_index and create_list_of_df functions
#(the day dataframes are rebuilt here so that they include the 'Rolling' column added to df above)
first_step = find_light_switch_index(df)

second_step = create_list_of_df(first_step)

third_step = calculate_threshold(second_step)

print(third_step)

['NaN', 1271]


### The following function is the final function for calculating activity onset; It takes the list of activity onsets and creates a dataframe for it

In [119]:
#Create activity onset dataframe function
def create_activity_onset_df(activity_onset_values, source_df=None, lights_off=1142):
    """Build a dataframe of activity onsets, one row per date.

    Parameters
    ----------
    activity_onset_values : list
        One entry per day: a number, or the string ``'NaN'`` (or a real
        NaN) for a day where the threshold was not crossed.
    source_df : pandas.DataFrame, optional
        Frame with a ``DatetimeIndex`` from which the dates are taken. When
        omitted, the notebook-level ``df`` is used, which is what the
        function did before this parameter existed.
    lights_off : int, optional
        Value subtracted from every onset. Defaults to 1142.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' (every unique date of ``source_df`` except the
        first) with a single 'Activity Onset' column. Missing days hold a
        real NaN, so the column stays numeric.

    Raises
    ------
    ValueError
        If the number of onset values differs from the number of dates.
    """
    if source_df is None:
        #fall back to the notebook-level dataframe so existing one-argument calls keep working
        source_df = df

    #the first date is skipped when labelling the onset values
    list_of_days_for_df = list(pd.unique(source_df.index.date))[1:]

    if len(activity_onset_values) != len(list_of_days_for_df):
        raise ValueError(
            'Expected one activity onset value per date after the first '
            f'({len(list_of_days_for_df)}), got {len(activity_onset_values)}'
        )

    #subtract lights_off from each calculated value to get activity onset relative to lights_off
    activity_onset_values_subtracted = []
    for value in activity_onset_values:
        is_missing = (isinstance(value, str) and value == 'NaN') or (
            not isinstance(value, str) and pd.isna(value)
        )
        if is_missing:
            #use a real NaN instead of the string so the column can be used in numeric operations
            activity_onset_values_subtracted.append(np.nan)
        else:
            activity_onset_values_subtracted.append(value - lights_off)

    activity_onset_df = pd.DataFrame(
        {'Activity Onset': activity_onset_values_subtracted},
        index=pd.Index(list_of_days_for_df, name='Date'),
    )

    print('NaN values represent days where threshold was not crossed')
    return activity_onset_df

In [120]:
#Test this function on the result of the three steps run in the previous test cell
#(third_step holds the list returned by calculate_threshold)

step4 = create_activity_onset_df(third_step)

display(step4)

NaN values represent days where threshold was not crossed


Unnamed: 0_level_0,Activity Onset
Date,Unnamed: 1_level_1
2017-04-22,
2017-04-23,129.0
