In [None]:
# Import libraries
import numpy as np
import pandas as pd
import pickle as pkl
import statistics as stats

from thefuzz import process

In [None]:
# Load the stops dictionary (video ID -> DataFrame of detected stops) from disk.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("/home/moonmoon/KeeneCollab/_Output/stops.pkl", "rb") as myfile:
    stops = pkl.load(myfile)

# Store stops keys as DictKeys
DictKeys = list(stops.keys())

# Species name is the part of each video ID before the first underscore;
# np.unique returns the sorted, de-duplicated names as a numpy array
SpeciesList = np.unique([name.split("_", maxsplit = 1)[0] for name in DictKeys])

# Make list of species names of all Keene videos
KeeneList = ['CALLIPTERA', 'KUMWERA', 'LABROSUS', 'STUARTGRANTI']

# Every species name that is not in KeeneList is treated as an Albertson video
AlbertsonList = [x for x in SpeciesList if x not in KeeneList]

# Keys of videos that have no usable stop detections
zerodets = []

# Loop through all key, value pairs in stops dictionary
for key, value in stops.items():
    # Extract current species name from key
    currentName = key.split("_", maxsplit = 1)[0]
    starts = value['Start']

    # A float minimum means Start holds no datetimes (presumably a NaN
    # placeholder written upstream when nothing was detected - TODO confirm).
    # An empty frame is flagged too: min() would raise ValueError on it.
    if len(starts) == 0 or isinstance(min(starts), float):
        zerodets.append(key)
        continue

    if currentName in KeeneList:
        # Set zeit-time (0-23h) by extracting floor of stored datetime object reflecting frame of the video (Keene videos start at ZT 1, an hour after lights on)
        value['ZT'] = starts.dt.floor('h').dt.hour
    else:
        # Every remaining species is in AlbertsonList by construction.
        # Set zeit-time (0-23h) by extracting ceiling of stored datetime object reflecting frame of the video (Albertson videos start at ZT 0, at lights on)
        # NOTE(review): ceil('h') rolls 23:xx over to hour 0 of the next day,
        # which the bins below label 'Day' - confirm this is intended.
        value['ZT'] = starts.dt.ceil('h').dt.hour

    # With include_lowest, ZT 0-13 is labelled 'Day' and ZT 14-23 'Night'
    value['Bin'] = pd.cut(value['ZT'], bins = [0, 13, 23], labels = ['Day', 'Night'], include_lowest = True)

In [None]:
# Remove every video flagged as having zero detections.
# pop(..., None) rather than `del` keeps the cell idempotent: re-running it
# after the keys are already gone no longer raises KeyError.
for i in zerodets:
    stops.pop(i, None)

# Report how many stop rows remain for each video whose key mentions KUMWERA
for key, value in stops.items():
    if 'KUMWERA' in key:
        # Print key to length of value
        print(f'{key} to {len(value)}')

KUMWERA_2M-3F_V1_1.0 to 15
KUMWERA_2M-3F_V1_2.0 to 12
KUMWERA_2M-3F_V1_3.0 to 12
KUMWERA_2M-3F_V1_4.0 to 10
KUMWERA_2M-3F_V1_5.0 to 13
KUMWERA_2M-3F_V2_1.0 to 29
KUMWERA_2M-3F_V2_2.0 to 26
KUMWERA_2M-3F_V2_3.0 to 27
KUMWERA_2M-3F_V2_4.0 to 22
KUMWERA_2M-3F_V2_5.0 to 20
TROPHEOPS KUMWERA_Cichlid1_10242022_2_2.0 to 2
TROPHEOPS KUMWERA_Cichlid1_10242022_2_3.0 to 2
TROPHEOPS KUMWERA_Cichlid1_10242022_2_5.0 to 2
TROPHEOPS KUMWERA_Cichlid2_01302023_2_1.0 to 55
TROPHEOPS KUMWERA_Cichlid2_01302023_2_2.0 to 47
TROPHEOPS KUMWERA_Cichlid2_01302023_2_3.0 to 55
TROPHEOPS KUMWERA_Cichlid2_01302023_2_4.0 to 50
TROPHEOPS KUMWERA_Cichlid2_01302023_2_5.0 to 53


In [None]:
# Create empty dictionaries to store stop statistics, keyed by species name:
#   grouped    -> all stop rows of the species concatenated into one DataFrame
#   durations  -> per-video fraction of the whole recording spent stopped
#   dDurations -> per-video fraction of the Day bin spent stopped
#   nDurations -> per-video fraction of the Night bin spent stopped
grouped = {}
durations = {}
dDurations = {}
nDurations = {}

# Denominators in seconds: 86400 s = 24 h, 50400 s = 14 h, 36000 s = 10 h
SECONDS_TOTAL = 86400
SECONDS_DAY = 50400
SECONDS_NIGHT = 36000

# Per-species list of frames; concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration
chunks = {}
# Species handled on the previous iteration (None until the first one is seen)
startName = None

# Loop over key, value pairs in stops dictionary
for key, value in stops.items():
    # Species name is the part of the key before the first underscore
    currentName = key.split("_", maxsplit = 1)[0]
    # KUMWERA videos are processed in a separate cell with different denominators
    if 'KUMWERA' in currentName:
        continue

    if currentName not in chunks:
        # First video of this species: open its accumulators
        if startName is None:
            print(f'Start {currentName}')
        else:
            print(f'End {startName}; Start {currentName}')
        chunks[currentName] = []
        durations[currentName] = []
        dDurations[currentName] = []
        nDurations[currentName] = []
    else:
        # Further video of an already-seen species. Accumulating by name means
        # a species whose keys are not contiguous is merged, not overwritten.
        print(f'Chunk {currentName}')
    startName = currentName

    chunks[currentName].append(value)

    # Sum of all stop durations in seconds divided by seconds in the recording
    durations[currentName].append(sum(value['Duration_s'])/SECONDS_TOTAL)

    # Stops occurring during the Day, as a fraction of the Day length
    dValue = value[value['Bin'] == 'Day']
    dDurations[currentName].append(sum(dValue['Duration_s'])/SECONDS_DAY)

    # Stops occurring during the Night, as a fraction of the Night length
    nValue = value[value['Bin'] == 'Night']
    nDurations[currentName].append(sum(nValue['Duration_s'])/SECONDS_NIGHT)

# Close the last species (nothing to report if no video qualified)
if startName is not None:
    print(f'End {startName}')

# Build one combined DataFrame per species
for speciesName, frames in chunks.items():
    grouped[speciesName] = pd.concat(frames, axis = 0)

Start LABROSUS
Chunk LABROSUS
Chunk LABROSUS
Chunk LABROSUS
Chunk LABROSUS
End LABROSUS; Start MAYLANDIA FAIAZIBERI MAISON REEF
Chunk MAYLANDIA FAIAZIBERI MAISON REEF
Chunk MAYLANDIA FAIAZIBERI MAISON REEF
Chunk MAYLANDIA FAIAZIBERI MAISON REEF
Chunk MAYLANDIA FAIAZIBERI MAISON REEF
End MAYLANDIA FAIAZIBERI MAISON REEF; Start METRIACLIMA "DAKTARI" (HAI REEF)
Chunk METRIACLIMA "DAKTARI" (HAI REEF)
Chunk METRIACLIMA "DAKTARI" (HAI REEF)
Chunk METRIACLIMA "DAKTARI" (HAI REEF)
Chunk METRIACLIMA "DAKTARI" (HAI REEF)
End METRIACLIMA "DAKTARI" (HAI REEF)


In [None]:
# Pool every video whose key contains 'KUMWERA' into a single group.
# NOTE(review): these denominators are exactly 4x the 86400 / 50400 / 36000
# seconds used for the other species - presumably a longer recording; confirm.
KUMWERA_SECONDS_TOTAL = 345600
KUMWERA_SECONDS_DAY = 201600
KUMWERA_SECONDS_NIGHT = 144000

# Fresh accumulators so nothing left over from an earlier cell can leak in
kumweraFrames = []
duration = []
dDuration = []
nDuration = []
currentName = None

# Loop over key, value pairs in stops dictionary
for key, value in stops.items():
    if 'KUMWERA' not in key:
        continue
    # Species name is the part of the key before the first underscore
    currentName = key.split("_", maxsplit = 1)[0]
    # First pooled video prints 'Start', all later ones 'Chunk'
    if not kumweraFrames:
        print(f'Start {currentName}')
    else:
        print(f'Chunk {currentName}')

    kumweraFrames.append(value)

    # Sum of all stop durations in seconds divided by the total denominator
    duration.append(sum(value['Duration_s'])/KUMWERA_SECONDS_TOTAL)

    # Stops occurring during the Day, divided by the Day denominator
    dValue = value[value['Bin'] == 'Day']
    dDuration.append(sum(dValue['Duration_s'])/KUMWERA_SECONDS_DAY)

    # Stops occurring during the Night, divided by the Night denominator
    nValue = value[value['Bin'] == 'Night']
    nDuration.append(sum(nValue['Duration_s'])/KUMWERA_SECONDS_NIGHT)

# Store the pooled results only if at least one KUMWERA video was found;
# without this guard stale variables from an earlier cell would be saved.
if kumweraFrames:
    # The pooled group is keyed by the species name of the LAST matching key
    print(f'End {currentName}')
    # Concatenate once rather than on every loop iteration
    grouped[currentName] = pd.concat(kumweraFrames, axis = 0)
    durations[currentName] = duration
    dDurations[currentName] = dDuration
    nDurations[currentName] = nDuration

Start KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
Chunk TROPHEOPS KUMWERA
End TROPHEOPS KUMWERA


In [None]:
# Read in mbuna key
key = pd.read_csv('/home/moonmoon/KeeneCollab/mbunaKey.csv')
# Sort rows by Mbuna x Species, reset index and drop
key = key.sort_values(['Mbuna', 'Species']).reset_index(drop = True)

# Store stops keys in pkeys (not used further in this cell)
pkeys = pd.DataFrame(stops.keys())

# Parallel lists: keys[i] is a current group name, matches[i] its best
# fuzzy match among the 'Species' column of the mbuna key
keys = []
matches = []

# Loop through grouped keys
for pkey in grouped.keys():
    # extractOne returns the best-scoring choice; position 0 is the matched string
    match = process.extractOne(pkey, key['Species'])
    matches.append(match[0])
    keys.append(pkey)

# Two groups collapsing onto one canonical name would silently discard data,
# so stop with an explicit error instead
collisions = sorted({m for m in matches if matches.count(m) > 1})
if collisions:
    raise ValueError(f'Multiple species groups fuzzy-match the same mbunaKey entry: {collisions}')

# Old name -> canonical name
renames = dict(zip(keys, matches))

# Rename each dictionary in place. The renamed copy is built before the
# original is cleared, so a new name that equals another group's old name
# cannot overwrite that group's data part-way through.
for table in (grouped, durations, dDurations, nDurations):
    renamed = {renames[oldkey]: val for oldkey, val in table.items()}
    table.clear()
    table.update(renamed)

In [None]:
# Open pickle for writing
myfile = open(f'/home/moonmoon/KeeneCollab/_Output/StopDetection/grouped.pkl', "wb")
# Dump pickle data into file and seal up for sleepies
pkl.dump(grouped, myfile)
# Close file connection
myfile.close()

In [None]:
# Summarise each species' per-video stop values as mean, sample SD and SE.
# Rows are collected in a list and the DataFrame is built once: this keeps the
# statistics numeric, works for a single species, and never reuses a
# `totalstops` left over from another cell.
rows = []

# Loop through key, value pairs
for catkey, value in durations.items():
    n = len(value)
    mean = sum(value) / n
    # Sample SD needs at least two values; record NaN for a single-video species
    sd = stats.stdev(value) if n > 1 else float('nan')
    # Standard error = SD / sqrt(n)
    rows.append([catkey, mean, sd, sd / n ** 0.5])

# Store totalstops as a dataframe with appropriate column names
# ('Stop_%' holds the mean exactly as computed above; nothing is multiplied by 100)
totalstops = pd.DataFrame(rows, columns = ['Species', 'Stop_%', 'Stop_SD', 'Stop_SE'])

# Save totalstops as a csv
totalstops.to_csv('/home/moonmoon/KeeneCollab/_Output/StopDetection/totalstops.csv', index = False)

In [None]:
# Summarise each species' per-video Day stop values as mean, sample SD and SE.
# Rows are collected in a list and the DataFrame is built once: this keeps the
# statistics numeric, works for a single species, and never reuses a
# `totalstops` left over from another cell.
rows = []

# Loop through key, value pairs
for catkey, value in dDurations.items():
    n = len(value)
    mean = sum(value) / n
    # Sample SD needs at least two values; record NaN for a single-video species
    sd = stats.stdev(value) if n > 1 else float('nan')
    # Standard error = SD / sqrt(n)
    rows.append([catkey, mean, sd, sd / n ** 0.5])

# Store totalstops as a dataframe with appropriate column names
# ('Stop_%' holds the mean exactly as computed above; nothing is multiplied by 100)
totalstops = pd.DataFrame(rows, columns = ['Species', 'Stop_%', 'Stop_SD', 'Stop_SE'])

# Save totalstops as a csv
totalstops.to_csv('/home/moonmoon/KeeneCollab/_Output/StopDetection/daystops.csv', index = False)


In [None]:
# Summarise each species' per-video Night stop values as mean, sample SD and SE.
# Rows are collected in a list and the DataFrame is built once: this keeps the
# statistics numeric, works for a single species, and never reuses a
# `totalstops` left over from another cell.
rows = []

# Loop through key, value pairs
for catkey, value in nDurations.items():
    n = len(value)
    mean = sum(value) / n
    # Sample SD needs at least two values; record NaN for a single-video species
    sd = stats.stdev(value) if n > 1 else float('nan')
    # Standard error = SD / sqrt(n)
    rows.append([catkey, mean, sd, sd / n ** 0.5])

# Store totalstops as a dataframe with appropriate column names
# ('Stop_%' holds the mean exactly as computed above; nothing is multiplied by 100)
totalstops = pd.DataFrame(rows, columns = ['Species', 'Stop_%', 'Stop_SD', 'Stop_SE'])

# Save totalstops as a csv
totalstops.to_csv('/home/moonmoon/KeeneCollab/_Output/StopDetection/nightstops.csv', index = False)