### Import packages and set up plot parameters

In [None]:
# Import packages
import re
import os

import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Ensure interactive mode enabled/disabled as needed
#plt.ion()

# Plot parameter overrides: ticks, axis labels and axis edges all use the colour "w"
params = dict.fromkeys(
    ("ytick.color", "xtick.color", "axes.labelcolor", "axes.edgecolor"),
    "w",
)
# Push the overrides into matplotlib's global runtime configuration
plt.rcParams.update(params)

### Load demographic information and species keys, merge together so each individual has a translated Species column

In [None]:
# Demographic table (one record per individual, per the section heading)
demoinfo = pd.read_csv("/home/moonmoon/FD/Native_Pop.csv")
# Species key table, with every column that contains a missing value removed
SpKey = pd.read_csv("/home/moonmoon/FD/SpeciesKey.csv").dropna(axis='columns')
# Inner join: keep only records whose 'Species' code has a matching 'Key'.
# Column names present in both tables receive pandas' default '_x'/'_y' suffixes
# (the loading loop later reads 'Species_y' -- presumably SpKey also has a 'Species' column).
demoinfo = demoinfo.merge(SpKey, how = 'inner', left_on = 'Species', right_on = "Key")

In [None]:
# Load every Albertson cut0 pickle, attach the species name from the demographic table,
# and store one proxy-timestamped, min-max-scaled track per individual in `dist`.

# Change current working directory
os.chdir("/home/moonmoon/FD/cut0s/Albertson/Group")

# Tracks keyed by '<SPECIES>_<file identifier>_<track id>'
dist = {}

# Frame rate used to convert frame numbers into seconds
FPS = 15

# Accumulates the demographic rows of every processed file
demodata = pd.DataFrame()

# List files in current working directory
list_files = os.listdir(os.getcwd())

# Proxy date (real date unnecessary); loop-invariant, so it is built once up front
dateproxy = datetime(2024, 8, 25, 0, 0, 0)

# Loop through all files in current working directory
for file in list_files:
    # Only .pkl files with cut0 in the name are processed
    if not (file.endswith('.pkl') and 'cut0' in file):
        continue

    # Split unnecessary file information off of filename
    namestr = re.split("-|crop|(?=\\d{2}_)|\\.", file)
    # Rejoin pieces 2 and 3 with '20' in between (adds '20' to the front of the year)
    namestr = f'{namestr[2]}20{namestr[3]}'

    # Filter demographic information by filename
    filedemo = demoinfo.loc[demoinfo['FileName'] == namestr].copy()
    # Fail early with a readable message; without this check the species lookup
    # further down raises a bare IndexError from .iloc[0]
    if filedemo.empty:
        raise IndexError(f'No demographic record with FileName {namestr!r} (derived from file {file!r})')

    # Find mean size of all fish in given file
    temp = filedemo['Size (inches)'].mean()
    # Override measured sizes by average size for body lengths calculation
    filedemo['Size (inches)'] = temp
    # Drop duplicates (all numbers should match now)
    filedemo = filedemo.drop_duplicates()
    # Append demographic information to dataframe
    demodata = pd.concat([demodata, filedemo], axis = 0)

    # Load the pickle into a temporary dataframe; the context manager closes the
    # file even if loading fails.
    # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
    with open(file, "rb") as myfile:
        temp = pd.DataFrame(pkl.load(myfile), columns = ['x1', 'y1', 'x2', 'y2', 'id', 'frame'])

    # Group temporary dataframe by ID to facilitate filtering of null detections
    grouped = temp.groupby('id')

    # Upper-cased species name of the first demographic row, used as the key prefix
    key = filedemo['Species_y'].iloc[0].upper()

    # Loop through grouped data storing name (track ID) and group content
    for name, group in grouped:
        # ID 0 is skipped (treated as a null detection)
        if name == 0:
            continue

        group = group.reset_index()

        # Find midpoint between x1 and x2, store in x variable
        group['x'] = (group['x1'] + group['x2']) / 2
        # Find midpoint between y1 and y2, store in y variable
        group['y'] = (group['y1'] + group['y2']) / 2

        # NOTE(review): the scalers are fit per track, so every individual's own range
        # is stretched to the full interval -- confirm this is intended.
        # Convert to artificial lng [0-1]
        group['x'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(pd.DataFrame(group['x']))
        # Convert to artificial lat [0-0.5]
        group['y'] = MinMaxScaler(feature_range=(0, 0.5)).fit_transform(pd.DataFrame(group['y']))

        # Divide frame series by frames per second, store as seconds
        seconds = group['frame'] / FPS
        # Create time proxy (real time unnecessary, may implement in future)
        timeproxy = pd.to_timedelta(seconds, unit = 's')
        # Merge date and time proxies for final proxy to use in MovingPandas analyses
        finalproxy = dateproxy + timeproxy
        # Replace the frame numbers with the proxy timestamps
        group['frame'] = finalproxy

        print(f'{key}_{namestr}_{name}')
        # Store filtered and pared data into the dictionary under species/file/ID key
        dist[f'{key}_{namestr}_{name}'] = pd.concat([group['frame'], group['x'], group['y'], group['id']], axis = 1)

# Ad hoc sort dist dictionary to order Species names/graphs
dist = dict(sorted(dist.items()))

TROPHEOPS KUMWERA_Cichlid1_10242022_2_1.0
TROPHEOPS KUMWERA_Cichlid1_10242022_2_2.0
TROPHEOPS KUMWERA_Cichlid1_10242022_2_3.0
TROPHEOPS KUMWERA_Cichlid1_10242022_2_4.0
TROPHEOPS KUMWERA_Cichlid1_10242022_2_5.0
MAYLANDIA FAIAZIBERI MAISON REEF_Cichlid2_01302023_1_1.0
MAYLANDIA FAIAZIBERI MAISON REEF_Cichlid2_01302023_1_2.0
MAYLANDIA FAIAZIBERI MAISON REEF_Cichlid2_01302023_1_3.0
MAYLANDIA FAIAZIBERI MAISON REEF_Cichlid2_01302023_1_4.0
MAYLANDIA FAIAZIBERI MAISON REEF_Cichlid2_01302023_1_5.0
TROPHEOPS KUMWERA_Cichlid2_01302023_2_1.0
TROPHEOPS KUMWERA_Cichlid2_01302023_2_2.0
TROPHEOPS KUMWERA_Cichlid2_01302023_2_3.0
TROPHEOPS KUMWERA_Cichlid2_01302023_2_4.0
TROPHEOPS KUMWERA_Cichlid2_01302023_2_5.0
METRIACLIMA "DAKTARI" (HAI REEF)_Cichlid2_04192023_2_1.0
METRIACLIMA "DAKTARI" (HAI REEF)_Cichlid2_04192023_2_2.0
METRIACLIMA "DAKTARI" (HAI REEF)_Cichlid2_04192023_2_3.0
METRIACLIMA "DAKTARI" (HAI REEF)_Cichlid2_04192023_2_4.0
METRIACLIMA "DAKTARI" (HAI REEF)_Cichlid2_04192023_2_5.0


### Delete extraneous variables to prevent crashing when additional data is loaded (janky troubleshooting)

In [None]:
# Remove the temporaries left in the notebook namespace by the Albertson loading cell,
# together with the demographic and species-key tables. `del` raises NameError if any
# listed name was never assigned (e.g. the loop stored no track), so run this only after
# that cell has completed. FPS and dist are not listed: later cells still read them.
del dateproxy, demoinfo, file, filedemo, finalproxy, group, grouped, key, list_files, myfile, name, namestr, seconds, SpKey, temp, timeproxy

### Load Cut0s into dictionary with species and file names as keys

In [None]:
# Load every Keene cut0 pickle and store one proxy-timestamped, min-max-scaled track per
# individual in `Kdist`. Relies on FPS having been defined by the Albertson loading cell.

# Change current working directory
os.chdir("/home/moonmoon/FD/cut0s/Keene/Group")

# Tracks keyed by the upper-cased file name (minus its leading piece) plus the track ID
Kdist = {}

# List files in current working directory
list_files = os.listdir(os.getcwd())

# Proxy date (real date unnecessary); loop-invariant, so it is built once up front
dateproxy = datetime(2024, 8, 25, 0, 0, 0)

# Loop through all files in current working directory
for file in list_files:
    # Only .pkl files with cut0 in the name are processed
    if not (file.endswith('.pkl') and 'cut0' in file):
        continue

    # Load the pickle into a temporary dataframe; the context manager closes the
    # file even if loading fails.
    # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
    with open(file, "rb") as myfile:
        temp = pd.DataFrame(pkl.load(myfile), columns = ['x1', 'y1', 'x2', 'y2', 'id', 'frame'])

    # Group temporary dataframe by ID to facilitate filtering of null detections
    grouped = temp.groupby('id')

    # Loop through grouped data storing name (track ID) and group content
    for name, group in grouped:
        # ID 0 is skipped (treated as a null detection)
        if name == 0:
            continue

        # Build the key: file name without extension plus the track ID ...
        namestr = f'{os.path.splitext(file)[0]}_{name}'
        # ... upper-cased to standardize capitalization ...
        namestr = namestr.upper()
        # ... then split at the first underscore; only the part after it (namestr[1]) is used
        namestr = namestr.split('_', maxsplit = 1)

        group = group.reset_index()

        # Find midpoint between x1 and x2, store in x variable
        group['x'] = (group['x1'] + group['x2']) / 2
        # Find midpoint between y1 and y2, store in y variable
        group['y'] = (group['y1'] + group['y2']) / 2

        # NOTE(review): the scalers are fit per track, so every individual's own range
        # is stretched to the full interval -- confirm this is intended.
        # Convert to artificial lng [0-1]
        group['x'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(pd.DataFrame(group['x']))
        # Convert to artificial lat [0-0.5]
        group['y'] = MinMaxScaler(feature_range=(0, 0.5)).fit_transform(pd.DataFrame(group['y']))

        # Divide frame series by frames per second, store as seconds
        seconds = group['frame'] / FPS
        # Create time proxy (real time unnecessary, may implement in future)
        timeproxy = pd.to_timedelta(seconds, unit = 's')
        # Merge date and time proxies for final proxy to use in MovingPandas analyses
        finalproxy = dateproxy + timeproxy
        # Replace the frame numbers with the proxy timestamps
        group['frame'] = finalproxy

        # Store filtered and pared data into the dictionary under the file-derived key
        Kdist[f'{namestr[1]}'] = pd.concat([group['frame'], group['x'], group['y'], group['id']], axis = 1)

# Ad hoc sort Kdist dictionary to order Species names/graphs
Kdist = dict(sorted(Kdist.items()))

### Delete extraneous variables to prevent crashing when additional data is loaded (janky troubleshooting)

In [None]:
# Remove the temporaries left in the notebook namespace by the Keene loading cell, and
# FPS along with them. `del` raises NameError if any listed name was never assigned
# (e.g. the loop stored no track), so run this only after that cell has completed.
del dateproxy, file, finalproxy, FPS, group, grouped, list_files, myfile, name, namestr, seconds, temp, timeproxy

### Extract dictionary keys, combine dictionaries, create species list from dictionary keys

In [None]:
# Every track key from both sources: Keene keys first, then Albertson keys
DictKeys = [*Kdist, *dist]

# Merge both dictionaries (on a key clash the `dist` entry replaces the `Kdist` one),
# then rebuild the result in sorted key order
sorted_dist = {**Kdist, **dist}
sorted_dist = dict(sorted(sorted_dist.items()))

# The text before the first underscore of each key is taken as the species name
SpeciesList = []
for name in DictKeys:
    temp = name.split("_", maxsplit = 1)
    SpeciesList.extend(temp[:1])

# Reduce to the sorted array of distinct species names
SpeciesList = np.unique(SpeciesList)

In [None]:
# Drop the two source dictionaries (already merged into sorted_dist), the key list and
# the loop temporaries of the species extraction. `del` raises NameError if any listed
# name was never assigned (e.g. `name`/`temp` when there were no keys to loop over).
del DictKeys, dist, Kdist, name, temp

In [None]:
# Write the combined, key-sorted track dictionary to disk as a pickle.
# The context manager flushes and closes the file even if pkl.dump raises.
with open('/home/moonmoon/FD/_Output/sorted_dist.pkl', "wb") as myfile:
    pkl.dump(sorted_dist, myfile)