# Dataset Extraction into Three Excel Files

## 1. Brain Mass & Body Length Data
This file contains measurements of brain mass and body length for 339 individuals across 23 species.

## 2. Head Shape Landmark Data
This file includes head shape data for 23 species using 12 homologous landmarks, formatted in NTS.

## 3. Molecular-Morphological Phylogeny
This file presents a Nexus-format phylogenetic description of the species based on molecular and morphological data.

In [4]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
import pandas as pd
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [6]:
# Re-read the raw dataset text file into a list of lines.
# A forward slash is used as the separator: "\S" inside a normal string literal
# is an invalid escape sequence, and a backslash is only a path separator on
# Windows. Every other path in this notebook already uses "/".
with open("Data Pre-Processing/Syngnathid brain size.txt", "r", encoding="utf-8") as file:
    data = file.readlines()

In [7]:
# One independent accumulator list per section of the raw file
brain_mass_data, head_shape_data, phylogeny_data = [], [], []

# No section header has been encountered yet
current_section = None

In [8]:
# Header marker -> section name. Insertion order fixes the order in which the
# markers are tested for each line.
section_markers = {
    "#1: Species Data": "brain_mass",
    "#2 Head morphology data": "head_shape",
    "#3 Molecular-morphological phylogeny": "phylogeny",
}

for line in data:
    line = line.strip()  # drop surrounding whitespace, including the newline

    # A line containing a marker switches the active section and is not stored.
    opened = next((name for marker, name in section_markers.items() if marker in line), None)
    if opened is not None:
        current_section = opened
        continue

    # Blank lines are never stored, whichever section is active.
    if not line:
        continue

    if current_section == "brain_mass":
        brain_mass_data.append(line.split("\t"))  # tab-separated fields
    elif current_section == "head_shape":
        head_shape_data.append(line.split())  # whitespace-separated fields
    elif current_section == "phylogeny":
        phylogeny_data.append([line])  # whole line kept as a single text cell

In [9]:
# Build the brain-mass DataFrame with explicit column names.
brain_mass_columns = [
    "Species",
    "Sex",
    "Length (mm)",
    "Brain weight (mg)",
    "Preservative",
    "Sampled location",
    "Sample code",
]
# The first parsed row is skipped (presumably the file's own header line).
brain_mass_rows = brain_mass_data[1:]
brain_mass_df = pd.DataFrame(data=brain_mass_rows, columns=brain_mass_columns)

In [10]:
# Head shape data: each whitespace-split line becomes one row. No row is
# promoted to a header, so the columns get default integer labels.
head_shape_df = pd.DataFrame(data=head_shape_data)

# Phylogeny: one text line per row, in a single "Phylogeny" column
phylogeny_df = pd.DataFrame(data=phylogeny_data, columns=["Phylogeny"])

In [11]:
# Write each extracted section to its own workbook, without the row index
excel_outputs = {
    "Data Pre-Processing/brain_mass_body_length.xlsx": brain_mass_df,
    "Data Pre-Processing/head_shape.xlsx": head_shape_df,
    "Data Pre-Processing/molecular_phylogeny.xlsx": phylogeny_df,
}
for output_path, frame in excel_outputs.items():
    frame.to_excel(output_path, index=False)

# Data Visualisation of brain_mass_body_length.xlsx

In [12]:
# Read the saved brain mass / body length workbook back into a DataFrame
brain_mass_path = "Data Pre-Processing/brain_mass_body_length.xlsx"
brain_mass_df = pd.read_excel(io=brain_mass_path)

In [49]:
# Number of rows in the brain mass table
print(len(brain_mass_df))

339


In [13]:
# Only the last expression of a cell is rendered, so a `head(5)` placed before
# this line would be evaluated and silently discarded.
# `head(-5)` returns every row except the last five.
brain_mass_df.head(-5)

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Sampled location,Sample code
0,Corythoichthys haematopterus,f,112.6,3.27,Formalin,Japan,
1,Corythoichthys haematopterus,f,115.0,3.97,Formalin,Japan,
2,Corythoichthys haematopterus,f,121.4,4.83,Formalin,Japan,
3,Corythoichthys haematopterus,f,122.4,4.09,Formalin,Japan,
4,Corythoichthys haematopterus,f,128.0,4.29,Formalin,Japan,
...,...,...,...,...,...,...,...
329,Syngnathus typhle,m,204.0,11.30,Formalin,Sweden,
330,Syngnathus typhle,m,176.0,10.20,Formalin,Sweden,
331,Syngnathus typhle,m,166.0,9.40,Formalin,Sweden,
332,Syngnathus typhle,m,208.0,10.60,Formalin,Sweden,


We will rename 'Sampled location' to 'Location', 'Brain weight (mg)' to 'Brain_weight', and 'Length (mm)' to 'Length'.

In [None]:
# Rename multiple columns for consistency.
# The source column is "Sampled location" (not "Sample location"). `rename`
# silently ignores keys that match no column, so a misspelt key leaves the
# column untouched and every later lookup of "Location" fails.
brain_mass_df.rename(columns={
    "Sampled location": "Location",
    "Brain weight (mg)": "Brain_weight",
    "Length (mm)": "Length"
}, inplace=True)

# Fail fast if the rename did not take effect. Re-running the cell is safe:
# already-renamed columns still satisfy the check.
assert {"Location", "Brain_weight", "Length"} <= set(brain_mass_df.columns), "column rename did not apply"

brain_mass_df.to_excel("data/brain_mass_body_length.xlsx", index=False)

In [15]:
# Show the first five rows to check that the rename took effect
brain_mass_df.head(5)

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Location,Sample code
0,Corythoichthys haematopterus,f,112.6,3.27,Formalin,Japan,
1,Corythoichthys haematopterus,f,115.0,3.97,Formalin,Japan,
2,Corythoichthys haematopterus,f,121.4,4.83,Formalin,Japan,
3,Corythoichthys haematopterus,f,122.4,4.09,Formalin,Japan,
4,Corythoichthys haematopterus,f,128.0,4.29,Formalin,Japan,


In [16]:
# Print column dtypes and non-null counts
brain_mass_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Species            339 non-null    object 
 1   Sex                339 non-null    object 
 2   Length (mm)        339 non-null    float64
 3   Brain weight (mg)  339 non-null    float64
 4   Preservative       339 non-null    object 
 5   Location           339 non-null    object 
 6   Sample code        33 non-null     object 
dtypes: float64(2), object(5)
memory usage: 18.7+ KB


# Inference:
There are 339 non-null values in every column except 'Sample code', which has only 33.

We will drop 'Sample code' column as it is unnecessary and only has 33 values. Then, we will see the unique values of each column and check if there are any irregularities.

In [17]:
# Remove the "Sample code" column, then save the result.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
brain_mass_df.drop(labels=["Sample code"], axis="columns", inplace=True, errors="ignore")
brain_mass_df.to_excel("data/brain_mass_body_length.xlsx", index=False)

In [18]:
# Show the first five rows to check that the column was dropped
brain_mass_df.head(5)

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Location
0,Corythoichthys haematopterus,f,112.6,3.27,Formalin,Japan
1,Corythoichthys haematopterus,f,115.0,3.97,Formalin,Japan
2,Corythoichthys haematopterus,f,121.4,4.83,Formalin,Japan
3,Corythoichthys haematopterus,f,122.4,4.09,Formalin,Japan
4,Corythoichthys haematopterus,f,128.0,4.29,Formalin,Japan


In [19]:
# Count of distinct values per column
brain_mass_df.nunique()

Species               24
Sex                    2
Length (mm)          224
Brain weight (mg)    267
Preservative           2
Location              17
dtype: int64

Now we will see all the unique values in Species, Preservatives and Location

In [20]:
# Distinct species names, in order of first appearance
brain_mass_df["Species"].unique()

array(['Corythoichthys haematopterus', 'Corythoichthys intestinalis',
       'Doriyichthys boaja', 'Doriyichthys martensi',
       'Doryhampus dactyliophorus', 'Doryrhamphus japonicus',
       'Entelurus aequoreus', 'Hippichthys penicillus',
       'Hippocampus abdominalis', 'Hippocampus addominalis',
       'Hippocampus comes', 'Hippocampus hippocampus', 'Hippocampus kuda',
       'Hippocampus spinosissimus', 'Hippocampus trimaculatus',
       'Microphis brachyurus', 'Nerophis lumbriciformis',
       'Nerophis ophidion', 'Syngnathoides biaculeatus',
       'Syngnathus abaster', 'Syngnathus acus', 'Syngnathus schlegeli',
       'Syngnathus typhle', 'Trachyhampus serratus'], dtype=object)

There is a spelling error in one of the species names: "Hippocampus addominalis" should be "Hippocampus abdominalis". We will first look at the rows that carry the incorrect name and then correct it.

In [21]:
# Select the rows that carry the misspelt species name "Hippocampus addominalis"
misspelt_mask = brain_mass_df["Species"].eq("Hippocampus addominalis")
incorrect_rows = brain_mass_df.loc[misspelt_mask]
incorrect_rows

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Location
124,Hippocampus addominalis,f,184.5,15.8,Formalin,Australia
125,Hippocampus addominalis,f,146.7,10.1,Formalin,Australia
126,Hippocampus addominalis,f,156.6,20.2,Formalin,Australia
127,Hippocampus addominalis,m,151.0,10.9,Formalin,Australia
128,Hippocampus addominalis,m,165.0,27.1,Formalin,Australia
129,Hippocampus addominalis,m,136.8,12.1,Formalin,Australia
130,Hippocampus addominalis,m,156.1,14.6,Formalin,Australia


In [22]:
# Map the misspelt species name to its correct spelling, then save the result
species_fixes = {"Hippocampus addominalis": "Hippocampus abdominalis"}
brain_mass_df["Species"] = brain_mass_df["Species"].replace(species_fixes)
brain_mass_df.to_excel("data/brain_mass_body_length.xlsx", index=False)

Next, we list the unique values in each column to check whether anything needs to be changed or removed.

In [23]:
# Compute the distinct species once, report how many there are, then show them
species_names = brain_mass_df["Species"].unique()
length = len(species_names)
print("Length of 'Species':", length)
species_names

Length of 'Species': 23


array(['Corythoichthys haematopterus', 'Corythoichthys intestinalis',
       'Doriyichthys boaja', 'Doriyichthys martensi',
       'Doryhampus dactyliophorus', 'Doryrhamphus japonicus',
       'Entelurus aequoreus', 'Hippichthys penicillus',
       'Hippocampus abdominalis', 'Hippocampus comes',
       'Hippocampus hippocampus', 'Hippocampus kuda',
       'Hippocampus spinosissimus', 'Hippocampus trimaculatus',
       'Microphis brachyurus', 'Nerophis lumbriciformis',
       'Nerophis ophidion', 'Syngnathoides biaculeatus',
       'Syngnathus abaster', 'Syngnathus acus', 'Syngnathus schlegeli',
       'Syngnathus typhle', 'Trachyhampus serratus'], dtype=object)

In [24]:
# Distinct preservative values
brain_mass_df["Preservative"].unique()

array(['Formalin', 'Ethanol'], dtype=object)

In [25]:
# Distinct sampling locations
brain_mass_df["Location"].unique()

array(['Japan', 'Malaysia', 'Micronesia', 'Marshall', 'Tokushima (Japan)',
       'Hahajima (Japan)', 'Ioujima (Japan)', 'Izu (Japan)',
       'Miyakejima (Japan)', 'Sweden', 'Australia', 'Okinawa (Japan)',
       'Portugal', 'Otsuchi bay (Japan)', 'Taiwan', 'Surugawan (Japan)',
       'Hyogo (Japan)'], dtype=object)

In [26]:
# Summary statistics for the numeric columns
brain_mass_df.describe()

Unnamed: 0,Length (mm),Brain weight (mg)
count,339.0,339.0
mean,175.572566,8.615929
std,65.259763,5.717701
min,51.5,1.7
25%,124.65,4.505
50%,162.0,7.99
75%,218.5,10.315
max,415.0,32.1


# Inference

## 1. Species  
There are **23** species in the dataset. They are:  

### **Seahorses (Genus: Hippocampus)**
- *Hippocampus abdominalis*  
- *Hippocampus comes*  
- *Hippocampus hippocampus*  
- *Hippocampus kuda*  
- *Hippocampus spinosissimus*  
- *Hippocampus trimaculatus*  

### **Pipefishes (Various Genera)**
- **Tropical Reef Pipefishes:** *Corythoichthys haematopterus*, *Corythoichthys intestinalis*  
- **Freshwater Pipefishes:** *Doriyichthys boaja*, *Doriyichthys martensi*  
- **Cleaner Pipefishes:** *Doryhampus dactyliophorus*, *Doryrhamphus japonicus*  
- **Northern Pipefishes:** *Entelurus aequoreus*  
- **Mangrove Pipefishes:** *Hippichthys penicillus*  
- **River Pipefishes:** *Microphis brachyurus*  
- **Eel-like Pipefishes:** *Nerophis lumbriciformis*, *Nerophis ophidion*  
- **Alligator Pipefish:** *Syngnathoides biaculeatus*  
- **Common Pipefishes:** *Syngnathus abaster*, *Syngnathus acus*, *Syngnathus schlegeli*, *Syngnathus typhle*  
- **Deep-water Pipefishes:** *Trachyhampus serratus*  

---

2. Sex  
There are *2* sexes: *m* or *f*.  

---

3. Length  
- Mean Length: *175.57 mm*  
- Minimum Length: *51.5 mm*  
- Maximum Length: *415 mm*  

---

4. Brain Weight  
- Mean Brain Weight: *8.62 mg*  
- Minimum Brain Weight: *1.7 mg*  
- Maximum Brain Weight: *32.1 mg*  

---

5. Preservative  
There are *2* preservatives used:  
- Formalin  
- Ethanol  

---

6. Location  
There are **17** locations from which data was sampled. They are:  
*Japan, Malaysia, Micronesia, Marshall, Tokushima (Japan), Hahajima (Japan), Ioujima (Japan), Izu (Japan), Miyakejima (Japan), Sweden, Australia, Okinawa (Japan), Portugal, Otsuchi bay (Japan), Taiwan, Surugawan (Japan), Hyogo (Japan)*  


Since the species fall into two groups (seahorses and pipefishes), we can split the data into two Excel files for easier access.

In [29]:
# Species belonging to the seahorse genus (Hippocampus)
seahorse_species = [
    "Hippocampus abdominalis",
    "Hippocampus comes",
    "Hippocampus hippocampus",
    "Hippocampus kuda",
    "Hippocampus spinosissimus",
    "Hippocampus trimaculatus",
]

# Boolean mask: True where the row's species is in the seahorse list.
# Rows matching the mask are seahorses; all remaining rows are pipefishes.
is_seahorse = brain_mass_df["Species"].isin(seahorse_species)
brain_mass_seahorses_df = brain_mass_df[is_seahorse]
brain_mass_pipefishes_df = brain_mass_df[~is_seahorse]

# Destination workbooks
seahorses_file_path = "Data Pre-Processing/seahorses.xlsx"
pipefishes_file_path = "Data Pre-Processing/pipefishes.xlsx"

# Write each group to its own workbook, without the row index
for split_df, split_path in (
    (brain_mass_seahorses_df, seahorses_file_path),
    (brain_mass_pipefishes_df, pipefishes_file_path),
):
    split_df.to_excel(split_path, index=False)

# Encoding Categorical Data for GA and RL

Now, as we want to implement both **Genetic Algorithm (GA)** and **Reinforcement Learning (RL)**, we need to encode the categorical data differently for each.

## Encoding for Genetic Algorithms (GA)  
**Use Label Encoding**  

GA works well with numerical representations rather than sparse matrices.  
One-hot encoding would create too many dimensions, making crossover and mutation inefficient.  
Label encoding assigns each category a unique integer, making crossover and mutation operations more meaningful.  

## Encoding for Reinforcement Learning (RL)  
**Use One-Hot Encoding**  

RL learns from state transitions, and categorical values shouldn't have a numerical hierarchy.  
One-hot encoding ensures the RL model treats each category as an independent entity.  
Label encoding may mislead RL into thinking higher numbers mean better categories (e.g., `Species_A = 1`, `Species_B = 2` suggests `B > A`).  


In [31]:
# Workbook locations for the two groups
seahorses_file_path = "Data Pre-Processing/seahorses.xlsx"
pipefishes_file_path = "Data Pre-Processing/pipefishes.xlsx"

# Read both workbooks, seahorses first
df_seahorses, df_pipefishes = (
    pd.read_excel(workbook) for workbook in (seahorses_file_path, pipefishes_file_path)
)

# For GA Algorithm, we will use label encoding.

In [None]:
def apply_label_encoding(df, columns=("Species", "Preservative", "Location")):
    """Return a copy of ``df`` with the given columns replaced by integer labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    columns : iterable of str, optional
        Columns to encode. The default is the three columns this function has
        always encoded, so existing one-argument calls behave as before.
        "Sex" is not part of the default and is therefore left unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column holds the output of a
        freshly fitted ``LabelEncoder``.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.

    Notes
    -----
    A new encoder is fitted per column on every call, so the integer codes
    produced by two separate calls (e.g. seahorses vs. pipefishes) are not
    comparable with each other. The fitted encoders are not returned.
    """
    df_encoded = df.copy()
    for col in columns:
        # Fit and apply in one step; the encoder itself is not kept.
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
    return df_encoded

# Encode each group with its own call
df_seahorses_ga = apply_label_encoding(df=df_seahorses)
df_pipefishes_ga = apply_label_encoding(df=df_pipefishes)

# Destination workbooks for the label-encoded (GA) datasets
seahorses_ga_file = "GA ALgorithm/seahorses_ga.xlsx"
pipefishes_ga_file = "GA ALgorithm/pipefishes_ga.xlsx"

# Write both encoded frames, without the row index
for ga_frame, ga_path in (
    (df_seahorses_ga, seahorses_ga_file),
    (df_pipefishes_ga, pipefishes_ga_file),
):
    ga_frame.to_excel(ga_path, index=False)

In [51]:
# First rows of the label-encoded seahorse data
df_seahorses_ga.head()

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Location
0,0,m,135.0,12.6,1,0
1,0,f,184.5,15.8,1,0
2,0,f,146.7,10.1,1,0
3,0,f,156.6,20.2,1,0
4,0,m,151.0,10.9,1,0


In [50]:
# First rows of the label-encoded pipefish data
df_pipefishes_ga.head()

Unnamed: 0,Species,Sex,Length (mm),Brain weight (mg),Preservative,Location
0,0,f,112.6,3.27,1,4
1,0,f,115.0,3.97,1,4
2,0,f,121.4,4.83,1,4
3,0,f,122.4,4.09,1,4
4,0,f,128.0,4.29,1,4


# For RL Algorithm, we will use one-hot encoding.

In [None]:
# Columns expanded into indicator columns. With drop_first=True the first
# category of each column is dropped from the result.
one_hot_columns = ["Species", "Preservative", "Location"]
df_seahorses_rl = pd.get_dummies(df_seahorses, columns=one_hot_columns, drop_first=True)
df_pipefishes_rl = pd.get_dummies(df_pipefishes, columns=one_hot_columns, drop_first=True)

# Destination workbooks for the one-hot encoded (RL) datasets
seahorses_rl_file = "RL Algorithm/seahorses_rl.xlsx"
pipefishes_rl_file = "RL Algorithm/pipefishes_rl.xlsx"

# Write both encoded frames, without the row index
for rl_frame, rl_path in (
    (df_seahorses_rl, seahorses_rl_file),
    (df_pipefishes_rl, pipefishes_rl_file),
):
    rl_frame.to_excel(rl_path, index=False)

In [52]:
# First rows of the one-hot encoded seahorse data
df_seahorses_rl.head()

Unnamed: 0,Sex,Length (mm),Brain weight (mg),Species_Hippocampus comes,Species_Hippocampus hippocampus,Species_Hippocampus kuda,Species_Hippocampus spinosissimus,Species_Hippocampus trimaculatus,Preservative_Formalin,Location_Malaysia
0,m,135.0,12.6,False,False,False,False,False,True,False
1,f,184.5,15.8,False,False,False,False,False,True,False
2,f,146.7,10.1,False,False,False,False,False,True,False
3,f,156.6,20.2,False,False,False,False,False,True,False
4,m,151.0,10.9,False,False,False,False,False,True,False


In [53]:
# First rows of the one-hot encoded pipefish data
df_pipefishes_rl.head()

Unnamed: 0,Sex,Length (mm),Brain weight (mg),Species_Corythoichthys intestinalis,Species_Doriyichthys boaja,Species_Doriyichthys martensi,Species_Doryhampus dactyliophorus,Species_Doryrhamphus japonicus,Species_Entelurus aequoreus,Species_Hippichthys penicillus,...,Location_Marshall,Location_Micronesia,Location_Miyakejima (Japan),Location_Okinawa (Japan),Location_Otsuchi bay (Japan),Location_Portugal,Location_Surugawan (Japan),Location_Sweden,Location_Taiwan,Location_Tokushima (Japan)
0,f,112.6,3.27,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,f,115.0,3.97,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,f,121.4,4.83,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,f,122.4,4.09,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,f,128.0,4.29,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Data Visualisation of head_shape.xlsx

In [35]:
# Read the saved head shape workbook back into a DataFrame
head_shape_path = "Data Pre-Processing/head_shape.xlsx"
head_shape_df = pd.read_excel(io=head_shape_path)

In [48]:
# Number of rows in the head shape table
print(len(head_shape_df))

279


In [36]:
# Only the last expression of a cell is rendered, so a `head(5)` placed before
# this line would be evaluated and silently discarded.
# `head(-5)` returns every row except the last five.
head_shape_df.head(-5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,23L,24,0,DIM=2,,,,,,,,,,,,,,,
1,Chaematopterus,Cintestinalis,Dboaja,Ddactyliophorus,Djaponicus,Dmartensi,Eaequoreus,Habdominalis,Hcomes,Hhippocampus,Hkuda,Hpenicillus,Hspinosissimus,Htrimaculatus,Mbrachyurus,Nlumbriciformis,Nophidion,Sabaster,Sacus,Sbiaculeatus
2,Sschlegeli,Styphle,Tserratus,,,,,,,,,,,,,,,,,
3,-0.356713,0.055463,,,,,,,,,,,,,,,,,,
4,-0.359061,-0.013548,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,-0.279988,-0.043781,,,,,,,,,,,,,,,,,,
270,-0.283813,0.043850,,,,,,,,,,,,,,,,,,
271,-0.101853,0.054450,,,,,,,,,,,,,,,,,,
272,-0.062368,0.029681,,,,,,,,,,,,,,,,,,


INFERENCE: UNNECESSARY RIGHT NOW

# Data Visualisation of molecular_phylogeny.xlsx

In [37]:
# Read the saved molecular phylogeny workbook back into a DataFrame
molecular_phylogeny_path = "Data Pre-Processing/molecular_phylogeny.xlsx"
molecular_phylogeny_df = pd.read_excel(io=molecular_phylogeny_path)

In [46]:
# Number of rows in the phylogeny table
print(len(molecular_phylogeny_df))

30


In [47]:
# Show up to the first 30 rows of the phylogeny text
molecular_phylogeny_df.head(30)


Unnamed: 0,Phylogeny
0,#NEXUS
1,BEGIN TREES;
2,"Title 'Trees from ""lumbtree.nex""';"
3,LINK Taxa = Taxa;
4,TRANSLATE
5,"[0] \t\t1 Chaematopterus,"
6,"[1] \t\t2 Cintestinalis,"
7,"[2] \t\t3 Dboaja,"
8,"[3] \t\t4 Ddactyliophorus,"
9,"[4] \t\t5 Djaponicus,"


INFERENCE: UNNECESSARY RIGHT NOW