In [1]:
# Library imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import make_scorer, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product

2025-03-10 16:41:07.402814: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 16:41:07.411380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741624867.421361   98474 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741624867.424062   98474 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 16:41:07.434292: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Environment check: report the TensorFlow build and configure the GPU(s).
gpus = tf.config.list_physical_devices('GPU')

# Memory growth has to be requested before the GPU is initialized, so do it
# first. (The device-name query further down appears to create the GPU device:
# a "Created device" log line shows up in this cell's output.)
# Looping also covers machines with no GPU (empty list, nothing to do) and
# machines with several GPUs, where the setting must match on every device.
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPU was already initialized earlier in this kernel session.
        print("Could not enable memory growth:", err)

print("TensorFlow Version:", tf.__version__)
print("Is TensorFlow built with CUDA?", tf.test.is_built_with_cuda())
print("GPU Available:", gpus)
print("GPU Device Name:", tf.test.gpu_device_name())

TensorFlow Version: 2.18.0
Is TensorFlow built with CUDA? True
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU Device Name: /device:GPU:0


I0000 00:00:1741624869.540112   98474 gpu_device.cc:2022] Created device /device:GPU:0 with 9558 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:01:00.0, compute capability: 8.9


In [3]:
# Read the species metadata table into a DataFrame.
# ("EOL" in the column names stands for Encyclopedia of Life.)
dataset = pd.read_csv(
    filepath_or_buffer='rare_species 1/metadata.csv',
)

# Report the size of the table, then its column labels
print(
    "Dataset shape:",
    dataset.shape,
)
print(
    "Dataset columns:",
    dataset.columns,
)

Dataset shape: (11983, 7)
Dataset columns: Index(['rare_species_id', 'eol_content_id', 'eol_page_id', 'kingdom', 'phylum',
       'family', 'file_path'],
      dtype='object')


In [4]:
# Selecting the features and the target variable.
# Columns are picked by name instead of by position so the selection does not
# silently shift if the CSV's column layout changes.
# 'rare_species_id' and 'file_path' are not used as features; 'family' is the target.
# NOTE: the order of the feature list matters — a later cell refers to 'phylum'
# by its position in X (0: eol_content_id, 1: eol_page_id, 2: kingdom, 3: phylum).

X = dataset[['eol_content_id', 'eol_page_id', 'kingdom', 'phylum']].to_numpy()
y = dataset['family'].to_numpy()   # 'family' column as the target

In [5]:
# Count the distinct values in each candidate feature column (and in the
# 'family' target) to judge which columns are suitable for one-hot encoding.
# 'rare_species_id' is intentionally not inspected here.
for column in ('eol_content_id', 'eol_page_id', 'kingdom', 'phylum', 'family'):
    print(f"{column} unique values:", dataset[column].nunique())

eol_content_id unique values: 11983
eol_page_id unique values: 400
kingdom unique values: 1
phylum unique values: 5
family unique values: 202


In [6]:
# List the distinct phyla that occur in the metadata
print(
    "phylum unique values:",
    dataset.loc[:, 'phylum'].unique(),
)

phylum unique values: ['mollusca' 'chordata' 'arthropoda' 'echinodermata' 'cnidaria']


In [7]:
# List the distinct kingdoms that occur in the metadata
print(
    "kingdom unique values:",
    dataset.loc[:, 'kingdom'].unique(),
)

# Open question: is the 'kingdom' column needed at all? It holds a single
# value for every row, so it cannot help distinguish one family from another.

kingdom unique values: ['animalia']


In [None]:
# Peek at the first five feature rows to confirm which positional index
# each column ended up at
print(X[:5, :])

[[12853737 449393 'animalia' 'mollusca']
 [20969394 793083 'animalia' 'chordata']
 [28895411 319982 'animalia' 'chordata']
 [29658536 45510188 'animalia' 'chordata']
 [21252576 7250886 'animalia' 'chordata']]


In [9]:
# Transforming categorical data into numerical data.
# On entry X holds the four raw feature columns
# (0: eol_content_id, 1: eol_page_id, 2: kingdom, 3: phylum).

# This cell overwrites X with a transformed version of itself, so running it a
# second time would one-hot encode an already-transformed array. Fail loudly
# instead of silently corrupting X.
if X.shape[1] != 4:
    raise ValueError(
        "X does not have the 4 raw feature columns (it was probably already "
        "transformed); re-run the feature-selection cell before this one."
    )

ct = ColumnTransformer(
    # Only 'phylum' (index 3) is encoded; drop="first" omits the first category
    # so the encoded columns are not redundant with each other.
    [('one_hot_encoder', OneHotEncoder(drop="first"), [3])],
    remainder='passthrough',  # remaining columns are appended, unchanged, after the encoded ones
    sparse_threshold=0,       # always produce a dense result so np.array() below yields a 2-D array
)

X = np.array(ct.fit_transform(X))

# NOTE(review): the passthrough keeps the constant string column 'kingdom', so X
# is still an object array containing 'animalia'. Drop or encode that column
# before any numeric step (scaling, resampling, model fitting).

# Display the transformed feature set
print(X[:5])

[[0.0 0.0 0.0 1.0 12853737 449393 'animalia']
 [1.0 0.0 0.0 0.0 20969394 793083 'animalia']
 [1.0 0.0 0.0 0.0 28895411 319982 'animalia']
 [1.0 0.0 0.0 0.0 29658536 45510188 'animalia']
 [1.0 0.0 0.0 0.0 21252576 7250886 'animalia']]
