In [1]:
%pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [10]:
# Load the CSV at ../imputation/imputed_dataset.csv into `df`
# (presumably the output of an earlier imputation step — confirm against that notebook).
df = pd.read_csv("../imputation/imputed_dataset.csv")

# Convert numeric columns stored as `object` to a numeric dtype

In [11]:
# Show the dtype of every column to spot numeric fields that were read as `object`.
df.dtypes

fish_id                      object
species                      object
common_name                  object
kingdom                      object
phylum                       object
class                        object
order                        object
family                       object
genus                        object
status                       object
feeding_type                 object
temp_max                     object
weight_max                   object
length_max                   object
temp_pref_min                object
temp_pref_max                object
temp_range_min              float64
temp_range_max              float64
trophic_lvl_estimate_min    float64
trophic_lvl_estimate_max    float64
trophic_lvl                  object
fecundity_mean               object
fecundity_min               float64
fecundity_max               float64
waterbody_name               object
wb_ph_min                   float64
wb_ph_max                   float64
wb_salinity_min             

In [12]:
# Measurement columns that were loaded with dtype `object`.
num_cols = [
    "temp_max",
    "weight_max",
    "length_max",
    "temp_pref_min",
    "temp_pref_max",
]

# Convert all of them column-wise in one step; errors="coerce" replaces any
# value that cannot be parsed as a number with NaN instead of raising.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

In [13]:
# Re-inspect the dtypes to see the effect of the pd.to_numeric conversion.
df.dtypes

fish_id                      object
species                      object
common_name                  object
kingdom                      object
phylum                       object
class                        object
order                        object
family                       object
genus                        object
status                       object
feeding_type                 object
temp_max                    float64
weight_max                  float64
length_max                  float64
temp_pref_min               float64
temp_pref_max               float64
temp_range_min              float64
temp_range_max              float64
trophic_lvl_estimate_min    float64
trophic_lvl_estimate_max    float64
trophic_lvl                  object
fecundity_mean               object
fecundity_min               float64
fecundity_max               float64
waterbody_name               object
wb_ph_min                   float64
wb_ph_max                   float64
wb_salinity_min             

In [14]:
# Remaining columns still stored as `object` that should be numeric.
num_cols = ["trophic_lvl", "fecundity_mean"]

# Column-wise conversion; errors="coerce" turns unparseable values into NaN.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

In [15]:
# Final dtype check after converting trophic_lvl and fecundity_mean.
df.dtypes

fish_id                      object
species                      object
common_name                  object
kingdom                      object
phylum                       object
class                        object
order                        object
family                       object
genus                        object
status                       object
feeding_type                 object
temp_max                    float64
weight_max                  float64
length_max                  float64
temp_pref_min               float64
temp_pref_max               float64
temp_range_min              float64
temp_range_max              float64
trophic_lvl_estimate_min    float64
trophic_lvl_estimate_max    float64
trophic_lvl                 float64
fecundity_mean              float64
fecundity_min               float64
fecundity_max               float64
waterbody_name               object
wb_ph_min                   float64
wb_ph_max                   float64
wb_salinity_min             

In [16]:
# Persist the dtype-corrected frame to encoded.csv in the working directory;
# index=False keeps the row index out of the file.
df.to_csv("encoded.csv", index=False)

# Risk mapping

In [18]:
# Reload the dtype-corrected dataset from encoded.csv; this rebinds `df`.
df = pd.read_csv("encoded.csv")

In [19]:
# Identifier-like columns that carry no modeling signal. They are not listed in
# any transformer below, so remainder='drop' removes them from the output.
drop_cols = ['fish_id', 'common_name']

# High-cardinality categoricals (many unique values) -> target encoding.
high_cardinality = ['species', 'waterbody_name']

# Low-cardinality categoricals (few unique categories) -> one-hot encoding.
# 'status' is intentionally NOT a feature: the target `invasion_risk_score` is
# computed directly from `status`, so encoding it here would leak the target
# into the inputs. It is dropped by remainder='drop' like the identifier columns.
low_cardinality = [
    'kingdom', 'phylum', 'class', 'order',
    'family', 'genus', 'feeding_type'
]

# Numeric features.
numeric_cols = [
    'temp_max', 'weight_max', 'length_max',
    'temp_pref_min', 'temp_pref_max',
    'temp_range_min', 'temp_range_max',
    'trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl',
    'fecundity_mean', 'fecundity_min', 'fecundity_max',
    'wb_ph_min', 'wb_ph_max',
    'wb_salinity_min', 'wb_salinity_max',
    'wb_do_min', 'wb_do_max',
    'wb_bod_min', 'wb_bod_max',
    'wb_turbidity_min', 'wb_turbidity_max',
    'wb_temp_min', 'wb_temp_max'
]

# Pipeline, ColumnTransformer, SimpleImputer and OneHotEncoder are already
# imported in the notebook's import cell; only the `ce` alias is bound here.
import category_encoders as ce

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

# Low-cardinality categoricals: fill with the most frequent category, then
# one-hot encode. handle_unknown='ignore' encodes categories unseen during fit
# as all zeros instead of raising at transform time.
low_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# High-cardinality categoricals: fill with the most frequent category, then
# target-encode. The target encoder needs `y` when fitted.
high_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder())
])

# Final preprocessor: each column group goes through its own transformer;
# every column not listed above is discarded.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('low_cat', low_card_transformer, low_cardinality),
    ('high_cat', high_card_transformer, high_cardinality)
], remainder='drop')

In [21]:
# Frequency of each status label, most common first.
print(df["status"].value_counts())

status
established           1456
invasive              1410
reported               800
EN                     787
other                  405
failed                 381
CR                     249
LC                      50
extirpated              39
Locally Threatened       6
Name: count, dtype: int64


In [22]:
# Numeric invasion-risk score for every status label, ordered from lowest to
# highest risk. Keys are matched case-sensitively against the raw labels in
# df['status'] (e.g. 'CR', 'EN', 'Locally Threatened'), so the labels must not
# be case-normalized before mapping.
risk_mapping = {
    'failed': 0.1,              # Failed to establish (lowest risk)
    'extirpated': 0.15,         # Was established but eliminated (slight residual risk of reintroduction)
    'CR': 0.2,                  # Critically endangered (very unlikely to become invasive)
    'reported': 0.25,           # Introduction reported, outcome unknown
    'EN': 0.3,                  # Endangered (unlikely to become invasive)
    'Locally Threatened': 0.35, # Threatened by invasives, unlikely to be invasive themselves
    'LC': 0.4,                  # Least concern (neutral invasion potential)
    'other': 0.5,               # Unknown status (moderate uncertainty)
    'established': 0.7,         # Successfully established populations (high potential for impact)
    'invasive': 0.9             # Confirmed negative impacts (highest risk)
}

# Series.map returns NaN for any label that is not a key of the mapping, which
# would silently create rows without a target. Fail loudly instead.
unmapped_statuses = sorted(set(df['status'].dropna()) - set(risk_mapping))
assert not unmapped_statuses, (
    f"status values without a risk score: {unmapped_statuses}"
)

# Create numeric target
df['invasion_risk_score'] = df['status'].map(risk_mapping)

# Quick check. dropna=False keeps rows with a missing status/score in the
# counts; the default would drop them and hide exactly the rows worth seeing.
df[['status', 'invasion_risk_score']].value_counts(dropna=False)

status              invasion_risk_score
established         0.70                   1456
invasive            0.90                   1410
reported            0.25                    800
EN                  0.30                    787
other               0.50                    405
failed              0.10                    381
CR                  0.20                    249
LC                  0.40                     50
extirpated          0.15                     39
Locally Threatened  0.35                      6
Name: count, dtype: int64

In [23]:
# Write the frame, now including invasion_risk_score, to super_dataset.csv;
# index=False keeps the row index out of the file.
df.to_csv("super_dataset.csv", index=False)