# Encode categories

In [6]:
%pip install category_encoders

import pandas as pd
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   -------- ------------------------------- 2.1/9.6 MB 10.0 MB/s eta 0:00:01
   ---------------------------------------  9.4/9.6 MB 23.7 MB/s eta 0:00:01
   ---------------------------------------- 9.6/9.6 MB 19.3 MB/s  0:00:00
Installing collected packages: patsy, statsmodels, category_encoders

   ---------------------------------------- 0/3 [patsy]
   ---------------------------------------- 0/3 

In [12]:
df = pd.read_csv("standardized_dataset.csv")

print(df.columns.tolist())


['fish_id', 'species', 'common_name', 'waterbody_name', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'status', 'feeding_type', 'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max', 'temp_range', 'fecundity_mean', 'fecundity_min', 'fecundity_max', 'trophic_level_estimate', 'trophic_level', 'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max', 'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max', 'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max']


# Rename status values

In [23]:
print(df['status'].nunique())
print(df['status'].value_counts())

df["status"] = df["status"].replace({
    "Reported/Native": "Reported",
    "Native, threatened": "Reported",
    "Native/Amphidromous": "Reported",
    "Native/Riverine": "Reported",
    "Endemic/Established": "Established",
    "Recorded": "Reported",
    "LC": "EN"
})


6
status
Invasive       889
Established    666
Reported       336
EN             232
Failed         201
Extirpated      18
Name: count, dtype: int64


In [35]:
print(df.columns.tolist())

['fish_id', 'species', 'common_name', 'waterbody_name', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'status', 'feeding_type', 'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max', 'fecundity_mean', 'fecundity_min', 'fecundity_max', 'trophic_level_estimate', 'trophic_level', 'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max', 'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max', 'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max', 'temp_range_min', 'temp_range_max']


# Drop temp_range_min/max columns

In [36]:
df = df.drop(columns=["temp_range_min", "temp_range_max"])

In [37]:
print(df.columns.tolist())

['fish_id', 'species', 'common_name', 'waterbody_name', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'status', 'feeding_type', 'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max', 'fecundity_mean', 'fecundity_min', 'fecundity_max', 'trophic_level_estimate', 'trophic_level', 'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max', 'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max', 'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max']


# Check columns before encoding

In [40]:
df.dtypes

fish_id                    object
species                    object
common_name                object
waterbody_name             object
kingdom                    object
phylum                     object
class                      object
order                      object
family                     object
genus                      object
status                     object
feeding_type               object
temp_max                   object
weight_max                 object
length_max                 object
temp_pref_min              object
temp_pref_max              object
fecundity_mean             object
fecundity_min             float64
fecundity_max             float64
trophic_level_estimate     object
trophic_level             float64
wb_ph_min                 float64
wb_ph_max                 float64
wb_salinity_min           float64
wb_salinity_max           float64
wb_do_min                 float64
wb_do_max                 float64
wb_bod_min                float64
wb_bod_max    

In [39]:
for col in ['temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max', 'fecundity_mean']:
    print(f"\nUnique sample values in {col}:")
    print(df[col].dropna().unique()[:20])  # show first 20 unique non-null values


Unique sample values in temp_max:
['5.15' '4.714286' '3.666667' '3' '3.617647' '5.666667' '4.25' '3.5' '6'
 '5.75' '8' '4.5' '11' '8.75' '12' '9' '7' '11.83333' '13' '7.8']

Unique sample values in weight_max:
['0.328' '0.5' '0.629' '1.003' '5' '5.06' '5.675' '6.3' '6.4' '8.06'
 '8.41' '10.52' '11.9' '13.2' '14.91' '15' '19.10667' '22' '22.58' '22.89']

Unique sample values in length_max:
['3.489' '3.8' '4.9' '3.572' '2.8' '11.25' '6.4' '8.285' '10.41667' '5.6'
 '11.1' '8.3' '7.29' '6.2' '17.44' '10.6' '11.2' '16.91667' '10.5' '11.9']

Unique sample values in temp_pref_min:
['25' '23' '18' '22' '14' '2' '20' '17' '24' '15' '5' '10' '11' '4' '27'
 '21' '6' '26' '0' '1']

Unique sample values in temp_pref_max:
['29' '28' '22' '26' '24' '18' '23' '30' '25' '20' '37' '34' '15' '35'
 '27' '36' '33' '40' '43' '14']

Unique sample values in fecundity_mean:
['175' '6390' '85' '450' '250' '1443.25' '30864.3' '2976.5' '94527.13'
 '2016' '650' '11774.75' '70958.67' '15917' '7285.75' '700' '11950

In [41]:
num_cols = [
    "temp_max", "weight_max", "length_max",
    "temp_pref_min", "temp_pref_max", "fecundity_mean", "trophic_level_estimate"
]

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [42]:
df.dtypes

fish_id                    object
species                    object
common_name                object
waterbody_name             object
kingdom                    object
phylum                     object
class                      object
order                      object
family                     object
genus                      object
status                     object
feeding_type               object
temp_max                  float64
weight_max                float64
length_max                float64
temp_pref_min             float64
temp_pref_max             float64
fecundity_mean            float64
fecundity_min             float64
fecundity_max             float64
trophic_level_estimate    float64
trophic_level             float64
wb_ph_min                 float64
wb_ph_max                 float64
wb_salinity_min           float64
wb_salinity_max           float64
wb_do_min                 float64
wb_do_max                 float64
wb_bod_min                float64
wb_bod_max    

In [44]:
df.select_dtypes(include=['float64']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   temp_max                1008 non-null   float64
 1   weight_max              1380 non-null   float64
 2   length_max              1724 non-null   float64
 3   temp_pref_min           1874 non-null   float64
 4   temp_pref_max           1886 non-null   float64
 5   fecundity_mean          1165 non-null   float64
 6   fecundity_min           2342 non-null   float64
 7   fecundity_max           2342 non-null   float64
 8   trophic_level_estimate  349 non-null    float64
 9   trophic_level           1813 non-null   float64
 10  wb_ph_min               2342 non-null   float64
 11  wb_ph_max               2342 non-null   float64
 12  wb_salinity_min         2342 non-null   float64
 13  wb_salinity_max         2342 non-null   float64
 14  wb_do_min               2342 non-null   

In [45]:
categorical_cols = [
    "species", "waterbody_name",
    "kingdom", "phylum", "class", "order", "family", "genus",
    "status", "feeding_type"
]

for col in categorical_cols:
    print(f"\n{col} ({df[col].nunique()} unique):")
    print(df[col].value_counts(dropna=False).head(10))


species (642 unique):
species
Leuciscus leuciscus       18
Morone americana          17
Salvelinus fontinalis     16
Mastacembelus armatus     15
Prosopium cylindraceum    15
Hypsibarbus pierrei       13
Thymallus thymallus       13
Lates mariae              12
Ameiurus nebulosus        11
Salvelinus malma          11
Name: count, dtype: int64

waterbody_name (1888 unique):
waterbody_name
Laguna de Bay                 36
Lake Erie, USA                18
Lake Michigan, USA            16
Taal Lake                     14
Lake Ontario, Canada          13
Laguna River                  13
Laguna de Bay, Philippines    13
Lake Balaton, Hungary         11
Lake Ontario                   8
Lake Malawi, Malawi            8
Name: count, dtype: int64

kingdom (1 unique):
kingdom
Animalia    2340
NaN            2
Name: count, dtype: int64

phylum (1 unique):
phylum
Chordata    2340
NaN            2
Name: count, dtype: int64

class (1 unique):
class
Actinopterygii    2340
NaN                  2
Name

In [47]:
# Find suspicious waterbody_name entries (non-lake/river)
print(df["waterbody_name"].value_counts().tail(50))

waterbody_name
Ganga River, Farrukhabad                                                                     1
Ganga River, Prayagraj (Allahabad)                                                           1
Rio Negro                                                                                    1
Solimões / Middle Amazon (Mamirauá)                                                          1
Palumeu River (upper Palumeu watershed, southeastern Suriname)                               1
Sipaliwini & Kutari rivers (Kwamalasamutu region, SW Suriname)                               1
Oyapock / Oiapoque estuary (lower Oyapock / coastal rivers between French Guiana & Amapá)    1
Ganga River, Varanasi / Prayagraj                                                            1
Padma River, Bheramara (Kushtia)                                                             1
Brahmaputra River, Guwahati / Kamrup stretches                                               1
Korotoa River (Bogura)             

In [51]:
import re

def clean_waterbody_name(name):
    # Remove everything inside parentheses
    name = re.sub(r"\(.*?\)", "", name)
    # Keep only the first part before "/" if it looks like multiple names
    name = name.split("/")[0]
    # Remove extra descriptions like "basin", "drainages", "coastal rivers"
    name = re.sub(r"\b(basin|drainages|watershed|region|stretches|estuary|coastal|coast)\b.*", "", name, flags=re.IGNORECASE)
    # Strip whitespace
    return name.strip()

In [50]:
# Find suspicious waterbody_name entries (non-lake/river)
print(df["waterbody_name"].value_counts().tail(50))

waterbody_name
Ganga River, Farrukhabad                                                                     1
Ganga River, Prayagraj (Allahabad)                                                           1
Rio Negro                                                                                    1
Solimões / Middle Amazon (Mamirauá)                                                          1
Palumeu River (upper Palumeu watershed, southeastern Suriname)                               1
Sipaliwini & Kutari rivers (Kwamalasamutu region, SW Suriname)                               1
Oyapock / Oiapoque estuary (lower Oyapock / coastal rivers between French Guiana & Amapá)    1
Ganga River, Varanasi / Prayagraj                                                            1
Padma River, Bheramara (Kushtia)                                                             1
Brahmaputra River, Guwahati / Kamrup stretches                                               1
Korotoa River (Bogura)             

In [52]:
rename_wb = {
    "Ganga River, Farrukhabad": "Ganga River, Farrukhabad",
    "Ganga River, Prayagraj (Allahabad)": "Ganga River, Prayagraj",
    "Rio Negro": "Rio Negro",
    "Solimões / Middle Amazon (Mamirauá)": "Solimões River",
    "Palumeu River (upper Palumeu watershed, southeastern Suriname)": "Palumeu River",
    "Sipaliwini & Kutari rivers (Kwamalasamutu region, SW Suriname)": "Sipaliwini & Kutari Rivers",
    "Oyapock / Oiapoque estuary (lower Oyapock / coastal rivers between French Guiana & Amapá)": "Oyapock River",
    "Ganga River, Varanasi / Prayagraj": "Ganga River, Varanasi",
    "Padma River, Bheramara (Kushtia)": "Padma River, Bheramara",
    "Brahmaputra River, Guwahati / Kamrup stretches": "Brahmaputra River",
    "Korotoa River (Bogura)": "Korotoa River",
    "McCauley Spring (Sandoval Co.)": "McCauley Spring",
    "Kallar River": "Kallar River",
    "Dibru River (Dibru-Saikhowa region)": "Dibru River",
    "Stanley Reservoir": "Stanley Reservoir",
    "Upstream Bangpakong River": "Bangpakong River",
    "Maeklong watershed": "Maeklong River",
    "Chao Phraya": "Chao Phraya River",
    "Cold streams and lakes in North America": "Cold Streams & Lakes (N. America)",
    "Rio Usumacinta / Atlantic slope rivers of southern Mexico & Guatemala": "Rio Usumacinta",
    "Central American Atlantic & Pacific coastal drainages": "Central American Rivers",
    "Columbia River basin (USA)": "Columbia River Basin",
    "Paint Rock River, USA": "Paint Rock River",
    "Little Paint Rock Creek, USA": "Little Paint Rock Creek",
    "Sepik River, Papua New Guinea": "Sepik River",
    "Lufira River, Congo basin (DRC)": "Lufira River",
    "Drysdale River, Kimberley (Australia)": "Drysdale River"
}

df["waterbody_name"] = df["waterbody_name"].replace(rename_wb)


In [53]:
# Find suspicious waterbody_name entries (non-lake/river)
print(df["waterbody_name"].value_counts().tail(50))

waterbody_name
Lower Paraná River                                        1
Lake St. Clair, Ontario                                   1
Patos Lagoon                                              1
Lake Chapala                                              1
Lerma River                                               1
Chalan Beel (Bangladesh)                                  1
Mekong River                                              1
Irrawaddy (Ayeyarwady) River                              1
Salween (Thanlwin / Salouen) River                        1
Mae Khlong River                                          1
Sumatra rivers                                            1
Typical stream, Hainan Island                             1
Lake Havasu, USA                                          1
Chilika Lake, Rambha                                      1
Cempaka                                                   1
Thai koi / culture pond                                   1
Osprey Marsh (Ontario, Ca

In [77]:
rename_wb = {
    "Lake St. Clair, Ontario": "Lake St. Clair",
    "Chalan Beel (Bangladesh)": "Chalan Beel",
    "Irrawaddy (Ayeyarwady) River": "Irrawaddy River",
    "Salween (Thanlwin / Salouen) River": "Salween River",
    "Mae Khlong River": "Maeklong River",
    "Typical stream, Hainan Island": "Hainan Stream",
    "Lake Havasu, USA": "Lake Havasu",
    "Chilika Lake, Rambha": "Chilika Lake",
    "Thai koi / culture pond": "Thai Culture Pond",
    "Osprey Marsh (Ontario, Canada)": "Osprey Marsh",
    "Ganga River, Narora": "Ganga River",
    "Cold Streams & Lakes (N. America)": "Cold Streams & Lakes",
    "Central American Rivers": "Central American Drainages",
    "Columbia River Basin": "Columbia River",
    "Little Paint Rock Creek": "Paint Rock Creek",
    "Ganga River, Prayagraj": "Ganga River",
    "Lake Victoria (Uganda/Kenya/Tanzania)": "Lake Victoria",
    "Salween (Thanlwin / Salouen) River ": "Salween River",
    "Hudson River estuary (NY)": "Hudson River",
    "Kali River / Sharavathi (Western Ghats, Karnataka)": "Kali River",
    "Irrawaddy (Ayeyarwady) River": "Irrawaddy River",
    "Canterbury/Otago rivers (New Zealand)": "Canterbury Otago Rivers",
    "(Omani garra)\r\nWadi Bani Khalid pools, Oman": "Wadi Bani Khalid Pools",
    "Lower Zambezi and lower parts of Sabi, Lundi, Pungwe, Buzi Rivers": "Lower Zambezi River",
    "Small rivers/creeks on Atlantic slope Central America": "Small Rivers (Central America)",
    "Lake Neusiedl / Neusiedler See": "Lake Neusiedl",
    "Lake Chapala (Lerma Chapala basin, Mexico)": "Lake Chapala",
    "Irrawaddy (Ayeyarwady) River ": "Irrawaddy River",
    "Thai koi / culture pond": "Thai Culture Pond",
    "Lake Victoria, Mwanza Gulf (Tanzania)": "Lake Victoria, Mwanza",
    "Chalakudy River / Periyar River (Western Ghats, India)": "Chalakudy River",
    "Gulf of Nicoya estuary (Costa Rica)": "Gulf of Nicoya Estuary",
    "Snowy River headwaters, SE Australia": "Snowy River Headwaters",
    "Derwent headwaters, Tasmania (Australia)": "Derwent Headwaters",
    "Canterbury Otago Rivers": "Canterbury Otago Rivers",
    "Lake Manyara (and adjacent streams), Tanzania": "Lake Manyara"

    
}

df["waterbody_name"] = df["waterbody_name"].replace(rename_wb)

# Find suspicious waterbody_name entries (non-lake/river)
print(df["waterbody_name"].value_counts().tail(50))


waterbody_name
Lake Bracciano (Italy)                                   1
Kali River                                               1
Suriname River Kabel Station                             1
Chilika Lake                                             1
Río de la Plata estuary                                  1
Lower Paraná River                                       1
Cempaka                                                  1
Patos Lagoon                                             1
Thai koi / culture pond                                  1
Lerma River                                              1
Chalan Beel                                              1
Mekong River                                             1
Irrawaddy River                                          1
Salween River                                            1
Mae Khlong River                                         1
Sumatra rivers                                           1
Typical stream, Hainan Island            

In [78]:
print(df["species"].nunique(), "unique species")
print(df["waterbody_name"].nunique(), "unique waterbodies")

642 unique species
1880 unique waterbodies


In [80]:
df.isnull().sum().sort_values(ascending=False).head(15).to_list()

[1993, 1334, 1177, 962, 618, 529, 468, 456, 446, 53, 3, 2, 2, 2, 2]

In [82]:
df['trophic_level'] = df['trophic_level'].fillna(df['trophic_level_estimate'])
df['trophic_level'].value_counts().sort_index()


trophic_level
2.00    96
2.11     2
2.13     4
2.16     1
2.18     4
        ..
4.42     4
4.45     6
4.47     5
4.47     5
4.50    44
Name: count, Length: 205, dtype: int64

In [90]:
print(df["feeding_type"].value_counts().tail(100))

feeding_type
predator                                         1060
variable                                          274
Omnivore                                          160
grazer                                            142
selective plankton feeding                         56
Carnivore                                          41
browser                                            27
Insectivore                                        23
filtering plankton                                 13
other                                              10
Herbivore                                           8
Benthic insectivore                                 8
scavenger                                           7
Omnivores                                           6
small omnivore/insectivore                          5
sucking food-containing material                    4
Benthic insectivore/omnivore                        3
Omnivore / insectivore                              3
Small insectivo

In [91]:
df['feeding_type'] = df['feeding_type'].str.lower().str.strip()

In [92]:
feeding_map = {
    'omnivore': 'omnivore',
    'omnivores': 'omnivore',
    'omnivore / insectivore': 'omnivore',
    'small omnivore/insectivore': 'omnivore',
    'omnivore (small inverts)': 'omnivore',
    'herbivore': 'herbivore',
    'herbvore/detritivore': 'herbivore',
    'predator': 'predator',
    'predator/opportunistic carnivore': 'predator',
    'carnivore': 'carnivore',
    'grazer': 'grazer',
    'browser': 'browser',
    'selective plankton feeding': 'planktivore',
    'filtering plankton': 'planktivore',
    'benthic insectivore': 'benthic_insectivore',
    'benthic insectivore/omnivore': 'benthic_insectivore',
    'insectivore / surface feeder': 'benthic_insectivore',
    'small insectivore/omnivore': 'benthic_insectivore',
    'detritivore': 'detritivore',
    'sucking food-containing material': 'other',
    'other': 'other',
    'benthic forager': 'other',
    'planktivore / piscivore': 'planktivore'
}

df['feeding_type'] = df['feeding_type'].replace(feeding_map)

In [93]:
print(df["feeding_type"].value_counts().tail(100))

feeding_type
predator                                         1062
variable                                          274
omnivore                                          175
grazer                                            142
planktivore                                        70
carnivore                                          41
browser                                            27
insectivore                                        23
benthic_insectivore                                17
other                                              15
herbivore                                          10
scavenger                                           7
detritivore                                         2
insectivore / surface insect feeder                 1
omnivore / herbivore tendencies (tilapias)          1
omnivore/insectivore                                1
benthic insectivore / small-fish predator           1
omnivore / benthic feeder                           1
parasite       

In [98]:
main_types = [
    'predator', 'variable', 'omnivore', 'grazer', 'planktivore',
    'carnivore', 'browser', 'insectivore', 'benthic_insectivore',
    'herbivore', 'scavenger', 'detritivore', 'other'
]

df['feeding_type_clean'] = df['feeding_type'].apply(
    lambda x: x if x in main_types else 'other'
)

# Counts per category
print(df['feeding_type_clean'].value_counts())

# Total number of rows
print("Total rows:", df['feeding_type_clean'].shape[0])


feeding_type_clean
predator               1062
other                   492
variable                274
omnivore                175
grazer                  142
planktivore              70
carnivore                41
browser                  27
insectivore              23
benthic_insectivore      17
herbivore                10
scavenger                 7
detritivore               2
Name: count, dtype: int64
Total rows: 2342


In [None]:
df.to_csv("pre_encoded_final_final.csv", index=False)

# Encode categories

In [87]:
df.dtypes
#

fish_id                    object
species                    object
common_name                object
waterbody_name             object
kingdom                    object
phylum                     object
class                      object
order                      object
family                     object
genus                      object
status                     object
feeding_type               object
temp_max                  float64
weight_max                float64
length_max                float64
temp_pref_min             float64
temp_pref_max             float64
fecundity_mean            float64
fecundity_min             float64
fecundity_max             float64
trophic_level_estimate    float64
trophic_level             float64
wb_ph_min                 float64
wb_ph_max                 float64
wb_salinity_min           float64
wb_salinity_max           float64
wb_do_min                 float64
wb_do_max                 float64
wb_bod_min                float64
wb_bod_max    