# Final Pre-Processing Step

In [107]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

In [108]:
# Load the pre-encoded dataset produced by the earlier preprocessing steps
df = pd.read_csv("pre_encoded_final_final_final_final.csv")

# Class balance of the feeding_type categorical (plain-text printout)
print(df["feeding_type"].value_counts())

# Preview the first rows as the cell's rich output
df.head()

feeding_type
predator               1062
other                   492
variable                274
omnivore                175
grazer                  142
planktivore              70
carnivore                41
browser                  27
insectivore              23
benthic_insectivore      17
herbivore                10
scavenger                 7
detritivore               2
Name: count, dtype: int64


Unnamed: 0,fish_id,species,common_name,kingdom,phylum,class,order,family,genus,status,...,wb_salinity_min,wb_salinity_max,wb_do_min,wb_do_max,wb_bod_min,wb_bod_max,wb_turbidity_min,wb_turbidity_max,wb_temp_min,wb_temp_max
0,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,0.0,5.5,5.5,2.0,2.0,4.0,4.0,27.0,27.0
1,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,0.0,6.0,6.0,3.0,3.0,6.0,6.0,27.0,27.0
2,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,0.0,5.0,5.0,2.0,2.0,3.0,3.0,26.5,26.5
3,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,0.0,5.2,5.2,2.0,2.0,4.0,4.0,27.0,27.0
4,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,0.0,4.5,4.5,4.0,4.0,20.0,20.0,26.0,26.0


In [109]:
# Distribution of the introduction status labels
print(df["status"].value_counts())

status
Invasive       889
Established    666
Reported       336
EN             232
Failed         201
Extirpated      18
Name: count, dtype: int64


In [110]:
# Column groups consumed by the preprocessing ColumnTransformer defined below.

# Columns to drop (not used for modeling)
drop_cols = ['fish_id', 'common_name']

# High-cardinality categorical
high_cardinality = ['species', 'waterbody_name']

# Low-cardinality categorical
# NOTE: 'status' is intentionally NOT listed here. The regression target
# `invasion_risk_score` is computed directly from it
# (df['status'].map(risk_mapping)), so feeding 'status' to the model as a
# feature would leak the label and make any evaluation meaningless.
low_cardinality = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'feeding_type']

# Numeric features
numeric_cols = [
    # species-level traits
    'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max',
    'fecundity_mean', 'fecundity_min', 'fecundity_max',
    'trophic_level_estimate', 'trophic_level',
    # waterbody ("wb_") measurements, each as a min/max pair
    'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max',
    'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max',
    'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max'
]

# Import necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# Per-group preprocessing pipelines

# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Low-cardinality categoricals: fill with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform() from raising on categories
# that were not present at fit time
low_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# High-cardinality categoricals: fill with the most frequent value, then
# target-encode (a target vector must be supplied when fitting)
high_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder()),
])

# Route each column group to its pipeline; columns not named in any group
# are discarded (remainder='drop')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('low_cat', low_card_transformer, low_cardinality),
        ('high_cat', high_card_transformer, high_cardinality),
    ],
    remainder='drop',
)

In [111]:
# Define numeric invasion risk mapping (status label -> risk score in [0, 1])
risk_mapping = {
    'Failed': 0.1,        # Attempted introduction but failed to establish
    'Reported': 0.2,      # Reported introduction, survival unknown
    'EN': 0.3,            # Endangered - unlikely to become invasive
    'Established': 0.5,   # Successfully established, potential for impact
    'Invasive': 0.9,      # Confirmed negative ecological/economic impact
    'Extirpated': 0.0     # No longer present, zero risk
}

# Create numeric target
df['invasion_risk_score'] = df['status'].map(risk_mapping)

# Series.map yields NaN for any status that is not a key of risk_mapping
# (e.g. a casing variant such as 'En'), and the value_counts() below drops
# NaN rows, so such gaps would otherwise go unnoticed. Fail loudly instead.
unmapped_statuses = df.loc[df['invasion_risk_score'].isna(), 'status'].unique()
if len(unmapped_statuses) > 0:
    raise ValueError(
        f"Statuses missing from risk_mapping: {sorted(map(str, unmapped_statuses))}"
    )

# Quick check
df[['status', 'invasion_risk_score']].value_counts()


status       invasion_risk_score
Invasive     0.9                    889
Established  0.5                    666
Reported     0.2                    336
EN           0.3                    232
Failed       0.1                    201
Extirpated   0.0                     18
Name: count, dtype: int64

In [121]:
# Persist the frame (including the new target column) without the row index
df.to_csv(path_or_buf="pinakafinal_final.csv", index=False)

# IGNORE CELLS BELOW

In [113]:
# Target vector: the numeric invasion risk score
y = df.loc[:, 'invasion_risk_score']

In [114]:
# Flag rows whose target is missing, then report how many there are
target_is_missing = df["invasion_risk_score"].isna()
print(target_is_missing.sum())

# Same figure expressed as a percentage of all rows
print(target_is_missing.mean() * 100, "%")

# Preview the affected rows (an empty frame when nothing is missing)
df[target_is_missing].head()

0
0.0 %


Unnamed: 0,fish_id,species,common_name,kingdom,phylum,class,order,family,genus,status,...,wb_salinity_max,wb_do_min,wb_do_max,wb_bod_min,wb_bod_max,wb_turbidity_min,wb_turbidity_max,wb_temp_min,wb_temp_max,invasion_risk_score


In [115]:
# Flag rows whose target is missing, then report how many there are
target_is_missing = df["invasion_risk_score"].isna()
print(target_is_missing.sum())

# Same figure expressed as a percentage of all rows
print(target_is_missing.mean() * 100, "%")

# Preview the affected rows (an empty frame when nothing is missing)
df[target_is_missing].head()

0
0.0 %


Unnamed: 0,fish_id,species,common_name,kingdom,phylum,class,order,family,genus,status,...,wb_salinity_max,wb_do_min,wb_do_max,wb_bod_min,wb_bod_max,wb_turbidity_min,wb_turbidity_max,wb_temp_min,wb_temp_max,invasion_risk_score


In [116]:
# Normalize the casing variant 'En' to the canonical 'EN' label
df['status'] = df['status'].replace('En', 'EN')

# Confirm the set of distinct labels after the replacement
print(df['status'].unique())

['Established' 'Reported' 'Invasive' 'Failed' 'Extirpated' 'EN']


In [117]:
# Define numeric invasion risk mapping (status label -> risk score in [0, 1])
risk_mapping = {
    'Failed': 0.1,        # Attempted introduction but failed to establish
    'Reported': 0.2,      # Reported introduction, survival unknown
    'EN': 0.3,            # Endangered - unlikely to become invasive
    'Established': 0.5,   # Successfully established, potential for impact
    'Invasive': 0.9,      # Confirmed negative ecological/economic impact
    'Extirpated': 0.0     # No longer present, zero risk
}

# Create numeric target
df['invasion_risk_score'] = df['status'].map(risk_mapping)

# Series.map yields NaN for any status that is not a key of risk_mapping
# (e.g. a casing variant such as 'En'), and the value_counts() below drops
# NaN rows, so such gaps would otherwise go unnoticed. Fail loudly instead.
unmapped_statuses = df.loc[df['invasion_risk_score'].isna(), 'status'].unique()
if len(unmapped_statuses) > 0:
    raise ValueError(
        f"Statuses missing from risk_mapping: {sorted(map(str, unmapped_statuses))}"
    )

# Quick check
df[['status', 'invasion_risk_score']].value_counts()


status       invasion_risk_score
Invasive     0.9                    889
Established  0.5                    666
Reported     0.2                    336
EN           0.3                    232
Failed       0.1                    201
Extirpated   0.0                     18
Name: count, dtype: int64

In [118]:
# Flag rows whose target is missing, then report how many there are
target_is_missing = df["invasion_risk_score"].isna()
print(target_is_missing.sum())

# Same figure expressed as a percentage of all rows
print(target_is_missing.mean() * 100, "%")

# Preview the affected rows (an empty frame when nothing is missing)
df[target_is_missing].head()

0
0.0 %


Unnamed: 0,fish_id,species,common_name,kingdom,phylum,class,order,family,genus,status,...,wb_salinity_max,wb_do_min,wb_do_max,wb_bod_min,wb_bod_max,wb_turbidity_min,wb_turbidity_max,wb_temp_min,wb_temp_max,invasion_risk_score


In [119]:
# Preview the first five rows, now including invasion_risk_score
df.head(n=5)

Unnamed: 0,fish_id,species,common_name,kingdom,phylum,class,order,family,genus,status,...,wb_salinity_max,wb_do_min,wb_do_max,wb_bod_min,wb_bod_max,wb_turbidity_min,wb_turbidity_max,wb_temp_min,wb_temp_max,invasion_risk_score
0,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,5.5,5.5,2.0,2.0,4.0,4.0,27.0,27.0,0.5
1,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,6.0,6.0,3.0,3.0,6.0,6.0,27.0,27.0,0.5
2,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,5.0,5.0,2.0,2.0,3.0,3.0,26.5,26.5,0.5
3,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,5.2,5.2,2.0,2.0,4.0,4.0,27.0,27.0,0.5
4,F0001,Copella arnoldi,Splash tetra,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,Established,...,0.0,4.5,4.5,4.0,4.0,20.0,20.0,26.0,26.0,0.5


In [120]:
# Write the processed frame to disk without the row index
df.to_csv(path_or_buf="processed_dataset2.csv", index=False)