# Final Pre-Processing Step

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

In [None]:
df = pd.read_csv("pre_encoded_final_final_final_final.csv")
print(df.feeding_type.value_counts())

df.head()

feeding_type
predator               1062
other                   492
variable                274
omnivore                175
grazer                  142
planktivore              70
carnivore                41
browser                  27
insectivore              23
benthic_insectivore      17
herbivore                10
scavenger                 7
detritivore               2
Name: count, dtype: int64


In [23]:
print(df.status.value_counts())

status
Invasive       889
Established    666
Reported       336
EN             232
Failed         201
Extirpated      18
Name: count, dtype: int64


In [16]:
# Columns to drop (not used for modeling)
drop_cols = ['fish_id', 'common_name']

# High-cardinality categorical
high_cardinality = ['species', 'waterbody_name']

# Low-cardinality categorical
low_cardinality = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'status', 'feeding_type']

# Numeric features
numeric_cols = [
    'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max',
    'fecundity_mean', 'fecundity_min', 'fecundity_max',
    'trophic_level_estimate', 'trophic_level',
    'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max',
    'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max',
    'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max'
]

# Import necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# Define transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

low_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

high_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder())
])

# Column transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('low_cat', low_card_transformer, low_cardinality),
    ('high_cat', high_card_transformer, high_cardinality)
], remainder='drop')

In [24]:
# Define numeric invasion risk mapping
risk_mapping = {
    'Failed': 0.1,        # Attempted introduction but failed to establish
    'Reported': 0.2,      # Reported introduction, survival unknown
    'EN': 0.3,            # Endangered - unlikely to become invasive
    'Established': 0.7,   # Successfully established, potential for impact
    'Invasive': 0.9,      # Confirmed negative ecological/economic impact
    'Extirpated': 0.0     # No longer present, zero risk
}

# Ensure your status column matches the keys
df['status'] = df['status'].str.capitalize().str.strip()

# Create numeric target
df['invasion_risk_score'] = df['status'].map(risk_mapping)

# Quick check
df[['status', 'invasion_risk_score']].value_counts()


status       invasion_risk_score
Invasive     0.9                    889
Established  0.7                    666
Reported     0.2                    336
Failed       0.1                    201
Extirpated   0.0                     18
Name: count, dtype: int64

In [25]:
y = df['invasion_risk_score']

In [None]:
df.head()

In [27]:
df.to_csv("processed_dataset.csv", index=False)