In [1]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [36]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import os
import re
import unicodedata
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from IPython.display import display

In [37]:
df = pd.read_csv("../Round1/final_dataset_v1.csv")

# Impute raw dataset IGNORE

In [10]:
print(df.columns)

Index(['fish_id', 'species', 'common_name', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'status', 'feeding_type', 'temp_max',
       'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max',
       'temp_range_min', 'temp_range_max', 'trophic_lvl_estimate_min',
       'trophic_lvl_estimate_max', 'trophic_lvl', 'fecundity_mean',
       'fecundity_min', 'fecundity_max', 'waterbody_name', 'wb_ph_min',
       'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max', 'wb_do_min',
       'wb_do_max', 'wb_bod_min', 'wb_bod_max', 'wb_turbidity_min',
       'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max'],
      dtype='object')


In [13]:
class ITOFFGroupedImputer:
    def __init__(self, data):
        self.data = data.copy()
        self.imputed_data = data.copy()
        self.imputation_groups = self._define_groups()
        
    def _define_groups(self):
        """Define biologically meaningful groups for imputation based on available columns"""
        return {
            'size_traits': {
                'columns': ['temp_max', 'weight_max', 'length_max'],
                'predictors': ['trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl', 'family', 'order', 'feeding_type']
            },
            'temperature_traits': {
                'columns': ['temp_pref_min', 'temp_pref_max', 'temp_range_min', 'temp_range_max', 'wb_temp_min', 'wb_temp_max'],
                'predictors': ['family', 'order', 'waterbody_name']
            },
            'fecundity_traits': {
                'columns': ['fecundity_mean', 'fecundity_min', 'fecundity_max'],
                'predictors': ['temp_max', 'weight_max', 'length_max', 'family', 'order', 'feeding_type']
            },
            'water_chemistry_ph_oxygen': {
                'columns': ['wb_ph_min', 'wb_ph_max', 'wb_do_min', 'wb_do_max'],
                'predictors': ['family', 'waterbody_name', 'trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl']
            },
            'water_chemistry_other': {
                'columns': ['wb_salinity_min', 'wb_salinity_max', 'wb_bod_min', 
                           'wb_bod_max', 'wb_turbidity_min', 'wb_turbidity_max'],
                'predictors': ['waterbody_name', 'family', 'trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl']
            }
        }

    def analyze_missing_patterns(self):
        """Analyze missing data patterns for each group"""
        print("Missing Data Analysis by Group:")
        print("=" * 50)
        
        for group_name, group_info in self.imputation_groups.items():
            print(f"\n{group_name.upper()}:")
            columns = group_info['columns']
            
            # Check if columns exist in data
            existing_cols = [col for col in columns if col in self.data.columns]
            missing_cols = [col for col in columns if col not in self.data.columns]
            
            if missing_cols:
                print(f"  Missing columns: {missing_cols}")
            
            if existing_cols:
                missing_stats = self.data[existing_cols].isnull().sum()
                total_rows = len(self.data)
                
                for col in existing_cols:
                    missing_count = missing_stats[col]
                    missing_pct = (missing_count / total_rows) * 100
                    print(f"  {col}: {missing_count}/{total_rows} ({missing_pct:.1f}% missing)")

    def check_correlations(self, group_name, min_correlation=0.3):
        """Check correlations within a group"""
        group_info = self.imputation_groups[group_name]
        columns = [col for col in group_info['columns'] if col in self.data.columns]
        
        if len(columns) < 2:
            print(f"Not enough columns in {group_name} for correlation analysis")
            return None
            
        print(f"\nCorrelations within {group_name}:")
        print("-" * 40)
        
        # Calculate correlation matrix
        corr_data = self.data[columns].select_dtypes(include=[np.number])
        if corr_data.empty:
            print("No numeric data for correlation analysis")
            return None
            
        corr_matrix = corr_data.corr()
        
        # Display significant correlations
        for i in range(len(columns)):
            for j in range(i+1, len(columns)):
                if columns[i] in corr_matrix.columns and columns[j] in corr_matrix.columns:
                    corr_val = corr_matrix.loc[columns[i], columns[j]]
                    if not np.isnan(corr_val) and abs(corr_val) >= min_correlation:
                        print(f"  {columns[i]} <-> {columns[j]}: {corr_val:.3f}")
        
        return corr_matrix
    
    def prepare_group_data(self, group_name):
        """Prepare data for a specific group imputation"""
        group_info = self.imputation_groups[group_name]
        target_cols = [col for col in group_info['columns'] if col in self.data.columns]
        predictor_cols = [col for col in group_info['predictors'] if col in self.data.columns]
        
        # Combine target and predictor columns
        all_cols = target_cols + predictor_cols
        
        # Get subset of data
        group_data = self.data[all_cols].copy()
        
        # Handle categorical variables by encoding them
        categorical_cols = group_data.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            if col in predictor_cols:  # Only encode predictors, not targets
                # Simple label encoding for now
                group_data[col] = pd.Categorical(group_data[col]).codes
                group_data[col] = group_data[col].replace(-1, np.nan)  # Handle unknown categories
        
        return group_data, target_cols, predictor_cols
    
    def impute_group(self, group_name, method='iterative', n_neighbors=5):
        """Impute missing values for a specific group"""
        print(f"\nImputing {group_name}...")
        
        group_data, target_cols, predictor_cols = self.prepare_group_data(group_name)
        
        if group_data.empty or not target_cols:
            print(f"No data to impute for {group_name}")
            return
        
        # Check if there's enough data for imputation
        numeric_data = group_data.select_dtypes(include=[np.number])
        if numeric_data.shape[1] < 2:
            print(f"Not enough numeric columns for {group_name} imputation")
            return
        
        # Choose imputation method
        if method == 'knn':
            imputer = KNNImputer(n_neighbors=n_neighbors)
        elif method == 'iterative':
            imputer = IterativeImputer(
                estimator=RandomForestRegressor(n_estimators=10, random_state=42),
                random_state=42,
                max_iter=10
            )
        else:
            raise ValueError("Method must be 'knn' or 'iterative'")

        # Fit and transform
        try:
            imputed_values = imputer.fit_transform(numeric_data)
            imputed_df = pd.DataFrame(imputed_values, columns=numeric_data.columns, index=numeric_data.index)
            
            # Update only the target columns in the main dataset
            for col in target_cols:
                if col in imputed_df.columns:
                    self.imputed_data.loc[imputed_df.index, col] = imputed_df[col]
                    
            print(f"  Successfully imputed {len(target_cols)} columns")
            
        except Exception as e:
            print(f"  Error imputing {group_name}: {str(e)}")
    
    def impute_all_groups(self, method='iterative'):
        """Impute all groups sequentially"""
        print("Starting grouped imputation...")
        print("=" * 50)
        
        # Order groups by biological dependency
        order = ['size_traits', 'temperature_traits', 'water_chemistry_ph_oxygen', 
                'water_chemistry_other', 'fecundity_traits']
        
        for group_name in order:
            if group_name in self.imputation_groups:
                self.impute_group(group_name, method=method)

    def validate_imputation(self):
        """Validate imputation results"""
        print("\nImputation Validation:")
        print("=" * 30)
        
        for group_name, group_info in self.imputation_groups.items():
            columns = [col for col in group_info['columns'] if col in self.data.columns]
            
            if not columns:
                continue
                
            print(f"\n{group_name}:")
            for col in columns:
                original_missing = self.data[col].isnull().sum()
                after_missing = self.imputed_data[col].isnull().sum()
                imputed_count = original_missing - after_missing
                
                if original_missing > 0:
                    print(f"  {col}: {imputed_count}/{original_missing} values imputed "
                          f"({(imputed_count/original_missing)*100:.1f}%)")
    
    def get_imputed_data(self):
        """Return the imputed dataset"""
        return self.imputed_data


# Usage example:
def run_grouped_imputation(df):
    imputer = ITOFFGroupedImputer(df)
    
    # Analyze missing patterns
    imputer.analyze_missing_patterns()
    
    # Check correlations
    print("\n" + "="*60)
    print("CORRELATION ANALYSIS")
    print("="*60)
    for group_name in imputer.imputation_groups.keys():
        imputer.check_correlations(group_name)
    
    # Perform imputation
    print("\n" + "="*60)
    print("IMPUTATION PROCESS")
    print("="*60)
    imputer.impute_all_groups(method='iterative')
    
    # Validate results
    imputer.validate_imputation()
    
    return imputer.get_imputed_data()


In [22]:
imputed_data = run_grouped_imputation(df)

Missing Data Analysis by Group:

SIZE_TRAITS:
  temp_max: 4390/5583 (78.6% missing)
  weight_max: 3756/5583 (67.3% missing)
  length_max: 2945/5583 (52.7% missing)

TEMPERATURE_TRAITS:
  temp_pref_min: 2718/5583 (48.7% missing)
  temp_pref_max: 2705/5583 (48.5% missing)
  temp_range_min: 2754/5583 (49.3% missing)
  temp_range_max: 2754/5583 (49.3% missing)
  wb_temp_min: 837/5583 (15.0% missing)
  wb_temp_max: 837/5583 (15.0% missing)

FECUNDITY_TRAITS:
  fecundity_mean: 4249/5583 (76.1% missing)
  fecundity_min: 4170/5583 (74.7% missing)
  fecundity_max: 3914/5583 (70.1% missing)

WATER_CHEMISTRY_PH_OXYGEN:
  wb_ph_min: 747/5583 (13.4% missing)
  wb_ph_max: 747/5583 (13.4% missing)
  wb_do_min: 1161/5583 (20.8% missing)
  wb_do_max: 1161/5583 (20.8% missing)

WATER_CHEMISTRY_OTHER:
  wb_salinity_min: 1144/5583 (20.5% missing)
  wb_salinity_max: 1144/5583 (20.5% missing)
  wb_bod_min: 3384/5583 (60.6% missing)
  wb_bod_max: 3384/5583 (60.6% missing)
  wb_turbidity_min: 2680/5583 (48.0%



  Successfully imputed 3 columns

Imputing temperature_traits...




  Successfully imputed 6 columns

Imputing water_chemistry_ph_oxygen...




  Successfully imputed 4 columns

Imputing water_chemistry_other...




  Successfully imputed 6 columns

Imputing fecundity_traits...
  Successfully imputed 3 columns

Imputation Validation:

size_traits:
  temp_max: 0/4390 values imputed (0.0%)
  weight_max: 0/3756 values imputed (0.0%)
  length_max: 0/2945 values imputed (0.0%)

temperature_traits:
  temp_pref_min: 0/2718 values imputed (0.0%)
  temp_pref_max: 0/2705 values imputed (0.0%)
  temp_range_min: 2754/2754 values imputed (100.0%)
  temp_range_max: 2754/2754 values imputed (100.0%)
  wb_temp_min: 837/837 values imputed (100.0%)
  wb_temp_max: 837/837 values imputed (100.0%)

fecundity_traits:
  fecundity_mean: 0/4249 values imputed (0.0%)
  fecundity_min: 4170/4170 values imputed (100.0%)
  fecundity_max: 3914/3914 values imputed (100.0%)

water_chemistry_ph_oxygen:
  wb_ph_min: 747/747 values imputed (100.0%)
  wb_ph_max: 747/747 values imputed (100.0%)
  wb_do_min: 1161/1161 values imputed (100.0%)
  wb_do_max: 1161/1161 values imputed (100.0%)

water_chemistry_other:
  wb_salinity_min: 1144/



In [23]:
imputed_data.to_csv("imputed_dataset2.csv", index=False)

In [25]:
# Load both datasets
df_original = pd.read_csv("../Round1/final_dataset_v1.csv")   # before imputation
df_imputed = pd.read_csv("imputed_dataset2.csv")               # after imputation

# Count missing values per column
missing_original = df_original.isnull().sum()
missing_imputed = df_imputed.isnull().sum()

# Combine into one DataFrame for comparison
missing_comparison = pd.DataFrame({
    "Missing Before": missing_original,
    "Missing After": missing_imputed,
    "Difference": missing_original - missing_imputed
})

# Optional: filter to only show columns that had missing values originally
missing_comparison = missing_comparison[missing_comparison["Missing Before"] > 0]

# Save or display
print(missing_comparison.sort_values("Missing Before", ascending=False))
missing_comparison.to_csv("missing_values_comparison.csv")

                          Missing Before  Missing After  Difference
temp_max                            4390           4390           0
fecundity_mean                      4249           4249           0
fecundity_min                       4170              0        4170
fecundity_max                       3914              0        3914
weight_max                          3756           3756           0
trophic_lvl_estimate_max            3595           3595           0
trophic_lvl_estimate_min            3595           3595           0
wb_bod_min                          3384              0        3384
wb_bod_max                          3384              0        3384
length_max                          2945           2945           0
temp_range_max                      2754              0        2754
temp_range_min                      2754              0        2754
temp_pref_min                       2718           2718           0
temp_pref_max                       2705        

# Imputation with Strategy

In [38]:
class ITOFFGroupedImputerV2:
    def __init__(self, data):
        self.data = data.copy()
        self.imputed_data = data.copy()
        self.imputation_groups = self._define_groups()

    def _define_groups(self):
        """Define biologically meaningful groups for imputation"""
        return {
            'size_traits': {
                'columns': ['temp_max', 'weight_max', 'length_max'],
                'predictors': ['trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl', 'family', 'order', 'feeding_type']
            },
            'temperature_traits': {
                'columns': ['temp_pref_min', 'temp_pref_max', 'temp_range_min', 'temp_range_max', 'wb_temp_min', 'wb_temp_max'],
                'predictors': ['family', 'order', 'waterbody_name']
            },
            'fecundity_traits': {
                'columns': ['fecundity_mean', 'fecundity_min', 'fecundity_max'],
                'predictors': ['temp_max', 'weight_max', 'length_max', 'family', 'order', 'feeding_type']
            },
            'water_chemistry_ph_oxygen': {
                'columns': ['wb_ph_min', 'wb_ph_max', 'wb_do_min', 'wb_do_max'],
                'predictors': ['family', 'waterbody_name', 'trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl']
            },
            'water_chemistry_other': {
                'columns': ['wb_salinity_min', 'wb_salinity_max', 'wb_bod_min', 
                           'wb_bod_max', 'wb_turbidity_min', 'wb_turbidity_max'],
                'predictors': ['waterbody_name', 'family', 'trophic_lvl_estimate_min', 'trophic_lvl_estimate_max', 'trophic_lvl']
            }
        }

    def prefill_sparse_columns(self, threshold=0.7):
        """Fill extremely sparse numeric columns with median before iterative imputation"""
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            missing_frac = self.data[col].isnull().mean()
            if missing_frac > threshold:
                median_val = self.data[col].median()
                # ✅ FIX: no inplace warning
                self.imputed_data[col] = self.imputed_data[col].fillna(median_val)
                print(f"Prefilled {col} ({missing_frac:.1%} missing) with median: {median_val}")

    def impute_group(self, group_name, method='iterative', n_neighbors=5):
        """Impute missing values for a specific group"""
        group_info = self.imputation_groups[group_name]
        target_cols = [col for col in group_info['columns'] if col in self.data.columns]
        predictor_cols = [col for col in group_info['predictors'] if col in self.data.columns]

In [39]:
class DatasetImputer:
    def __init__(self, df):
        self.original_data = df.copy()
        self.imputed_data = df.copy()
        self.imputer = IterativeImputer(
            estimator=BayesianRidge(),
            max_iter=20,        # more iterations for stability
            random_state=42,    # reproducibility
            tol=1e-3            # convergence tolerance
        )

    def prefill_sparse_numeric(self, threshold=0.7):
        """
        Prefill numeric columns with > threshold missingness using median
        """
        numeric_cols = self.imputed_data.select_dtypes(include=[np.number]).columns

        print("Prefilling sparse numeric columns...")
        for col in numeric_cols:
            missing_pct = self.imputed_data[col].isna().mean()
            if missing_pct >= threshold:
                median_val = self.imputed_data[col].median()
                # ✅ fixed line: no inplace warning
                self.imputed_data[col] = self.imputed_data[col].fillna(median_val)
                print(f"Prefilled {col} ({missing_pct:.1%} missing) with median: {median_val}")

    def iterative_imputation(self):
        """
        Run Iterative Imputer on numeric columns
        """
        numeric_cols = self.imputed_data.select_dtypes(include=[np.number]).columns
        print("\nStarting grouped imputation...")

        # Apply imputer only to numeric data
        num_data = self.imputed_data[numeric_cols]
        imputed_array = self.imputer.fit_transform(num_data)
        self.imputed_data[numeric_cols] = imputed_array

    def compare_missing(self):
        """
        Compare missing values before vs after imputation
        """
        before = self.original_data.isna().sum()
        after = self.imputed_data.isna().sum()
        diff = before - after
        summary = pd.DataFrame({
            "Missing Before": before,
            "Missing After": after,
            "Difference": diff
        }).sort_values("Difference", ascending=False)
        return summary

    def run(self, prefill_threshold=0.7):
        """
        Full pipeline: prefill sparse, then iterative impute
        """
        self.prefill_sparse_numeric(threshold=prefill_threshold)
        self.iterative_imputation()
        return self.imputed_data

In [43]:
def compare_missingness(original_df, imputed_df, top_n=20):
    """Compare missing values before and after imputation (with counts and %)."""
    total = len(original_df)

    before = original_df.isnull().sum()
    after = imputed_df.isnull().sum()
    diff = before - after

    report = pd.DataFrame({
        "Missing Before": before,
        "% Before": (before / total * 100).round(1),
        "Missing After": after,
        "% After": (after / total * 100).round(1),
        "Difference": diff
    }).sort_values(by="Missing Before", ascending=False)

    # Always print a summary
    print("📊 Missing Data Comparison")
    print("Rows in dataset:", total)
    print("Total missing values before:", int(before.sum()))
    print("Total missing values after:", int(after.sum()))
    print("\nTop columns with missing values:\n")

    # Force Jupyter/console to show the table
    print(report.head(top_n).to_string())

    return report

In [45]:
# Run imputation with DatasetImputer
imputer = DatasetImputer(df)
imputed_df = imputer.run(prefill_threshold=0.7)

# Compare before vs after
missing_report = compare_missingness(df, imputed_df, top_n=30)

Prefilling sparse numeric columns...
Prefilled fecundity_min (74.7% missing) with median: 3439.0
Prefilled fecundity_max (70.1% missing) with median: 15000.0

Starting grouped imputation...
📊 Missing Data Comparison
Rows in dataset: 5583
Total missing values before: 64703
Total missing values after: 24015

Top columns with missing values:

                          Missing Before  % Before  Missing After  % After  Difference
temp_max                            4390      78.6           4390     78.6           0
fecundity_mean                      4249      76.1           4249     76.1           0
fecundity_min                       4170      74.7              0      0.0        4170
fecundity_max                       3914      70.1              0      0.0        3914
weight_max                          3756      67.3           3756     67.3           0
trophic_lvl_estimate_max            3595      64.4              0      0.0        3595
trophic_lvl_estimate_min            3595      64.

In [46]:
unchanged = missing_report[missing_report["Difference"] == 0]
improved = missing_report[missing_report["Difference"] > 0]

print("\n✅ Columns improved (missingness reduced):")
display(improved)

print("\n⚠️ Columns unchanged (still missing):")
display(unchanged.head(20))  # show first 20


✅ Columns improved (missingness reduced):


Unnamed: 0,Missing Before,% Before,Missing After,% After,Difference
fecundity_min,4170,74.7,0,0.0,4170
fecundity_max,3914,70.1,0,0.0,3914
trophic_lvl_estimate_max,3595,64.4,0,0.0,3595
trophic_lvl_estimate_min,3595,64.4,0,0.0,3595
wb_bod_min,3384,60.6,0,0.0,3384
wb_bod_max,3384,60.6,0,0.0,3384
temp_range_max,2754,49.3,0,0.0,2754
temp_range_min,2754,49.3,0,0.0,2754
wb_turbidity_max,2680,48.0,0,0.0,2680
wb_turbidity_min,2680,48.0,0,0.0,2680



⚠️ Columns unchanged (still missing):


Unnamed: 0,Missing Before,% Before,Missing After,% After,Difference
temp_max,4390,78.6,4390,78.6,0
fecundity_mean,4249,76.1,4249,76.1,0
weight_max,3756,67.3,3756,67.3,0
length_max,2945,52.7,2945,52.7,0
temp_pref_min,2718,48.7,2718,48.7,0
temp_pref_max,2705,48.5,2705,48.5,0
trophic_lvl,2508,44.9,2508,44.9,0
common_name,509,9.1,509,9.1,0
waterbody_name,104,1.9,104,1.9,0
kingdom,24,0.4,24,0.4,0


In [47]:
# Export imputed dataset
imputed_df.to_csv("imputed_dataset.csv", index=False)

print("✅ Imputed dataset saved as imputed_dataset.csv")

✅ Imputed dataset saved as imputed_dataset.csv


In [51]:
df = pd.read_csv("imputed_dataset.csv")

len(df)
# print(df.columns)

5583

# Drop rows with n missing values

In [50]:
# Define threshold (0.5 means drop rows with more than 50% nulls in target columns)
null_threshold = 0.5

# Load your dataset
df = pd.read_csv("imputed_dataset.csv")

target_columns = [
    "temp_max", "weight_max", "length_max",
    "temp_pref_min", "temp_pref_max", "temp_range_min", "temp_range_max",
    "fecundity_mean", "fecundity_min", "fecundity_max",
    "wb_ph_min", "wb_ph_max",
    "wb_salinity_min", "wb_salinity_max",
    "wb_do_min", "wb_do_max",
    "wb_bod_min", "wb_bod_max",
    "wb_turbidity_min", "wb_turbidity_max",
    "wb_temp_min", "wb_temp_max"
]

# Drop rows where more than threshold % of target_columns are missing
row_null_fraction = df[target_columns].isnull().sum(axis=1) / len(target_columns)
rows_to_drop = row_null_fraction[row_null_fraction > 0.5].index
df_cleaned = df.drop(rows_to_drop)

# Drop Unnamed columns and Sources
# df_cleaned = df_cleaned.drop(columns=[col for col in df_cleaned.columns if "Unnamed" in col or col == "Sources "], errors="ignore")

print(f"\nDropped {len(rows_to_drop)} rows with >{null_threshold:.0%} missing values in target columns.")

# Save with explicit filename in same folder
output_file = "dropped_dataset.csv"
df_cleaned.to_csv(output_file, index=False, encoding="utf-8")

print("✅ Cleaned dataset saved as:", output_file)
print("Rows after cleaning:", len(df_cleaned))


Dropped 0 rows with >50% missing values in target columns.
✅ Cleaned dataset saved as: dropped_dataset.csv
Rows after cleaning: 5583
