In [19]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

Matplotlib is building the font cache; this may take a moment.


In [21]:
df = pd.read_csv("Dataset_v2_cleaned_ranges.csv", encoding="utf-8")

In [16]:
print(df.columns.tolist())

['Fish ID', 'Species', 'Common Name', 'Lake Name', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Statues', 'FeedingType', 'Tmax', 'Wmax', 'Lmax', 'TempPref_min', 'TempPref_max', 'TempRange', 'MeanFecundity', 'MinFecundity', 'MaxFecundity', 'Trophic Level Estimate', 'Trophic Level', 'WB_pH_min', 'WB_pH_max', 'WB_Salinity_min', 'WB_Salinity_max', 'WB_Dissolved Oxygen (mg/L)_min', 'WB_Dissolved Oxygen (mg/L)_max', 'WB_Biochemical Oxygen Demand (mg/L)_min', 'WB_Biochemical Oxygen Demand (mg/L)_max', 'WB_Turbidity (NTU)_min', 'WB_Turbidity (NTU)_max', 'WB_Temperature_min', 'WB_Temperature_max']


In [None]:
class ITOFFGroupedImputer:
    def __init__(self, data):
        self.data = data.copy()
        self.imputed_data = data.copy()
        self.imputation_groups = self._define_groups()
        
    def _define_groups(self):
        """Define biologically meaningful groups for imputation based on available columns"""
        return {
            'size_traits': {
                'columns': ['Tmax', 'Wmax', 'Lmax'],
                'predictors': ['Trophic Level', 'Trophic Level Estimate', 'Family', 'Order', 'FeedingType']
            },
            'temperature_traits': {
                'columns': ['TempPref_min', 'TempPref_max', 'TempRange', 'WB_Temperature_min', 'WB_Temperature_max'],
                'predictors': ['Family', 'Order', 'Lake Name']
            },
            'fecundity_traits': {
                'columns': ['MeanFecundity', 'MinFecundity', 'MaxFecundity'],
                'predictors': ['Tmax', 'Wmax', 'Lmax', 'Family', 'Order', 'FeedingType']
            },
            'water_chemistry_ph_oxygen': {
                'columns': ['WB_pH_min', 'WB_pH_max', 'WB_Dissolved Oxygen (mg/L)_min', 'WB_Dissolved Oxygen (mg/L)_max'],
                'predictors': ['Lake Name', 'Family', 'Trophic Level', 'Trophic Level Estimate']
            },
            'water_chemistry_other': {
                'columns': ['WB_Salinity_min', 'WB_Salinity_max', 'WB_Biochemical Oxygen Demand (mg/L)_min', 
                           'WB_Biochemical Oxygen Demand (mg/L)_max', 'WB_Turbidity (NTU)_min', 'WB_Turbidity (NTU)_max'],
                'predictors': ['Lake Name', 'Family', 'Trophic Level', 'Trophic Level Estimate']
            }
        }
    
    def analyze_missing_patterns(self):
        """Analyze missing data patterns for each group"""
        print("Missing Data Analysis by Group:")
        print("=" * 50)
        
        for group_name, group_info in self.imputation_groups.items():
            print(f"\n{group_name.upper()}:")
            columns = group_info['columns']
            
            # Check if columns exist in data
            existing_cols = [col for col in columns if col in self.data.columns]
            missing_cols = [col for col in columns if col not in self.data.columns]
            
            if missing_cols:
                print(f"  Missing columns: {missing_cols}")
            
            if existing_cols:
                missing_stats = self.data[existing_cols].isnull().sum()
                total_rows = len(self.data)
                
                for col in existing_cols:
                    missing_count = missing_stats[col]
                    missing_pct = (missing_count / total_rows) * 100
                    print(f"  {col}: {missing_count}/{total_rows} ({missing_pct:.1f}% missing)")
    
    def check_correlations(self, group_name, min_correlation=0.3):
        """Check correlations within a group"""
        group_info = self.imputation_groups[group_name]
        columns = [col for col in group_info['columns'] if col in self.data.columns]
        
        if len(columns) < 2:
            print(f"Not enough columns in {group_name} for correlation analysis")
            return None
            
        print(f"\nCorrelations within {group_name}:")
        print("-" * 40)
        
        # Calculate correlation matrix
        corr_data = self.data[columns].select_dtypes(include=[np.number])
        if corr_data.empty:
            print("No numeric data for correlation analysis")
            return None
            
        corr_matrix = corr_data.corr()
        
        # Display significant correlations
        for i in range(len(columns)):
            for j in range(i+1, len(columns)):
                if columns[i] in corr_matrix.columns and columns[j] in corr_matrix.columns:
                    corr_val = corr_matrix.loc[columns[i], columns[j]]
                    if not np.isnan(corr_val) and abs(corr_val) >= min_correlation:
                        print(f"  {columns[i]} <-> {columns[j]}: {corr_val:.3f}")
        
        return corr_matrix
    
    def prepare_group_data(self, group_name):
        """Prepare data for a specific group imputation"""
        group_info = self.imputation_groups[group_name]
        target_cols = [col for col in group_info['columns'] if col in self.data.columns]
        predictor_cols = [col for col in group_info['predictors'] if col in self.data.columns]
        
        # Combine target and predictor columns
        all_cols = target_cols + predictor_cols
        
        # Get subset of data
        group_data = self.data[all_cols].copy()
        
        # Handle categorical variables by encoding them
        categorical_cols = group_data.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            if col in predictor_cols:  # Only encode predictors, not targets
                # Simple label encoding for now
                group_data[col] = pd.Categorical(group_data[col]).codes
                group_data[col] = group_data[col].replace(-1, np.nan)  # Handle unknown categories
        
        return group_data, target_cols, predictor_cols
    
    def impute_group(self, group_name, method='iterative', n_neighbors=5):
        """Impute missing values for a specific group"""
        print(f"\nImputing {group_name}...")
        
        group_data, target_cols, predictor_cols = self.prepare_group_data(group_name)
        
        if group_data.empty or not target_cols:
            print(f"No data to impute for {group_name}")
            return
        
        # Check if there's enough data for imputation
        numeric_data = group_data.select_dtypes(include=[np.number])
        if numeric_data.shape[1] < 2:
            print(f"Not enough numeric columns for {group_name} imputation")
            return
        
        # Choose imputation method
        if method == 'knn':
            imputer = KNNImputer(n_neighbors=n_neighbors)
        elif method == 'iterative':
            imputer = IterativeImputer(
                estimator=RandomForestRegressor(n_estimators=10, random_state=42),
                random_state=42,
                max_iter=10
            )
        else:
            raise ValueError("Method must be 'knn' or 'iterative'")
        
        # Fit and transform
        try:
            imputed_values = imputer.fit_transform(numeric_data)
            imputed_df = pd.DataFrame(imputed_values, columns=numeric_data.columns, index=numeric_data.index)
            
            # Update only the target columns in the main dataset
            for col in target_cols:
                if col in imputed_df.columns:
                    self.imputed_data.loc[imputed_df.index, col] = imputed_df[col]
                    
            print(f"  Successfully imputed {len(target_cols)} columns")
            
        except Exception as e:
            print(f"  Error imputing {group_name}: {str(e)}")
    
    def impute_all_groups(self, method='iterative'):
        """Impute all groups sequentially"""
        print("Starting grouped imputation...")
        print("=" * 50)
        
        # Order groups by biological dependency
        # Size traits first (used as predictors for fecundity)
        # Temperature traits (independent)
        # Water chemistry (potentially dependent on habitat)
        # Fecundity last (depends on size traits)
        
        order = ['size_traits', 'temperature_traits', 'water_chemistry_ph_oxygen', 
                'water_chemistry_other', 'fecundity_traits']
        
        for group_name in order:
            if group_name in self.imputation_groups:
                self.impute_group(group_name, method=method)
    
    def validate_imputation(self):
        """Validate imputation results"""
        print("\nImputation Validation:")
        print("=" * 30)
        
        for group_name, group_info in self.imputation_groups.items():
            columns = [col for col in group_info['columns'] if col in self.data.columns]
            
            if not columns:
                continue
                
            print(f"\n{group_name}:")
            for col in columns:
                original_missing = self.data[col].isnull().sum()
                after_missing = self.imputed_data[col].isnull().sum()
                imputed_count = original_missing - after_missing
                
                if original_missing > 0:
                    print(f"  {col}: {imputed_count}/{original_missing} values imputed "
                          f"({(imputed_count/original_missing)*100:.1f}%)")
    
    def get_imputed_data(self):
        """Return the imputed dataset"""
        return self.imputed_data

# Usage example:
def run_grouped_imputation(df):
    """
    Main function to run grouped imputation
    
    Parameters:
    df: pandas DataFrame with ITOFF data
    
    Returns:
    DataFrame with imputed values
    """
    
    # Initialize imputer
    imputer = ITOFFGroupedImputer(df)
    
    # Analyze missing patterns
    imputer.analyze_missing_patterns()
    
    # Check correlations for each group
    print("\n" + "="*60)
    print("CORRELATION ANALYSIS")
    print("="*60)
    
    for group_name in imputer.imputation_groups.keys():
        imputer.check_correlations(group_name)
    
    # Perform imputation
    print("\n" + "="*60)
    print("IMPUTATION PROCESS")
    print("="*60)
    
    imputer.impute_all_groups(method='iterative')
    
    # Validate results
    imputer.validate_imputation()
    
    return imputer.get_imputed_data()

# Example of how to use:
# imputed_data = run_grouped_imputation(your_dataframe)

In [24]:
imputed_data = run_grouped_imputation(df)
imputed_data.to_csv('Imputed_dataset.csv', index=False)

Missing Data Analysis by Group:

SIZE_TRAITS:
  Tmax: 1333/2342 (56.9% missing)
  Wmax: 960/2342 (41.0% missing)
  Lmax: 607/2342 (25.9% missing)

TEMPERATURE_TRAITS:
  TempPref_min: 463/2342 (19.8% missing)
  TempPref_max: 450/2342 (19.2% missing)
  TempRange: 480/2342 (20.5% missing)
  WB_Temperature_min: 112/2342 (4.8% missing)
  WB_Temperature_max: 112/2342 (4.8% missing)

FECUNDITY_TRAITS:
  MeanFecundity: 1176/2342 (50.2% missing)
  MinFecundity: 1127/2342 (48.1% missing)
  MaxFecundity: 1011/2342 (43.2% missing)

WATER_CHEMISTRY_PH_OXYGEN:
  WB_pH_min: 109/2342 (4.7% missing)
  WB_pH_max: 109/2342 (4.7% missing)
  WB_Dissolved Oxygen (mg/L)_min: 177/2342 (7.6% missing)
  WB_Dissolved Oxygen (mg/L)_max: 177/2342 (7.6% missing)

WATER_CHEMISTRY_OTHER:
  WB_Salinity_min: 249/2342 (10.6% missing)
  WB_Salinity_max: 249/2342 (10.6% missing)
  WB_Biochemical Oxygen Demand (mg/L)_min: 1092/2342 (46.6% missing)
  WB_Biochemical Oxygen Demand (mg/L)_max: 1092/2342 (46.6% missing)
  WB_Tu



  Successfully imputed 3 columns

Imputing temperature_traits...




  Successfully imputed 5 columns

Imputing water_chemistry_ph_oxygen...




  Successfully imputed 4 columns

Imputing water_chemistry_other...




  Successfully imputed 6 columns

Imputing fecundity_traits...
  Successfully imputed 3 columns

Imputation Validation:

size_traits:
  Tmax: 0/1333 values imputed (0.0%)
  Wmax: 0/960 values imputed (0.0%)
  Lmax: 0/607 values imputed (0.0%)

temperature_traits:
  TempPref_min: 0/463 values imputed (0.0%)
  TempPref_max: 0/450 values imputed (0.0%)
  TempRange: 0/480 values imputed (0.0%)
  WB_Temperature_min: 112/112 values imputed (100.0%)
  WB_Temperature_max: 112/112 values imputed (100.0%)

fecundity_traits:
  MeanFecundity: 0/1176 values imputed (0.0%)
  MinFecundity: 1127/1127 values imputed (100.0%)
  MaxFecundity: 1011/1011 values imputed (100.0%)

water_chemistry_ph_oxygen:
  WB_pH_min: 109/109 values imputed (100.0%)
  WB_pH_max: 109/109 values imputed (100.0%)
  WB_Dissolved Oxygen (mg/L)_min: 177/177 values imputed (100.0%)
  WB_Dissolved Oxygen (mg/L)_max: 177/177 values imputed (100.0%)

water_chemistry_other:
  WB_Salinity_min: 249/249 values imputed (100.0%)
  WB_Sali



In [29]:
df = pd.read_csv("Imputed_dataset.csv", encoding='utf-8')
df.head()

Unnamed: 0,Fish ID,Species,Common Name,Lake Name,Kingdom,Phylum,Class,Order,Family,Genus,...,WB_Salinity_min,WB_Salinity_max,WB_Dissolved Oxygen (mg/L)_min,WB_Dissolved Oxygen (mg/L)_max,WB_Biochemical Oxygen Demand (mg/L)_min,WB_Biochemical Oxygen Demand (mg/L)_max,WB_Turbidity (NTU)_min,WB_Turbidity (NTU)_max,WB_Temperature_min,WB_Temperature_max
0,F0001,Copella arnoldi,Splash tetra,Amazon Basin,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.5,5.5,2.0,2.0,4.0,4.0,27.0,27.0
1,F0001,Copella arnoldi,Splash tetra,Demerara River Wismar,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,6.0,6.0,3.0,3.0,6.0,6.0,27.0,27.0
2,F0001,Copella arnoldi,Splash tetra,Yarakita River,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.0,5.0,2.0,2.0,3.0,3.0,26.5,26.5
3,F0001,Copella arnoldi,Splash tetra,Arauau River,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.2,5.2,2.0,2.0,4.0,4.0,27.0,27.0
4,F0001,Copella arnoldi,Splash tetra,"Kurupung River, Upper Mazaruni District",Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,4.5,4.5,4.0,4.0,20.0,20.0,26.0,26.0


In [30]:
# After imputation
df.to_csv("Imputed_dataset2.csv", index=False)

# Reload
import pandas as pd
df = pd.read_csv("Imputed_dataset2.csv")
df  # now the Data Viewer should appear


Unnamed: 0,Fish ID,Species,Common Name,Lake Name,Kingdom,Phylum,Class,Order,Family,Genus,...,WB_Salinity_min,WB_Salinity_max,WB_Dissolved Oxygen (mg/L)_min,WB_Dissolved Oxygen (mg/L)_max,WB_Biochemical Oxygen Demand (mg/L)_min,WB_Biochemical Oxygen Demand (mg/L)_max,WB_Turbidity (NTU)_min,WB_Turbidity (NTU)_max,WB_Temperature_min,WB_Temperature_max
0,F0001,Copella arnoldi,Splash tetra,Amazon Basin,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.5,5.5,2.0000,2.0000,4.00,4.000,27.0,27.0
1,F0001,Copella arnoldi,Splash tetra,Demerara River Wismar,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,6.0,6.0,3.0000,3.0000,6.00,6.000,27.0,27.0
2,F0001,Copella arnoldi,Splash tetra,Yarakita River,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.0,5.0,2.0000,2.0000,3.00,3.000,26.5,26.5
3,F0001,Copella arnoldi,Splash tetra,Arauau River,Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,5.2,5.2,2.0000,2.0000,4.00,4.000,27.0,27.0
4,F0001,Copella arnoldi,Splash tetra,"Kurupung River, Upper Mazaruni District",Animalia,Chordata,Actinopterygii,Characiformes,Lebiasinidae,Copella,...,0.0,0.0,4.5,4.5,4.0000,4.0000,20.00,20.000,26.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2337,F1893,Coregonus hoyi,Bloater,Lake Michigan,Animalia,Chordata,Actinopterygii,Salmoniformes,Salmonidae,Coregonus,...,0.0,0.0,7.0,9.0,25.0000,60.0900,6.06,8.848,6.0,22.0
2338,F1895,Acipenser brevirostrum,Shortnose sturgeon,Hudson River estuary (NY),Animalia,Chordata,Actinopterygii,Acipenseriformes,Acipenseridae,Acipenser,...,0.0,25.0,8.0,9.2,11.2004,20.5472,27.52,83.850,0.0,28.0
2339,F1896,Psephurus gladius,Chinese paddlefish,Yangtze River (China),Animalia,Chordata,Actinopterygii,Acipenseriformes,Polyodontidae,Psephurus,...,0.5,0.5,6.0,10.0,2.0000,4.0000,30.00,200.000,5.0,25.0
2340,F1897,Oreochromis variabilis,Victoria tilapia,Lake Victoria (Uganda/Kenya/Tanzania),Animalia,Chordata,Actinopterygii,Perciformes,Cichlidae,Oreochromis,...,0.5,0.5,6.0,8.0,2.0000,6.0000,10.00,50.000,22.0,28.0
