# M-TRI Exploratory Data Analysis

This notebook performs initial data exploration for the Microbial Toxin-Risk Index (M-TRI) project. We analyze pond characteristics, temporal patterns, spatial coverage, and data quality to understand what we're working with before building models.

**Critical checks:**
- Data coverage across time and space
- Label balance (toxin vs non-toxin events)  
- Missing data patterns
- Spatial distribution of monitoring sites
- Quality issues that need preprocessing

If major data problems appear here, we need to fix ingestion before proceeding to modeling.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium import plugins
import warnings
import os
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot styling: matplotlib's default style with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Suppress all warnings so library notices do not clutter the cell outputs.
warnings.simplefilter('ignore')

# Show every column and do not impose a fixed display width on printed frames.
pd.options.display.max_columns = None
pd.options.display.width = None

print("Libraries loaded successfully!")

## Load Sample Dataset

Loading our sample dataset to check data structure and quality. This sample contains 25 pond observations across multiple dates in New Jersey.

In [None]:
# Load the sample dataset into the notebook-global `df` used by every later cell.
data_path = "../data/sample/merged_features.csv"

if not os.path.exists(data_path):
    print(f"ERROR: Sample data not found at {data_path}")
    print("Make sure you're running this from the notebooks/ directory")
    # Every following cell needs `df`; stop here with a clear error rather than
    # letting the next cell fail with an unrelated-looking NameError.
    raise FileNotFoundError(data_path)

df = pd.read_csv(data_path)

# Parse dates BEFORE summarising them: min()/max() on the raw strings would be
# lexicographic, which is only chronological for zero-padded ISO dates.
df['date'] = pd.to_datetime(df['date'])

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Unique ponds: {df['pond_id'].nunique()}")

# Display first few rows
print("\nFirst 5 rows:")
display(df.head())

## Dataset Overview and Basic Statistics

Let's understand the basic structure, data types, and statistical properties of our features.

In [None]:
# Dataset overview: size, memory footprint, dtypes and numeric summary statistics.
n_rows, n_cols = df.shape
memory_kb = df.memory_usage(deep=True).sum() / 1024

print("=== DATASET OVERVIEW ===")
print(f"Total observations: {n_rows}")
print(f"Features: {n_cols}")
print(f"Memory usage: {memory_kb:.1f} KB")

print("\n=== DATA TYPES ===")
print(df.dtypes)

print("\n=== BASIC STATISTICS ===")
# Split the columns by dtype family: numeric vs. object (string-like).
numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include='object').columns

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns only.
print("\nNumeric features summary:")
numeric_summary = df[numeric_cols].describe()
display(numeric_summary)

## Coverage Tables Analysis

Analyzing data availability across features, time periods, and geographic regions. This helps identify gaps that could bias our model.

In [None]:
# Coverage analysis: completeness per feature, per month, and across space.
print("=== DATA COVERAGE ANALYSIS ===")
total_rows = len(df)

# 1. Feature completeness -- share of non-null values per column, least complete first.
print("\n1. Feature Completeness (% non-null values):")
non_null_counts = df.notna().sum()
feature_coverage = (non_null_counts / total_rows * 100).round(1)
coverage_df = pd.DataFrame({
    'Feature': feature_coverage.index,
    'Coverage_%': feature_coverage.values,
    'Missing_Count': df.isna().sum().values,
}).sort_values('Coverage_%')

print(coverage_df.to_string(index=False))

# 2. Temporal coverage -- adds helper `month`/`year` columns to df, then counts rows per (year, month).
print("\n2. Temporal Coverage:")
observation_dates = df['date'].dt
df['month'] = observation_dates.month
df['year'] = observation_dates.year

temporal_coverage = (
    df.groupby(['year', 'month'])
      .size()
      .reset_index(name='observations')
)
print("Observations per month:")
print(temporal_coverage.to_string(index=False))

# 3. Spatial coverage -- bounding box of the coordinates, in degrees.
print("\n3. Spatial Coverage:")
lat_min, lat_max = df['lat'].min(), df['lat'].max()
lon_min, lon_max = df['lon'].min(), df['lon'].max()
lat_range = lat_max - lat_min
lon_range = lon_max - lon_min
print(f"Latitude range: {lat_min:.4f} to {lat_max:.4f} (span: {lat_range:.4f}°)")
print(f"Longitude range: {lon_min:.4f} to {lon_max:.4f} (span: {lon_range:.4f}°)")
n_locations = len(df.drop_duplicates(subset=['lat', 'lon']))
print(f"Unique pond locations: {n_locations}")

# 4. Key feature availability -- columns absent from df are skipped silently.
key_features = ['chlorophyll_proxy_14d', 'phosphate_mean_7d', 'nitrate_mean_7d',
                'eDNA_mcy_detected', 'toxin_detected']
print("\n4. Key Feature Availability:")
for feature in (name for name in key_features if name in df.columns):
    pct_complete = (df[feature].notna().sum() / total_rows) * 100
    print(f"{feature}: {pct_complete:.1f}% complete")

## Temporal Data Analysis and Plots

Examining how our data changes over time and identifying seasonal patterns that might affect toxin risk.

In [None]:
# Temporal analysis: a 2x2 grid of time-based views, followed by a monthly summary table.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_obs, ax_chl), (ax_tox, ax_nut) = axes

# 1. Observations over time (number of rows recorded on each date)
obs_by_date = df.groupby('date').size()
ax_obs.plot(obs_by_date.index, obs_by_date.values, 'o-')
ax_obs.set(title='Observations per Date', xlabel='Date', ylabel='Number of Observations')
ax_obs.tick_params(axis='x', rotation=45)

# 2. Chlorophyll levels over time
ax_chl.plot(df['date'], df['chlorophyll_proxy_14d'], 'o', alpha=0.6)
ax_chl.set(title='Chlorophyll Proxy Over Time', xlabel='Date', ylabel='Chlorophyll Proxy (14d avg)')
ax_chl.tick_params(axis='x', rotation=45)

# 3. Toxin detection by month: positives / non-null labels, as a percentage
month_of_obs = df['date'].dt.month
toxin_by_month = df.groupby(month_of_obs)['toxin_detected'].agg(['sum', 'count'])
toxin_rate = toxin_by_month['sum'] / toxin_by_month['count'] * 100

ax_tox.bar(toxin_rate.index, toxin_rate.values, alpha=0.7)
ax_tox.set(title='Toxin Detection Rate by Month', xlabel='Month', ylabel='Detection Rate (%)')
ax_tox.set_xticks(range(1, 13))

# 4. Key nutrients over time (both series share one axis)
for column, series_label in (('phosphate_mean_7d', 'Phosphate'), ('nitrate_mean_7d', 'Nitrate')):
    ax_nut.scatter(df['date'], df[column], alpha=0.6, label=series_label, s=30)
ax_nut.set(title='Nutrient Levels Over Time', xlabel='Date', ylabel='Concentration')
ax_nut.legend()
ax_nut.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Summary statistics by calendar month
print("\n=== TEMPORAL PATTERNS ===")
print("Toxin detection by month:")
monthly_aggregations = {
    'toxin_detected': ['count', 'sum', 'mean'],
    'chlorophyll_proxy_14d': 'mean',
    'phosphate_mean_7d': 'mean',
}
monthly_stats = df.groupby(df['date'].dt.month).agg(monthly_aggregations).round(3)
print(monthly_stats)

## Label Balance Analysis

Critical analysis of our target variable (toxin_detected). Imbalanced datasets need special handling during model training.

In [None]:
# Label balance analysis for the target column `toxin_detected`.
print("=== LABEL BALANCE ANALYSIS ===")

# Class codes in a fixed display order, with the name and colour each chart uses.
# NOTE(review): assumes the label is coded 0/1 (as the .get(0)/.get(1) lookups
# in the earlier version did) -- confirm against the ingestion step.
CLASS_ORDER = [0, 1]
CLASS_NAMES = {0: 'No Toxin', 1: 'Toxin Detected'}
CLASS_COLORS = {0: 'lightcoral', 1: 'skyblue'}

# value_counts() orders by frequency, so pin the order explicitly; otherwise the
# hard-coded chart labels/colours swap whenever toxin events are the majority,
# and a single-class dataset yields one value for two pie labels.
label_counts = df['toxin_detected'].value_counts().reindex(CLASS_ORDER, fill_value=0)
n_labelled = int(label_counts.sum())
if n_labelled:
    label_pct = label_counts / n_labelled * 100
else:
    label_pct = label_counts.astype(float)  # no labels at all: report 0% for both classes

print(f"Total observations: {len(df)}")
print(f"Toxin detected (1): {label_counts.loc[1]} ({label_pct.loc[1]:.1f}%)")
print(f"No toxin (0): {label_counts.loc[0]} ({label_pct.loc[0]:.1f}%)")

# Check for severe imbalance (an absent class counts as 0%)
minority_class_pct = label_pct.min()
if minority_class_pct < 10:
    print(f"⚠️  WARNING: Severe class imbalance detected ({minority_class_pct:.1f}% minority class)")
    print("   Consider: SMOTE, class weights, or different evaluation metrics")

# Visualizations
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ordered_colors = [CLASS_COLORS[c] for c in CLASS_ORDER]

# 1. Bar chart
label_counts.plot(kind='bar', ax=axes[0], color=ordered_colors)
axes[0].set_title('Label Distribution (Count)')
axes[0].set_xlabel('Toxin Detected')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# 2. Pie chart (skipped when there is nothing to draw: an all-zero pie is undefined)
if n_labelled:
    axes[1].pie(label_counts.values,
                labels=[f'{CLASS_NAMES[c]}\n({label_counts.loc[c]})' for c in CLASS_ORDER],
                autopct='%1.1f%%', colors=ordered_colors)
else:
    axes[1].text(0.5, 0.5, 'No labelled observations', ha='center', va='center')
    axes[1].axis('off')
axes[1].set_title('Label Distribution (%)')

# 3. Label distribution by key feature.
# normalize='columns' makes each toxin class sum to 100% across the eDNA values,
# i.e. each bar is "share of this toxin class observed with this eDNA result".
if 'eDNA_mcy_detected' in df.columns:
    cross_tab = pd.crosstab(df['eDNA_mcy_detected'], df['toxin_detected'], normalize='columns') * 100
    # Derive colours and legend from the classes actually present so they cannot be mislabelled.
    cross_tab.plot(kind='bar', ax=axes[2],
                   color=[CLASS_COLORS.get(c, 'grey') for c in cross_tab.columns])
    axes[2].set_title('Toxin Detection by eDNA Evidence')
    axes[2].set_xlabel('eDNA mcy Gene Detected')
    axes[2].set_ylabel('Percentage')
    axes[2].legend([CLASS_NAMES.get(c, str(c)) for c in cross_tab.columns])
    axes[2].tick_params(axis='x', rotation=0)
else:
    axes[2].set_visible(False)  # avoid leaving an empty frame in the figure

plt.tight_layout()
plt.show()

# Additional label analysis: Pearson correlation of each feature with the label
print("\n=== LABEL CORRELATION WITH KEY FEATURES ===")
corr_features = ['chlorophyll_proxy_14d', 'phosphate_mean_7d', 'nitrate_mean_7d',
                 'eDNA_mcy_detected', 'ndvi_mean_14d']

correlations = [
    (feature, df[feature].corr(df['toxin_detected']))
    for feature in corr_features
    if feature in df.columns
]

# Strongest first; an undefined (NaN) correlation, e.g. from a constant column,
# is ranked last instead of breaking the comparison-based sort.
correlations.sort(key=lambda item: -1.0 if pd.isna(item[1]) else abs(item[1]), reverse=True)
print("Features most correlated with toxin detection:")
for feature, corr in correlations:
    print(f"{feature}: {corr:.3f}")

## Missingness Heatmap Visualization

Visualizing patterns in missing data to understand systematic gaps in our monitoring coverage.

In [None]:
# Missing data visualization: per-feature rates, heatmaps, and a per-date breakdown.
print("=== MISSING DATA PATTERNS ===")

# Percentage of null values per column; only columns with at least one gap are listed.
missing_pct = (df.isnull().sum() / len(df)) * 100
print("Missing data percentages by feature:")
for col, pct in missing_pct[missing_pct > 0].items():
    print(f"{col}: {pct:.1f}%")

# Two stacked panels: the raw missingness mask, then its correlation structure.
fig, axes = plt.subplots(2, 1, figsize=(12, 10))
ax_mask, ax_corr = axes

# 1. Overall missingness heatmap (features as rows, observations as columns)
missing_matrix = df.isna()
sns.heatmap(
    missing_matrix.T,
    cbar=True,
    ax=ax_mask,
    cmap='RdYlBu_r',
    xticklabels=False,
    yticklabels=True,
)
ax_mask.set_title('Missing Data Heatmap (Red = Missing)')
ax_mask.set_xlabel('Observation Index')

# 2. Missing data correlation matrix: which features tend to be missing together.
missing_corr = missing_matrix.corr()
mask = np.triu(np.ones(missing_corr.shape, dtype=bool))  # hide the upper triangle
sns.heatmap(
    missing_corr,
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    ax=ax_corr,
    cbar_kws={"shrink": .8},
)
ax_corr.set_title('Missing Data Correlation Matrix')

plt.tight_layout()
plt.show()

# Missing data patterns by date: number of null fields per row, summarised per date.
print("\n=== MISSING DATA BY TIME PERIOD ===")
df_with_missing = df.assign(missing_count=df.isnull().sum(axis=1))

missing_by_date = (
    df_with_missing.groupby('date')['missing_count']
    .agg(['mean', 'max'])
    .round(2)
)
print("Average and max missing features per observation by date:")
print(missing_by_date)

# Flag features whose missing rate exceeds 20%.
high_missing_features = [col for col, pct in missing_pct.items() if pct > 20]
if not high_missing_features:
    print("\n✅ Good news: No features have >20% missing data")
else:
    print(f"\n⚠️  Features with >20% missing data: {high_missing_features}")
    print("Consider: imputation, feature removal, or alternative data sources")

## Spatial Coverage Mapping

Interactive map showing where our ponds are located and their toxin detection status. This helps identify spatial biases in monitoring.

In [None]:
# Spatial analysis and mapping: per-pond summary, interactive folium map, distance statistics.
from scipy.spatial.distance import pdist

print("=== SPATIAL COVERAGE ANALYSIS ===")

# One row per pond: first recorded coordinates/area plus toxin event and label counts.
pond_locations = df.groupby('pond_id').agg({
    'lat': 'first',
    'lon': 'first',
    'toxin_detected': ['sum', 'count'],
    'pond_area_m2': 'first'
}).round(4)

# Flatten column names
pond_locations.columns = ['lat', 'lon', 'toxin_events', 'total_obs', 'area_m2']
pond_locations['toxin_rate'] = pond_locations['toxin_events'] / pond_locations['total_obs']

print(f"Total unique ponds: {len(pond_locations)}")
print(f"Average observations per pond: {pond_locations['total_obs'].mean():.1f}")
print(f"Ponds with any toxin detection: {(pond_locations['toxin_events'] > 0).sum()}")

# Create interactive map centred on the mean observation coordinates
center_lat = df['lat'].mean()
center_lon = df['lon'].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Add pond markers
for pond_id, row in pond_locations.iterrows():
    rate = row['toxin_rate']
    # Colour by detection rate. A pond with no labelled observations has a NaN
    # rate; show it as grey rather than letting it fall through to "red".
    if pd.isna(rate):
        color = 'gray'
    elif rate == 0:
        color = 'green'
    elif rate < 0.5:
        color = 'orange'
    else:
        color = 'red'

    # Marker radius scales with pond area, clamped to [5, 20]; unknown area -> minimum size.
    area = row['area_m2']
    radius = 5 if pd.isna(area) else max(5, min(20, area / 2000))
    area_text = 'n/a' if pd.isna(area) else f"{area:,} m²"

    # iterrows() upcasts the integer counts to float; cast back for display.
    popup_text = f"""
    Pond ID: {pond_id}<br>
    Observations: {int(row['total_obs'])}<br>
    Toxin Events: {int(row['toxin_events'])}<br>
    Detection Rate: {rate:.1%}<br>
    Area: {area_text}
    """

    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=radius,
        popup=popup_text,
        color=color,
        fill=True,
        fillColor=color,
        fillOpacity=0.6
    ).add_to(m)

# Add legend (wording matches the thresholds used above: 0, <0.5, >=0.5, unknown)
legend_html = '''
<div style="position: fixed; 
     top: 10px; right: 10px; width: 200px; height: 140px; 
     background-color: white; border:2px solid grey; z-index:9999; 
     font-size:14px; padding: 10px">
<b>Toxin Detection Rate</b><br>
<i class="fa fa-circle" style="color:green"></i> No toxin detected<br>
<i class="fa fa-circle" style="color:orange"></i> Below 50% detection<br>
<i class="fa fa-circle" style="color:red"></i> 50% or more detection<br>
<i class="fa fa-circle" style="color:gray"></i> No labelled data<br>
<br><b>Circle size:</b> Pond area
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Display map
display(m)

# Spatial statistics. The bounding box is computed here rather than reusing
# variables from another cell, so this cell does not depend on execution order.
lat_min, lat_max = df['lat'].min(), df['lat'].max()
lon_min, lon_max = df['lon'].min(), df['lon'].max()

# Degrees -> kilometres (equirectangular approximation, adequate for a regional
# extent): a degree of latitude is ~111 km everywhere, while a degree of
# longitude shrinks with cos(latitude).
KM_PER_DEG_LAT = 111.0
km_per_deg_lon = KM_PER_DEG_LAT * np.cos(np.radians(center_lat))

print("\n=== SPATIAL STATISTICS ===")
print(f"Latitude range: {lat_min:.4f}° to {lat_max:.4f}°")
print(f"Longitude range: {lon_min:.4f}° to {lon_max:.4f}°")
print(f"Geographic span: ~{(lat_max - lat_min) * KM_PER_DEG_LAT:.1f} km N-S, "
      f"~{(lon_max - lon_min) * km_per_deg_lon:.1f} km E-W")

# Check for spatial clustering. Coordinates are converted to kilometres BEFORE
# taking pairwise distances; scaling raw degree distances by 111 would treat a
# degree of longitude like a degree of latitude and overstate E-W separation.
pond_coords_km = (
    pond_locations[['lat', 'lon']].dropna().to_numpy()
    * np.array([KM_PER_DEG_LAT, km_per_deg_lon])
)
distances = pdist(pond_coords_km)
if distances.size:
    print(f"Average distance between ponds: {np.mean(distances):.1f} km")
    print(f"Minimum distance between ponds: {np.min(distances):.1f} km")
else:
    # pdist returns an empty array for fewer than two points; np.min would raise on it.
    print("Need at least two located ponds to compute inter-pond distances")

## Before/After Thumbnail Comparisons

Feature-level comparisons of pond conditions in clean versus toxin-positive states, shown as bar charts of key features rather than image thumbnails. These help validate that our features capture meaningful changes.

In [None]:
# Before/After analysis: compare feature values in clean vs. toxin-positive observations.
print("=== BEFORE/AFTER CONDITION ANALYSIS ===")

# Find ponds with both toxin and non-toxin observations
pond_transitions = df.groupby('pond_id')['toxin_detected'].agg(['min', 'max', 'mean'])
transitional_ponds = pond_transitions[(pond_transitions['min'] == 0) & (pond_transitions['max'] == 1)]

print(f"Ponds with both clean and toxin periods: {len(transitional_ponds)}")

if len(transitional_ponds) >= 3:
    # Select 3 example ponds for before/after comparison
    example_ponds = transitional_ponds.head(3).index.tolist()

    fig, axes = plt.subplots(3, 4, figsize=(16, 12))
    fig.suptitle('Before/After Condition Comparisons (3 Example Ponds)', fontsize=16)

    features_to_compare = ['chlorophyll_proxy_14d', 'phosphate_mean_7d',
                           'nitrate_mean_7d', 'ndvi_mean_14d']

    for i, pond_id in enumerate(example_ponds):
        pond_data = df[df['pond_id'] == pond_id].sort_values('date')
        clean_obs = pond_data[pond_data['toxin_detected'] == 0]
        toxin_obs = pond_data[pond_data['toxin_detected'] == 1]
        if clean_obs.empty or toxin_obs.empty:
            continue

        # "After" is the first toxin detection; "before" is the most recent clean
        # observation that actually precedes it. If the pond's record starts with
        # a detection there is no true "before", so fall back to its earliest
        # clean observation (the previous behaviour) to keep the panel populated.
        after = toxin_obs.iloc[0]
        earlier_clean = clean_obs[clean_obs['date'] < after['date']]
        before = earlier_clean.iloc[-1] if not earlier_clean.empty else clean_obs.iloc[0]

        for j, feature in enumerate(features_to_compare):
            if feature not in df.columns:
                continue

            ax = axes[i, j]
            values = [before[feature], after[feature]]

            ax.bar(['Before\n(No Toxin)', 'After\n(Toxin)'],
                   values,
                   color=['lightblue', 'salmon'])
            ax.set_title(f'Pond {pond_id}\n{feature}')
            ax.set_ylabel('Value')

            # Value labels just above each bar; missing values get no label.
            present = [v for v in values if pd.notna(v)]
            label_offset = 0.01 * max(present) if present else 0
            for x_pos, value in enumerate(values):
                if pd.notna(value):
                    ax.text(x_pos, value + label_offset, f'{value:.2f}', ha='center')

    # Reserve the top strip for the suptitle so it does not overlap the subplot titles.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    # Statistical comparison across ALL observations, grouped by toxin status
    print("\n=== BEFORE/AFTER STATISTICAL COMPARISON ===")
    before_conditions = df[df['toxin_detected'] == 0]
    after_conditions = df[df['toxin_detected'] == 1]

    comparison_features = ['chlorophyll_proxy_14d', 'phosphate_mean_7d',
                           'nitrate_mean_7d', 'turbidity_latest', 'ndvi_mean_14d']

    print("Average values comparison:")
    print(f"{'Feature':<25} {'No Toxin':<12} {'Toxin':<12} {'Change':<12}")
    print("-" * 65)

    for feature in comparison_features:
        if feature in df.columns:
            before_mean = before_conditions[feature].mean()
            after_mean = after_conditions[feature].mean()
            # Relative change is undefined for a zero baseline; report 0 in that case.
            change_pct = ((after_mean - before_mean) / before_mean * 100) if before_mean != 0 else 0

            print(f"{feature:<25} {before_mean:<12.3f} {after_mean:<12.3f} {change_pct:+.1f}%")

else:
    print("Not enough transitional ponds for before/after analysis")
    print("Showing feature distributions by toxin status instead...")

    # Show distributions instead
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Local name on purpose: do not overwrite the `key_features` list other cells define.
    distribution_features = ['chlorophyll_proxy_14d', 'phosphate_mean_7d', 'nitrate_mean_7d']

    for i, feature in enumerate(distribution_features):
        if feature in df.columns:
            # Drop NaNs first: hist() cannot derive bin edges from a series containing NaN.
            no_toxin = df.loc[df['toxin_detected'] == 0, feature].dropna()
            toxin = df.loc[df['toxin_detected'] == 1, feature].dropna()

            axes[i].hist(no_toxin, alpha=0.7, label='No Toxin', bins=10, color='lightblue')
            axes[i].hist(toxin, alpha=0.7, label='Toxin', bins=10, color='salmon')
            axes[i].set_title(f'{feature}\nDistribution Comparison')
            axes[i].set_xlabel('Value')
            axes[i].set_ylabel('Frequency')
            axes[i].legend()

    plt.tight_layout()
    plt.show()

## Data Quality Assessment

Final assessment of data quality issues and recommendations for preprocessing before model training.

In [None]:
# Data quality assessment: outliers, consistency checks, redundancy, and preprocessing advice.
print("=== DATA QUALITY ASSESSMENT ===")

# 1. Outlier detection (Tukey fences: outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR)
print("\n1. OUTLIER DETECTION")
numeric_features = ['chlorophyll_proxy_14d', 'phosphate_mean_7d', 'nitrate_mean_7d',
                    'turbidity_latest', 'ndvi_mean_14d', 'pond_area_m2']
# Only analyse the features this dataset actually contains; every step below
# uses this list so a missing column can never raise a KeyError.
available_features = [f for f in numeric_features if f in df.columns]

outliers_summary = {}
for feature in available_features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    outliers_summary[feature] = {
        'count': len(outliers),
        'percentage': len(outliers) / len(df) * 100,
        'range': f"[{df[feature].min():.3f}, {df[feature].max():.3f}]"
    }

for feature, stats in outliers_summary.items():
    print(f"{feature}: {stats['count']} outliers ({stats['percentage']:.1f}%) - Range: {stats['range']}")

# 2. Data consistency checks
print("\n2. DATA CONSISTENCY CHECKS")

# Check for impossible values
issues = []
if 'ndvi_mean_14d' in df.columns:
    invalid_ndvi = df[(df['ndvi_mean_14d'] < -1) | (df['ndvi_mean_14d'] > 1)]
    if len(invalid_ndvi) > 0:
        issues.append(f"Invalid NDVI values (should be -1 to 1): {len(invalid_ndvi)} cases")

if 'pond_area_m2' in df.columns:
    zero_area = df[df['pond_area_m2'] <= 0]
    if len(zero_area) > 0:
        issues.append(f"Zero or negative pond areas: {len(zero_area)} cases")

# Check for duplicate records (same pond on the same date)
duplicates = df.duplicated(subset=['pond_id', 'date'])
if duplicates.sum() > 0:
    issues.append(f"Duplicate pond-date combinations: {duplicates.sum()} cases")

if issues:
    for issue in issues:
        print(f"⚠️  {issue}")
else:
    print("✅ No obvious data consistency issues found")

# 3. Feature relationships
print("\n3. FEATURE RELATIONSHIP ANALYSIS")
correlation_matrix = df[available_features].corr()

# Find highly correlated features (potential redundancy); each unordered pair is visited once.
high_corr_pairs = []
corr_columns = correlation_matrix.columns
for i, feat1 in enumerate(corr_columns):
    for feat2 in corr_columns[i + 1:]:
        corr_val = abs(correlation_matrix.loc[feat1, feat2])
        if corr_val > 0.8:  # High correlation threshold
            high_corr_pairs.append((feat1, feat2, corr_val))

if high_corr_pairs:
    print("Highly correlated feature pairs (>0.8):")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"  {feat1} ↔ {feat2}: {corr:.3f}")
else:
    print("✅ No highly correlated features found")

# 4. Recommendations
print("\n=== PREPROCESSING RECOMMENDATIONS ===")

recommendations = []

# Missing data
missing_pct = df.isnull().sum() / len(df) * 100
high_missing = missing_pct[missing_pct > 10]
if len(high_missing) > 0:
    recommendations.append(f"Handle missing data in: {list(high_missing.index)}")

# Class imbalance
toxin_pct = df['toxin_detected'].mean() * 100
if toxin_pct < 20 or toxin_pct > 80:
    recommendations.append(f"Address class imbalance (toxin rate: {toxin_pct:.1f}%)")

# Outliers
high_outlier_features = [f for f, stats in outliers_summary.items() if stats['percentage'] > 5]
if high_outlier_features:
    recommendations.append(f"Consider outlier handling for: {high_outlier_features}")

# Feature scaling: compare the value spans of the features.
feature_ranges = {f: df[f].max() - df[f].min() for f in available_features}

# Constant (zero-span) and all-NaN features carry no scale information and would
# make the ratio infinite or undefined, so they are left out of the comparison;
# the check also needs at least two features to compare.
usable_ranges = [r for r in feature_ranges.values() if np.isfinite(r) and r > 0]
if len(usable_ranges) >= 2 and max(usable_ranges) / min(usable_ranges) > 100:
    recommendations.append("Apply feature scaling (wide range differences detected)")

# Final recommendations
if recommendations:
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")
else:
    print("✅ Data quality looks good for model training!")

print("\n=== EDA SUMMARY ===")
print(f"✅ Dataset loaded: {len(df)} observations, {len(df.columns)} features")
print(f"✅ Spatial coverage: {df['pond_id'].nunique()} unique ponds")
print(f"✅ Temporal coverage: {df['date'].min()} to {df['date'].max()}")
print(f"✅ Target balance: {df['toxin_detected'].mean()*100:.1f}% positive cases")
print(f"✅ Missing data: {(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100):.1f}% overall")

if len(recommendations) == 0:
    print("\n🎯 Data quality is sufficient - ready to proceed with feature engineering and modeling!")
else:
    print(f"\n⚠️  Address {len(recommendations)} data quality issues before modeling")