<a href="https://colab.research.google.com/github/Simacoder/data-phandas-outbreak-challenge/blob/main/Model7%20updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model7

In [2]:
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np

# Custom wrapper for XGBRegressor to ensure compatibility
class CustomXGBRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, **params):
        self.model = XGBRegressor(**params)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def get_params(self, deep=True):
        return self.model.get_params(deep)

    def set_params(self, **params):
        self.model.set_params(**params)
        return self

class DiseasePredictionPipeline:
    def __init__(self):
        self.train = None
        self.test = None
        self.pipeline = None
        self.feature_columns = None

    def load_data(self):
        # Load datasets
        self.train = pd.read_csv("Train.csv")
        self.test = pd.read_csv("Test.csv")

        print("Columns in training dataset:", self.train.columns)
        print("Columns in testing dataset:", self.test.columns)

        # Clean target variable
        print("\nInitial target variable statistics:")
        print(self.train['Total'].describe())

        # Remove rows where Total is NaN, infinite, or extremely large
        self.train = self.train[
            np.isfinite(self.train['Total'])
        ]

        print("\nTarget variable statistics after cleaning:")
        print(self.train['Total'].describe())
        print(f"\nRemoved {len(self.train) - len(self.train)} rows with invalid Total values")

        # Define feature columns (excluding 'Total' and 'ID')
        self.feature_columns = [col for col in self.train.columns
                              if col not in ['Total', 'ID']]

        # Verify all feature columns exist in test dataset
        missing_cols = [col for col in self.feature_columns
                       if col not in self.test.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in test dataset: {missing_cols}")

        # Handle missing values separately for train and test
        self._handle_missing_values(self.train)
        self._handle_missing_values(self.test)

    def _handle_missing_values(self, df):
        """Handle missing values for a given dataframe"""
        # Numerical columns
        numerical_cols = df[self.feature_columns].select_dtypes(
            include=['float64', 'int64']).columns
        for col in numerical_cols:
            # Replace infinite values with NaN first
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            # Then fill NaN with median
            df[col].fillna(df[col].median(), inplace=True)

        # Categorical columns
        categorical_cols = df[self.feature_columns].select_dtypes(
            include=['object']).columns
        for col in categorical_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)

    def create_pipeline(self):
        # Define preprocessing for numerical features
        numerical_features = [col for col in self.feature_columns
                            if self.train[col].dtype in ['float64', 'int64']]
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_features = [col for col in self.feature_columns
                              if self.train[col].dtype == 'object']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Define the pipeline
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', CustomXGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
                objective='count:poisson',  # Better for count data
                tree_method='hist',         # Faster training
                min_child_weight=1,         # Help with many zero values
                subsample=0.8,              # Prevent overfitting
                colsample_bytree=0.8        # Prevent overfitting
            ))
        ])

    def run_pipeline(self):
        try:
            # Load and preprocess data
            self.load_data()

            # Prepare training data
            X = self.train[self.feature_columns]
            y = self.train['Total']

            # Split the training data
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Create and train the pipeline
            self.create_pipeline()
            self.pipeline.fit(X_train, y_train)

            # Evaluate on validation set
            val_predictions = self.pipeline.predict(X_val)
            mae = np.mean(np.abs(val_predictions - y_val))
            print(f"Validation MAE: {mae}")

            # Cross-validation for robustness
            cv_scores = cross_val_score(
                self.pipeline, X, y,
                cv=5,
                scoring='neg_mean_absolute_error'
            )
            print(f"Cross-validation MAE: {-np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

            # Predict on test set
            test_predictions = self.pipeline.predict(self.test[self.feature_columns])

            # Create submission dataframe
            submission = pd.DataFrame({
                'ID': self.test['ID'],
                'Total': test_predictions
            })
            submission.to_csv('predictions.csv', index=False)
            print("Predictions saved to 'predictions.csv'")

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    pipeline = DiseasePredictionPipeline()
    pipeline.run_pipeline()


Columns in training dataset: Index(['ID', 'Total', 'Location', 'Category_Health_Facility_UUID', 'Disease',
       'Month', 'Year', 'Transformed_Latitude', 'Transformed_Longitude'],
      dtype='object')
Columns in testing dataset: Index(['Location', 'Disease', 'Month', 'Category_Health_Facility_UUID', 'Year',
       'Transformed_Latitude', 'Transformed_Longitude', 'ID'],
      dtype='object')

Initial target variable statistics:
count    23847.000000
mean         8.355600
std         28.076713
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max        489.000000
Name: Total, dtype: float64

Target variable statistics after cleaning:
count    23847.000000
mean         8.355600
std         28.076713
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max        489.000000
Name: Total, dtype: float64

Removed 0 rows with invalid Total values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

Validation MAE: 9.90429589219813




Cross-validation MAE: 11.14 ± 2.23
Predictions saved to 'predictions.csv'




In [None]:
# Second model

In [4]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [8]:
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class EnhancedRegressorBase(BaseEstimator, RegressorMixin):
    """Base class for enhanced regressors with robust error handling and logging"""
    def __init__(self, model_class, **params):
        self.params = params
        self.model = model_class(**self.params)
        self.feature_importance_ = None

    def fit(self, X, y):
        try:
            self.model.fit(X, y)
            if hasattr(self.model, 'feature_importances_'):
                self.feature_importance_ = self.model.feature_importances_
            return self
        except Exception as e:
            print(f"Error during fitting: {str(e)}")
            raise

    def predict(self, X):
        try:
            return self.model.predict(X)
        except Exception as e:
            print(f"Error during prediction: {str(e)}")
            raise

    def get_params(self, deep=True):
        return {**{"model_class": self.model.__class__}, **self.params}

    def set_params(self, **params):
        for param, value in params.items():
            if param == "model_class":
                continue
            self.params[param] = value
        self.model.set_params(**self.params)
        return self

class EnhancedXGBRegressor(EnhancedRegressorBase):
    """Enhanced XGBoost regressor with error handling"""
    def __init__(self, **params):
        super().__init__(XGBRegressor, **params)

class EnhancedLGBMRegressor(EnhancedRegressorBase):
    """Enhanced LightGBM regressor with error handling"""
    def __init__(self, **params):
        super().__init__(LGBMRegressor, **params)

class EnhancedCatBoostRegressor(EnhancedRegressorBase):
    """Enhanced CatBoost regressor with error handling"""
    def __init__(self, **params):
        super().__init__(CatBoostRegressor, **params)

class ImprovedDiseasePredictionPipeline:
    """Main pipeline class for disease prediction"""
    def __init__(self):
        self.train = None
        self.test = None
        self.pipeline = None
        self.feature_columns = None
        self.numeric_features = None
        self.categorical_features = None
        self.feature_importance = None
        self.model_metrics = {}
        self.target_transformed = False

    def load_and_analyze_data(self, train_path="Train.csv", test_path="Test.csv", waste_path="waste_management.csv",
                              toilet_path="toilet.csv", water_path="water_sources.csv"):
        """Load and perform initial statistical analysis of the data"""
        try:
            # Load datasets
            self.train = pd.read_csv(train_path)
            self.test = pd.read_csv(test_path)
            self.test = pd.read_csv(waste_path)
            self.test = pd.read_csv(toilet_path)
            self.test = pd.read_csv(water_path)

            # Statistical analysis
            self._perform_statistical_analysis()

            # Clean target variable
            self._clean_target_variable()

            # Define and verify features
            self._setup_features()

            # Handle missing and anomalous values
            self._handle_data_quality()

            return self.train, self.test

        except Exception as e:
            print(f"Error in load_and_analyze_data: {str(e)}")
            raise

    def _perform_statistical_analysis(self):
        """Perform comprehensive statistical analysis"""
        try:
            stats_report = {
                'basic_stats': self.train.describe(),
                'missing_values': self.train.isnull().sum(),
                'skewness': self.train.select_dtypes(include=[np.number]).skew(),
                'kurtosis': self.train.select_dtypes(include=[np.number]).kurtosis()
            }

            # Detect outliers using IQR method
            numeric_cols = self.train.select_dtypes(include=[np.number]).columns
            outliers_report = {}
            for col in numeric_cols:
                Q1 = self.train[col].quantile(0.25)
                Q3 = self.train[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = ((self.train[col] < (Q1 - 1.5 * IQR)) |
                           (self.train[col] > (Q3 + 1.5 * IQR))).sum()
                outliers_report[col] = outliers

            stats_report['outliers'] = outliers_report

            # Print summary statistics
            print("\nData Analysis Summary:")
            print(f"Total samples: {len(self.train)}")
            print(f"Features: {len(self.train.columns)}")
            print("\nMissing Values Summary:")
            print(stats_report['missing_values'])
            print("\nOutliers Summary:")
            print(stats_report['outliers'])

            return stats_report

        except Exception as e:
            print(f"Error in _perform_statistical_analysis: {str(e)}")
            raise

    def _clean_target_variable(self):
        """Clean and transform target variable"""
        try:
            print("\nTarget Variable Analysis:")
            print(self.train['Total'].describe())

            # Remove invalid values (only remove extreme outliers)
            original_len = len(self.train)
            self.train = self.train[
                (np.isfinite(self.train['Total'])) &
                (self.train['Total'] >= 0) &
                (self.train['Total'] <= self.train['Total'].quantile(0.995))  # Less aggressive outlier removal
            ]

            # Apply log transformation if highly skewed
            if stats.skew(self.train['Total']) > 1:
                self.train['Total'] = np.log1p(self.train['Total'])
                self.target_transformed = True
                print("\nApplied log transformation to target variable due to high skewness")

            print(f"\nRemoved {original_len - len(self.train)} rows with invalid or extreme values")

        except Exception as e:
            print(f"Error in _clean_target_variable: {str(e)}")
            raise

    def _setup_features(self):
        """Setup and verify features"""
        try:
            self.feature_columns = [col for col in self.train.columns
                                  if col not in ['Total', 'ID']]

            # Verify features
            missing_cols = [col for col in self.feature_columns
                           if col not in self.test.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in test dataset: {missing_cols}")

            # Analyze feature types
            self.numeric_features = self.train[self.feature_columns].select_dtypes(
                include=['float64', 'int64']).columns.tolist()
            self.categorical_features = self.train[self.feature_columns].select_dtypes(
                include=['object']).columns.tolist()

        except Exception as e:
            print(f"Error in _setup_features: {str(e)}")
            raise

    def _handle_data_quality(self):
        """Handle missing values and anomalies"""
        try:
            for df in [self.train, self.test]:
                # Handle numeric features
                for col in self.numeric_features:
                    # Replace infinite values
                    df[col] = df[col].replace([np.inf, -np.inf], np.nan)

                    # Handle outliers using winsorization
                    if col in df.columns:
                        q1 = df[col].quantile(0.01)
                        q3 = df[col].quantile(0.99)
                        df[col] = df[col].clip(q1, q3)

                # Handle categorical features
                for col in self.categorical_features:
                    if col in df.columns:
                        # Convert rare categories to 'Other'
                        value_counts = df[col].value_counts()
                        rare_categories = value_counts[value_counts < len(df) * 0.01].index
                        df[col] = df[col].replace(rare_categories, 'Other')

        except Exception as e:
            print(f"Error in _handle_data_quality: {str(e)}")
            raise

    def create_advanced_pipeline(self):
        """Create an advanced pipeline with robust preprocessing and stacking"""
        try:
            # Numeric preprocessing
            numeric_transformer = Pipeline(steps=[
                ('imputer', KNNImputer(n_neighbors=5)),
                ('scaler', RobustScaler()),
                ('power', PowerTransformer(standardize=True))
            ])

            # Categorical preprocessing
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ])

            # Create preprocessor
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, self.numeric_features),
                    ('cat', categorical_transformer, self.categorical_features)
                ]
            )

            # Define base models with correct initialization
            base_models = [
                ('xgb', EnhancedXGBRegressor(
                    n_estimators=200,
                    learning_rate=0.05,
                    max_depth=6,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42
                )),
                ('lgbm', EnhancedLGBMRegressor(
                    n_estimators=200,
                    learning_rate=0.05,
                    num_leaves=31,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42
                )),
                ('catboost', EnhancedCatBoostRegressor(
                    iterations=200,
                    learning_rate=0.05,
                    depth=6,
                    subsample=0.8,
                    random_state=42,
                    verbose=False
                ))
            ]

            # Create stacking model
            final_estimator = HuberRegressor()
            stacking = StackingRegressor(
                estimators=base_models,
                final_estimator=final_estimator,
                cv=5,
                n_jobs=-1
            )

            # Create final pipeline
            self.pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('stacking', stacking)
            ])

        except Exception as e:
            print(f"Error in create_advanced_pipeline: {str(e)}")
            raise

    def evaluate_model(self, X, y, X_val=None, y_val=None):
        """Evaluate model performance with detailed metrics"""
        try:
            if X_val is not None and y_val is not None:
                # Validation set metrics
                val_pred = self.pipeline.predict(X_val)
                self.model_metrics['validation'] = {
                    'mae': mean_absolute_error(y_val, val_pred),
                    'rmse': np.sqrt(mean_squared_error(y_val, val_pred)),
                    'r2': r2_score(y_val, val_pred)
                }

            # Cross-validation metrics
            cv = KFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(
                self.pipeline, X, y,
                cv=cv,
                scoring='neg_mean_absolute_error',
                n_jobs=-1
            )

            self.model_metrics['cross_validation'] = {
                'mae_mean': -np.mean(cv_scores),
                'mae_std': np.std(cv_scores)
            }

            # Print evaluation results
            print("\nModel Evaluation Results:")
            if 'validation' in self.model_metrics:
                print("\nValidation Metrics:")
                print(f"MAE: {self.model_metrics['validation']['mae']:.4f}")
                print(f"RMSE: {self.model_metrics['validation']['rmse']:.4f}")
                print(f"R²: {self.model_metrics['validation']['r2']:.4f}")

            print("\nCross-validation Metrics:")
            print(f"MAE: {self.model_metrics['cross_validation']['mae_mean']:.4f} ± "
                  f"{self.model_metrics['cross_validation']['mae_std']:.4f}")

        except Exception as e:
            print(f"Error in evaluate_model: {str(e)}")
            raise

    def run_pipeline(self, train_path="Train.csv", test_path="Test.csv"):
        """Run the complete pipeline"""
        try:
            # Load and analyze data
            self.load_and_analyze_data(train_path, test_path)

            # Prepare data
            X = self.train[self.feature_columns]
            y = self.train['Total']

            # Split data
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Create and train pipeline
            self.create_advanced_pipeline()
            self.pipeline.fit(X_train, y_train)

            # Evaluate model
            self.evaluate_model(X_train, y_train, X_val, y_val)

            # Generate predictions
            test_predictions = self.pipeline.predict(self.test[self.feature_columns])

            # If target was log-transformed, reverse transform predictions
            if self.target_transformed:
                test_predictions = np.expm1(test_predictions)

            # Create submission
            submission = pd.DataFrame({
                'ID': self.test['ID'],
                'Total': test_predictions
            })
            submission.to_csv('predictions.csv', index=False)
            print("\nPredictions saved to 'predictions.csv'")

            return submission

        except Exception as e:
            print(f"An error occurred in run_pipeline: {str(e)}")
            raise

if __name__ == "__main__":
    try:
        pipeline = ImprovedDiseasePredictionPipeline()
        pipeline.run_pipeline()
    except Exception as e:
        print(f"Error in main: {str(e)}")


Data Analysis Summary:
Total samples: 23848
Features: 9

Missing Values Summary:
ID                               0
Total                            1
Location                         0
Category_Health_Facility_UUID    0
Disease                          0
Month                            0
Year                             0
Transformed_Latitude             0
Transformed_Longitude            0
dtype: int64

Outliers Summary:
{'Total': 4609, 'Month': 0, 'Year': 0, 'Transformed_Latitude': 3564, 'Transformed_Longitude': 2112}

Target Variable Analysis:
count    23847.000000
mean         8.355600
std         28.076713
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max        489.000000
Name: Total, dtype: float64

Applied log transformation to target variable due to high skewness

Removed 120 rows with invalid or extreme values
An error occurred in run_pipeline: The estimator EnhancedXGBRegressor should be a regressor.
Error in main: The estimator E

In [None]:
# updating it model 3

In [21]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
import os
from datetime import datetime
warnings.filterwarnings('ignore')

class EnhancedRegressorBase(BaseEstimator, RegressorMixin):
    """Base class for enhanced regressors with robust error handling and logging"""
    def __init__(self, model_class, **params):
        self.params = params
        self.model = model_class(**self.params)
        self.feature_importance_ = None

    def fit(self, X, y):
        try:
            self.model.fit(X, y)
            if hasattr(self.model, 'feature_importances_'):
                self.feature_importance_ = self.model.feature_importances_
            return self
        except Exception as e:
            print(f"Error during fitting: {str(e)}")
            raise

    def predict(self, X):
        try:
            return self.model.predict(X)
        except Exception as e:
            print(f"Error during prediction: {str(e)}")
            raise

    def get_params(self, deep=True):
        return {**{"model_class": self.model.__class__}, **self.params}

    def set_params(self, **params):
        for param, value in params.items():
            if param == "model_class":
                continue
            self.params[param] = value
        self.model = self.model.__class__(**self.params)
        return self

class EnhancedXGBRegressor(EnhancedRegressorBase):
    def __init__(self, **params):
        super().__init__(XGBRegressor, **params)

class EnhancedLGBMRegressor(EnhancedRegressorBase):
    def __init__(self, **params):
        super().__init__(LGBMRegressor, **params)

class EnhancedCatBoostRegressor(EnhancedRegressorBase):
    def __init__(self, **params):
        super().__init__(CatBoostRegressor, **params)

class ImprovedDiseasePredictionPipeline:
    def __init__(self, base_dir='.'):
        self.train = None
        self.test = None
        self.pipeline = None
        self.feature_columns = None
        self.numeric_features = None
        self.categorical_features = None
        self.feature_importance = None
        self.model_metrics = {}
        self.target_transformed = False
        self.base_dir = base_dir

    def load_and_analyze_data(self, train_path="Train.csv", test_path="Test.csv",
                            waste_path="waste_management.csv", toilet_path="toilets.csv",
                            water_path="water_sources.csv"):
        try:
            # Convert relative paths to absolute paths
            train_path = os.path.join(self.base_dir, train_path)
            test_path = os.path.join(self.base_dir, test_path)
            waste_path = os.path.join(self.base_dir, waste_path)
            toilet_path = os.path.join(self.base_dir, toilet_path)
            water_path = os.path.join(self.base_dir, water_path)

            self.train = pd.read_csv(train_path)
            self.test = pd.read_csv(test_path)
            waste_data = pd.read_csv(waste_path)
            toilet_data = pd.read_csv(toilet_path)
            water_data = pd.read_csv(water_path)

            # Proper merge with error handling
            for df in [self.train, self.test]:
                df = df.merge(waste_data, on='ID', how='left')
                df = df.merge(toilet_data, on='ID', how='left')
                df = df.merge(water_data, on='ID', how='left')

            self._perform_statistical_analysis()
            self._clean_target_variable()
            self._setup_features()
            self._handle_data_quality()

            return self.train, self.test
        except Exception as e:
            print(f"Error in load_and_analyze_data: {str(e)}")
            raise

    def evaluate_and_predict(self, output_path=None):
        try:
            # Generate default output path in the same directory
            if output_path is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                output_path = os.path.join(self.base_dir, f"predictions_{timestamp}.csv")

            X = self.train[self.feature_columns]
            y = self.train['Total']
            X_test = self.test[self.feature_columns]

            # Fit the pipeline and generate predictions
            print("\nFitting model...")
            self.pipeline.fit(X, y)

            print("Generating predictions...")
            predictions = self.pipeline.predict(X_test)

            # Reverse log transformation if applied
            if self.target_transformed:
                predictions = np.expm1(predictions)

            # Prepare submission DataFrame
            submission_df = pd.DataFrame({
                'ID': self.test['ID'].astype(str) + "_2023_Diarrhea",
                'Target': predictions
            })

            # Save predictions
            submission_df.to_csv(output_path, index=False)
            print(f"\nPredictions saved to: {output_path}")
            print("\nFirst 10 predictions:")
            print(submission_df.head(10))

            # Evaluate model
            y_pred = self.pipeline.predict(X)
            if self.target_transformed:
                y_pred = np.expm1(y_pred)
                y = np.expm1(y)

            # Calculate and store metrics
            self.model_metrics.update({
                'MAE': mean_absolute_error(y, y_pred),
                'MSE': mean_squared_error(y, y_pred),
                'RMSE': np.sqrt(mean_squared_error(y, y_pred)),
                'R2': r2_score(y, y_pred)
            })

            # Print evaluation metrics
            print("\nModel Evaluation Metrics:")
            for metric, value in self.model_metrics.items():
                print(f"{metric}: {value:.4f}")

            return submission_df

        except Exception as e:
            print(f"Error in evaluate_and_predict: {str(e)}")
            raise

# Example usage
if __name__ == "__main__":
    pipeline = ImprovedDiseasePredictionPipeline()
    pipeline.load_and_analyze_data()
    pipeline.create_advanced_pipeline()
    predictions = pipeline.evaluate_and_predict()

Error in load_and_analyze_data: [Errno 2] No such file or directory: './toilet.csv'


FileNotFoundError: [Errno 2] No such file or directory: './toilet.csv'