<a href="https://colab.research.google.com/github/Simacoder/data-phandas-outbreak-challenge/blob/main/Model7%20updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model7

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np

# Custom wrapper for XGBRegressor to ensure compatibility
class CustomXGBRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, **params):
        self.model = XGBRegressor(**params)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def get_params(self, deep=True):
        return self.model.get_params(deep)

    def set_params(self, **params):
        self.model.set_params(**params)
        return self

class DiseasePredictionPipeline:
    def __init__(self):
        self.train = None
        self.test = None
        self.pipeline = None
        self.feature_columns = None

    def load_data(self):
        # Load datasets
        self.train = pd.read_csv("Train.csv")
        self.test = pd.read_csv("Test.csv")

        print("Columns in training dataset:", self.train.columns)
        print("Columns in testing dataset:", self.test.columns)

        # Clean target variable
        print("\nInitial target variable statistics:")
        print(self.train['Total'].describe())

        # Remove rows where Total is NaN, infinite, or extremely large
        self.train = self.train[
            np.isfinite(self.train['Total'])
        ]

        print("\nTarget variable statistics after cleaning:")
        print(self.train['Total'].describe())
        print(f"\nRemoved {len(self.train) - len(self.train)} rows with invalid Total values")

        # Define feature columns (excluding 'Total' and 'ID')
        self.feature_columns = [col for col in self.train.columns
                              if col not in ['Total', 'ID']]

        # Verify all feature columns exist in test dataset
        missing_cols = [col for col in self.feature_columns
                       if col not in self.test.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in test dataset: {missing_cols}")

        # Handle missing values separately for train and test
        self._handle_missing_values(self.train)
        self._handle_missing_values(self.test)

    def _handle_missing_values(self, df):
        """Handle missing values for a given dataframe"""
        # Numerical columns
        numerical_cols = df[self.feature_columns].select_dtypes(
            include=['float64', 'int64']).columns
        for col in numerical_cols:
            # Replace infinite values with NaN first
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            # Then fill NaN with median
            df[col].fillna(df[col].median(), inplace=True)

        # Categorical columns
        categorical_cols = df[self.feature_columns].select_dtypes(
            include=['object']).columns
        for col in categorical_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)

    def create_pipeline(self):
        # Define preprocessing for numerical features
        numerical_features = [col for col in self.feature_columns
                            if self.train[col].dtype in ['float64', 'int64']]
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_features = [col for col in self.feature_columns
                              if self.train[col].dtype == 'object']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Define the pipeline
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', CustomXGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
                objective='count:poisson',  # Better for count data
                tree_method='hist',         # Faster training
                min_child_weight=1,         # Help with many zero values
                subsample=0.8,              # Prevent overfitting
                colsample_bytree=0.8        # Prevent overfitting
            ))
        ])

    def run_pipeline(self):
        try:
            # Load and preprocess data
            self.load_data()

            # Prepare training data
            X = self.train[self.feature_columns]
            y = self.train['Total']

            # Split the training data
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Create and train the pipeline
            self.create_pipeline()
            self.pipeline.fit(X_train, y_train)

            # Evaluate on validation set
            val_predictions = self.pipeline.predict(X_val)
            mae = np.mean(np.abs(val_predictions - y_val))
            print(f"Validation MAE: {mae}")

            # Cross-validation for robustness
            cv_scores = cross_val_score(
                self.pipeline, X, y,
                cv=5,
                scoring='neg_mean_absolute_error'
            )
            print(f"Cross-validation MAE: {-np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

            # Predict on test set
            test_predictions = self.pipeline.predict(self.test[self.feature_columns])

            # Create submission dataframe
            submission = pd.DataFrame({
                'ID': self.test['ID'],
                'Total': test_predictions
            })
            submission.to_csv('predictions.csv', index=False)
            print("Predictions saved to 'predictions.csv'")

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    pipeline = DiseasePredictionPipeline()
    pipeline.run_pipeline()


Columns in training dataset: Index(['ID', 'Total', 'Location', 'Category_Health_Facility_UUID', 'Disease',
       'Month', 'Year', 'Transformed_Latitude', 'Transformed_Longitude'],
      dtype='object')
Columns in testing dataset: Index(['Location', 'Disease', 'Month', 'Category_Health_Facility_UUID', 'Year',
       'Transformed_Latitude', 'Transformed_Longitude', 'ID'],
      dtype='object')

Initial target variable statistics:
count    23847.000000
mean         8.355600
std         28.076713
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max        489.000000
Name: Total, dtype: float64

Target variable statistics after cleaning:
count    23847.000000
mean         8.355600
std         28.076713
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max        489.000000
Name: Total, dtype: float64

Removed 0 rows with invalid Total values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

Validation MAE: 9.90429589219813




Cross-validation MAE: 11.14 ± 2.23
Predictions saved to 'predictions.csv'




In [None]:
# Second model

In [None]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [1]:
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class EnhancedRegressorBase(BaseEstimator, RegressorMixin):
    """Base class for enhanced regressors with robust error handling and logging"""
    def __init__(self, model_class, **params):
        self.params = params
        self.model = model_class(**self.params)
        self.feature_importance_ = None

    def fit(self, X, y):
        try:
            self.model.fit(X, y)
            if hasattr(self.model, 'feature_importances_'):
                self.feature_importance_ = self.model.feature_importances_
            return self
        except Exception as e:
            print(f"Error during fitting: {str(e)}")
            raise

    def predict(self, X):
        try:
            return self.model.predict(X)
        except Exception as e:
            print(f"Error during prediction: {str(e)}")
            raise

    def get_params(self, deep=True):
        return {**{"model_class": self.model.__class__}, **self.params}

    def set_params(self, **params):
        for param, value in params.items():
            if param == "model_class":
                continue
            self.params[param] = value
        self.model.set_params(**self.params)
        return self

class EnhancedXGBRegressor(EnhancedRegressorBase):
    """Enhanced XGBoost regressor with error handling"""
    def __init__(self, **params):
        super().__init__(XGBRegressor, **params)

class EnhancedLGBMRegressor(EnhancedRegressorBase):
    """Enhanced LightGBM regressor with error handling"""
    def __init__(self, **params):
        super().__init__(LGBMRegressor, **params)

class EnhancedCatBoostRegressor(EnhancedRegressorBase):
    """Enhanced CatBoost regressor with error handling"""
    def __init__(self, **params):
        super().__init__(CatBoostRegressor, **params)

class ImprovedDiseasePredictionPipeline:
    """Main pipeline class for disease prediction"""
    def __init__(self):
        self.train = None
        self.test = None
        self.pipeline = None
        self.feature_columns = None
        self.numeric_features = None
        self.categorical_features = None
        self.feature_importance = None
        self.model_metrics = {}
        self.target_transformed = False

    def load_and_analyze_data(self, waste_path="waste_management.csv",
                              toilet_path="toilet.csv", water_path="water_sources.csv"):
        """Load and perform initial statistical analysis of the data"""
        try:
            # Load datasets

            self.test = pd.read_csv(waste_path)
            self.test = pd.read_csv(toilet_path)
            self.test = pd.read_csv(water_path)

            # Statistical analysis
            self._perform_statistical_analysis()

            # Clean target variable
            self._clean_target_variable()

            # Define and verify features
            self._setup_features()

            # Handle missing and anomalous values
            self._handle_data_quality()

            return self.train, self.test

        except Exception as e:
            print(f"Error in load_and_analyze_data: {str(e)}")
            raise

    def _perform_statistical_analysis(self):
        """Perform comprehensive statistical analysis"""
        try:
            stats_report = {
                'basic_stats': self.train.describe(),
                'missing_values': self.train.isnull().sum(),
                'skewness': self.train.select_dtypes(include=[np.number]).skew(),
                'kurtosis': self.train.select_dtypes(include=[np.number]).kurtosis()
            }

            # Detect outliers using IQR method
            numeric_cols = self.train.select_dtypes(include=[np.number]).columns
            outliers_report = {}
            for col in numeric_cols:
                Q1 = self.train[col].quantile(0.25)
                Q3 = self.train[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = ((self.train[col] < (Q1 - 1.5 * IQR)) |
                           (self.train[col] > (Q3 + 1.5 * IQR))).sum()
                outliers_report[col] = outliers

            stats_report['outliers'] = outliers_report

            # Print summary statistics
            print("\nData Analysis Summary:")
            print(f"Total samples: {len(self.train)}")
            print(f"Features: {len(self.train.columns)}")
            print("\nMissing Values Summary:")
            print(stats_report['missing_values'])
            print("\nOutliers Summary:")
            print(stats_report['outliers'])

            return stats_report

        except Exception as e:
            print(f"Error in _perform_statistical_analysis: {str(e)}")
            raise

    def _clean_target_variable(self):
        """Clean and transform target variable"""
        try:
            print("\nTarget Variable Analysis:")
            print(self.train['Total'].describe())

            # Remove invalid values (only remove extreme outliers)
            original_len = len(self.train)
            self.train = self.train[
                (np.isfinite(self.train['Total'])) &
                (self.train['Total'] >= 0) &
                (self.train['Total'] <= self.train['Total'].quantile(0.995))  # Less aggressive outlier removal
            ]

            # Apply log transformation if highly skewed
            if stats.skew(self.train['Total']) > 1:
                self.train['Total'] = np.log1p(self.train['Total'])
                self.target_transformed = True
                print("\nApplied log transformation to target variable due to high skewness")

            print(f"\nRemoved {original_len - len(self.train)} rows with invalid or extreme values")

        except Exception as e:
            print(f"Error in _clean_target_variable: {str(e)}")
            raise

    def _setup_features(self):
        """Setup and verify features"""
        try:
            self.feature_columns = [col for col in self.train.columns
                                  if col not in ['Total', 'ID']]

            # Verify features
            missing_cols = [col for col in self.feature_columns
                           if col not in self.test.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in test dataset: {missing_cols}")

            # Analyze feature types
            self.numeric_features = self.train[self.feature_columns].select_dtypes(
                include=['float64', 'int64']).columns.tolist()
            self.categorical_features = self.train[self.feature_columns].select_dtypes(
                include=['object']).columns.tolist()

        except Exception as e:
            print(f"Error in _setup_features: {str(e)}")
            raise

    def _handle_data_quality(self):
        """Handle missing values and anomalies"""
        try:
            for df in [self.train, self.test]:
                # Handle numeric features
                for col in self.numeric_features:
                    # Replace infinite values
                    df[col] = df[col].replace([np.inf, -np.inf], np.nan)

                    # Handle outliers using winsorization
                    if col in df.columns:
                        q1 = df[col].quantile(0.01)
                        q3 = df[col].quantile(0.99)
                        df[col] = df[col].clip(q1, q3)

                # Handle categorical features
                for col in self.categorical_features:
                    if col in df.columns:
                        # Convert rare categories to 'Other'
                        value_counts = df[col].value_counts()
                        rare_categories = value_counts[value_counts < len(df) * 0.01].index
                        df[col] = df[col].replace(rare_categories, 'Other')

        except Exception as e:
            print(f"Error in _handle_data_quality: {str(e)}")
            raise

    def create_advanced_pipeline(self):
        """Create an advanced pipeline with robust preprocessing and stacking"""
        try:
            # Numeric preprocessing
            numeric_transformer = Pipeline(steps=[
                ('imputer', KNNImputer(n_neighbors=5)),
                ('scaler', RobustScaler()),
                ('power', PowerTransformer(standardize=True))
            ])

            # Categorical preprocessing
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ])

            # Create preprocessor
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, self.numeric_features),
                    ('cat', categorical_transformer, self.categorical_features)
                ]
            )

            # Define base models with correct initialization
            base_models = [
                ('xgb', EnhancedXGBRegressor(
                    n_estimators=200,
                    learning_rate=0.05,
                    max_depth=6,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42
                )),
                ('lgbm', EnhancedLGBMRegressor(
                    n_estimators=200,
                    learning_rate=0.05,
                    num_leaves=31,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42
                )),
                ('catboost', EnhancedCatBoostRegressor(
                    iterations=200,
                    learning_rate=0.05,
                    depth=6,
                    subsample=0.8,
                    random_state=42,
                    verbose=False
                ))
            ]

            # Create stacking model
            final_estimator = HuberRegressor()
            stacking = StackingRegressor(
                estimators=base_models,
                final_estimator=final_estimator,
                cv=5,
                n_jobs=-1
            )

            # Create final pipeline
            self.pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('stacking', stacking)
            ])

        except Exception as e:
            print(f"Error in create_advanced_pipeline: {str(e)}")
            raise

    def evaluate_model(self, X, y, X_val=None, y_val=None):
        """Evaluate model performance with detailed metrics"""
        try:
            if X_val is not None and y_val is not None:
                # Validation set metrics
                val_pred = self.pipeline.predict(X_val)
                self.model_metrics['validation'] = {
                    'mae': mean_absolute_error(y_val, val_pred),
                    'rmse': np.sqrt(mean_squared_error(y_val, val_pred)),
                    'r2': r2_score(y_val, val_pred)
                }

            # Cross-validation metrics
            cv = KFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(
                self.pipeline, X, y,
                cv=cv,
                scoring='neg_mean_absolute_error',
                n_jobs=-1
            )

            self.model_metrics['cross_validation'] = {
                'mae_mean': -np.mean(cv_scores),
                'mae_std': np.std(cv_scores)
            }

            # Print evaluation results
            print("\nModel Evaluation Results:")
            if 'validation' in self.model_metrics:
                print("\nValidation Metrics:")
                print(f"MAE: {self.model_metrics['validation']['mae']:.4f}")
                print(f"RMSE: {self.model_metrics['validation']['rmse']:.4f}")
                print(f"R²: {self.model_metrics['validation']['r2']:.4f}")

            print("\nCross-validation Metrics:")
            print(f"MAE: {self.model_metrics['cross_validation']['mae_mean']:.4f} ± "
                  f"{self.model_metrics['cross_validation']['mae_std']:.4f}")

        except Exception as e:
            print(f"Error in evaluate_model: {str(e)}")
            raise

    def run_pipeline(self, train_path="Train.csv", test_path="Test.csv"):
        """Run the complete pipeline"""
        try:
            # Load and analyze data
            self.load_and_analyze_data(train_path, test_path)

            # Prepare data
            X = self.train[self.feature_columns]
            y = self.train['Total']

            # Split data
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Create and train pipeline
            self.create_advanced_pipeline()
            self.pipeline.fit(X_train, y_train)

            # Evaluate model
            self.evaluate_model(X_train, y_train, X_val, y_val)

            # Generate predictions
            test_predictions = self.pipeline.predict(self.test[self.feature_columns])

            # If target was log-transformed, reverse transform predictions
            if self.target_transformed:
                test_predictions = np.expm1(test_predictions)

            # Create submission
            submission = pd.DataFrame({
                'ID': self.test['ID'],
                'Total': test_predictions
            })
            submission.to_csv('predictions.csv', index=False)
            print("\nPredictions saved to 'predictions.csv'")

            return submission

        except Exception as e:
            print(f"An error occurred in run_pipeline: {str(e)}")
            raise

if __name__ == "__main__":
    try:
        pipeline = ImprovedDiseasePredictionPipeline()
        pipeline.run_pipeline()
    except Exception as e:
        print(f"Error in main: {str(e)}")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



ModuleNotFoundError: No module named 'catboost'

In [None]:
# updating it model 3

In [None]:
!pip install xgboost==1.7.5
!pip install scikit-learn==1.2.2

Collecting xgboost==1.7.5
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.3
    Uninstalling xgboost-2.1.3:
      Successfully uninstalled xgboost-2.1.3
Successfully installed xgboost-1.7.5


Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.2.2 which is incompatible.[0m[31m
[0mSuccessfully 

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor

# Load datasets
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
toilets = pd.read_csv("toilets.csv")
waste_management = pd.read_csv("waste_management.csv")
water_sources = pd.read_csv("water_sources.csv")

# Combine train and test datasets for consistent preprocessing
hospital_data = pd.concat([train, test])

# Drop unnecessary columns from supplementary datasets
for df in [toilets, waste_management, water_sources]:
    df.drop(columns=['Year', 'Month'], inplace=True)

# Rename columns for clarity
def rename_columns(df, prefix):
    for col in df.columns:
        if col not in ['Month_Year_lat_lon', 'lat_lon']:
            df.rename(columns={col: f"{prefix}_{col}"}, inplace=True)

rename_columns(toilets, "toilet")
rename_columns(waste_management, "waste")
rename_columns(water_sources, "water")

# Ensure unique identifier columns exist in all supplementary datasets



# Fill missing values in the 'Total' column
hospital_data['Total'].fillna(0, inplace=True)

# Drop rows with missing latitude and longitude in water sources
water_sources.dropna(subset=['water_Transformed_Latitude'], inplace=True)

# Function to find nearest locations
def find_nearest(hospital_data, location_df, lat_col, lon_col, id_col):
    # Create a cKDTree for efficient nearest neighbour search
    tree = cKDTree(location_df[[lat_col, lon_col]].values)
    nearest = {}
    # Loop through each hospital and find the nearest site in location_df
    for _, row in hospital_data.iterrows():
        _, idx = tree.query([row['Transformed_Latitude'], row['Transformed_Longitude']])
        nearest[row['ID']] = location_df.iloc[idx][id_col]
    return nearest

for df, prefix in [(toilets, 'toilet'), (waste_management, 'waste'), (water_sources, 'water')]:
   df[f"{prefix}_Month_Year_lat_lon"] = (
      df[f"{prefix}_Month_Year"] + '_' +
      df[f"{prefix}_Transformed_Latitude"].astype(str) + '_' +
      df[f"{prefix}_Transformed_Longitude"].astype(str)
    )


# Merge datasets with nearest locations
merged_data = hospital_data.copy()
datasets = [
    (toilets, 'toilet', 'toilet_Month_Year_lat_lon'),
    (waste_management, 'waste', 'waste_Month_Year_lat_lon'),
    (water_sources, 'water', 'water_Month_Year_lat_lon'),
]

for df, prefix, id_col in datasets:
    nearest = find_nearest(merged_data, df, f"{prefix}_Transformed_Latitude", f"{prefix}_Transformed_Longitude", id_col)
    nearest_df = pd.DataFrame(list(nearest.items()), columns=['ID', id_col])
    merged_data = merged_data.merge(nearest_df, on="ID").merge(df, on=id_col)

# Split merged data into train and test sets
train_df = merged_data[merged_data['Year'] < 2023]
test_df = merged_data[merged_data['Year'] == 2023]

# Specify the target column
target_column = 'Total'

# Feature and target split
X = train_df.drop(columns=[target_column, 'ID', 'Location'], errors='ignore')
y = train_df[target_column]

# Handle categorical features
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Random Forest
rf = RandomForestRegressor(random_state=42)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# Hyperparameter tuning for XGBoost
xgb = XGBRegressor(random_state=42, verbosity=0)
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

# Create a hybrid model using Voting Regressor
hybrid_model = VotingRegressor([('RandomForest', best_rf), ('XGBoost', best_xgb)])
hybrid_model.fit(X_train, y_train)

# Make predictions
y_pred = hybrid_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Prepare test data
X_test_final = test_df.drop(columns=['Total', 'ID', 'Location'], errors='ignore')

# Handle categorical features in test data
for col in categorical_cols:
    if col in X_test_final.columns:
        le = LabelEncoder()
        X_test_final[col] = le.fit_transform(X_test_final[col])

# Align test dataset with training features
for col in X.columns:
    if col not in X_test_final.columns:
        X_test_final[col] = 0  # Add missing feature with default value (e.g., zero)

# Ensure columns are in the same order as training
X_test_final = X_test_final[X.columns]

# Make predictions on test data
predictions = hybrid_model.predict(X_test_final)

# Save predictions to a CSV file
test_predictions = pd.DataFrame({
    'ID': test_df['ID'],
    'Predictions': predictions
})

test_predictions.to_csv("test_predictions.csv", index=False)
print("Predictions saved to 'test_predictions.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hospital_data['Total'].fillna(0, inplace=True)


KeyboardInterrupt: 

In [None]:
# MLflow

In [3]:
!pip install mlflow scikit-fuzzy deap xgboost

Collecting mlflow
  Downloading mlflow-2.20.0-py3-none-any.whl.metadata (30 kB)
Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting deap
  Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting mlflow-skinny==2.20.0 (from mlflow)
  Downloading mlflow_skinny-2.20.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.0->mlflow)
  Downloading databricks_sdk-0.41.0-py3-none-any.whl.metadat

In [6]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from datetime import datetime
import logging
import os

# [Additional imports if needed]

def identify_target_column(train_path):
    """
    Identify the target column dynamically.

    Args:
        train_path (str): Path to the training dataset.

    Returns:
        str: Name of the target column.
    """
    # Potential target column names
    potential_targets = [
        'Disease_Cases', 'Cases', 'Target', 'Outcome',
        'Prediction', 'Label', 'Result'
    ]

    train = pd.read_csv(train_path)

    # Log available columns for debugging
    logging.info(f"Available columns in train dataset: {list(train.columns)}")

    # Check for predefined target columns
    for col in potential_targets:
        if col in train.columns:
            logging.info(f"Found target column: {col}")
            return col

    # Check for numeric columns as fallback
    numeric_columns = train.select_dtypes(include=[np.number]).columns
    numeric_target_candidates = [
        col for col in numeric_columns
        if col not in ['ID', 'Latitude', 'Longitude']
    ]

    if numeric_target_candidates:
        target_column = numeric_target_candidates[0]
        logging.warning(f"No explicit target column found. Using: {target_column}")
        return target_column

    raise ValueError("No suitable target column found in the dataset.")

class ImprovedDiseasePredictionPipeline:
    def __init__(self, experiment_name, target_column):
        self.experiment_name = experiment_name
        self.target_column = target_column
        # Initialize additional attributes if needed

    def load_and_preprocess_data(self, train_path, test_path, toilets_path, waste_path, water_path):
        """
        Load and preprocess datasets.
        """
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        # Load additional datasets and merge if required
        # Preprocessing steps
        return train, test

    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        """
        Train the hybrid model.
        """
        # Example training process
        mae, rmse, r2 = 0.0, 0.0, 0.0  # Replace with actual model training logic
        return mae, rmse, r2

    def predict(self, X_test):
        """
        Generate predictions for the test set.
        """
        # Replace with model prediction logic
        predictions = np.zeros(len(X_test))  # Placeholder
        return predictions

    def save_predictions(self, predictions, ids, output_path):
        """
        Save predictions to a CSV file.
        """
        output = pd.DataFrame({'ID': ids, 'Prediction': predictions})
        output.to_csv(output_path, index=False)
        logging.info(f"Predictions saved to {output_path}")

    def save_model(self, model_path):
        """
        Save the trained model.
        """
        # Replace with actual model saving logic
        os.makedirs(model_path, exist_ok=True)
        logging.info(f"Model saved to {model_path}")

def main():
    """Main function to run the disease prediction pipeline."""
    try:
        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('disease_prediction.log'),
                logging.StreamHandler()
            ]
        )

        # Define data paths
        data_paths = {
            'train': "Train.csv",
            'test': "Test.csv",
            'toilets': "toilets.csv",
            'waste': "waste_management.csv",
            'water': "water_sources.csv"
        }

        # Dynamically identify the target column
        target_column = identify_target_column(data_paths['train'])

        # Initialize the pipeline
        pipeline = ImprovedDiseasePredictionPipeline(
            experiment_name="disease_prediction_experiment",
            target_column=target_column
        )

        # Load and preprocess data
        logging.info("Loading and preprocessing data...")
        train, test = pipeline.load_and_preprocess_data(
            data_paths['train'],
            data_paths['test'],
            data_paths['toilets'],
            data_paths['waste'],
            data_paths['water']
        )

        # Split data for training
        logging.info("Splitting data for training...")
        try:
            X_train = train.drop(['ID', target_column], axis=1)
            y_train = train[target_column]
            X_val, X_test, y_val, y_test = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42
            )
        except KeyError as e:
            logging.error(f"Column missing: {e}")
            raise

        # Train the model
        logging.info("Training hybrid model...")
        mae, rmse, r2 = pipeline.train_hybrid_model(X_train, y_train, X_val, y_val)

        # Generate predictions for the test set
        logging.info("Generating predictions for the test set...")
        X_test = test.drop(['ID'], axis=1)
        predictions = pipeline.predict(X_test)

        # Save predictions
        output_path = f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        pipeline.save_predictions(predictions, test['ID'], output_path)

        # Save the trained model
        pipeline.save_model("models")

        logging.info("Pipeline execution completed successfully!")

    except Exception as e:
        logging.error(f"Pipeline execution failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()




In [7]:
# try

In [16]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
import logging
import os
from datetime import datetime

# Function to identify the target column
def identify_target_column(train_path):
    potential_targets = ['Disease_Cases', 'Cases', 'Target', 'Outcome', 'Prediction', 'Label', 'Result']
    train = pd.read_csv(train_path)
    logging.info(f"Available columns in train dataset: {list(train.columns)}")

    for col in potential_targets:
        if col in train.columns:
            logging.info(f"Found target column: {col}")
            return col

    numeric_columns = train.select_dtypes(include=[np.number]).columns
    numeric_target_candidates = [
        col for col in numeric_columns if col not in ['ID', 'Latitude', 'Longitude']
    ]

    if numeric_target_candidates:
        target_column = numeric_target_candidates[0]
        logging.warning(f"No explicit target column found. Using: {target_column}")
        return target_column

    raise ValueError("No suitable target column found in the dataset.")

class ImprovedDiseasePredictionPipeline:
    def __init__(self, experiment_name, target_column):
        self.experiment_name = experiment_name
        self.target_column = target_column

    def load_and_preprocess_data(self, train_path, test_path, toilets_path, waste_path, water_path):
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        # Add any necessary preprocessing here
        return train, test

    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        xgb = XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.05, random_state=42)
        gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4, learning_rate=0.05, random_state=42)
        rf = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=42)

        model = VotingRegressor(estimators=[
            ('xgb', xgb),
            ('gbr', gbr),
            ('rf', rf)
        ])

        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        r2 = r2_score(y_val, y_pred)

        logging.info(f"Validation MAE: {mae}, RMSE: {rmse}, R2: {r2}")
        return mae, rmse, r2

    def predict(self, X_test):
        predictions = self.model.predict(X_test)
        return predictions

    def save_predictions(self, predictions, ids, output_path):
        output = pd.DataFrame({'ID': ids, 'Prediction': predictions})
        output.to_csv(output_path, index=False)
        logging.info(f"Predictions saved to {output_path}")

    def save_model(self, model_path):
        os.makedirs(model_path, exist_ok=True)
        # Save models here
        logging.info(f"Model saved to {model_path}")

def main():
    try:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('disease_prediction.log'),
                logging.StreamHandler()
            ]
        )

        data_paths = {
            'train': "Train.csv",
            'test': "Test.csv",
            'toilets': "toilets.csv",
            'waste': "waste_management.csv",
            'water': "water_sources.csv"
        }

        target_column = identify_target_column(data_paths['train'])

        pipeline = ImprovedDiseasePredictionPipeline(
            experiment_name="disease_prediction_experiment",
            target_column=target_column
        )

        train, test = pipeline.load_and_preprocess_data(
            data_paths['train'],
            data_paths['test'],
            data_paths['toilets'],
            data_paths['waste'],
            data_paths['water']
        )

        logging.info("Splitting data for training...")
        X_train = train.drop(['ID', target_column], axis=1)
        y_train = train[target_column]
        X_val, X_test, y_val, y_test = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        logging.info("Training hybrid model...")
        mae, rmse, r2 = pipeline.train_hybrid_model(X_train, y_train, X_val, y_val)

        logging.info("Generating predictions for the test set...")
        X_test = test.drop(['ID'], axis=1)
        predictions = pipeline.predict(X_test)

        output_path = f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        pipeline.save_predictions(predictions, test['ID'], output_path)

        pipeline.save_model("models")

        logging.info("Pipeline execution completed successfully!")

    except Exception as e:
        logging.error(f"Pipeline execution failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()


ERROR:root:Pipeline execution failed: 'super' object has no attribute '__sklearn_tags__'


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [17]:
!pip install scikit-learn==1.1.3
!pip install xgboost==1.7.5

Collecting scikit-learn==1.1.3
  Downloading scikit_learn-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading scikit_learn-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.0/32.0 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.1.3 which is incompatible.
bigframes 1.33.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.1.3 which is incompatible.
imbalanced-learn 0.13.0 

Collecting xgboost==1.7.5
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m193.1/200.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:02[0m[31mERROR: Operation cancelled by user[0m[31m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m193.1/200.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:02[0m
[?25h

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
import logging
import os
from datetime import datetime

# Function to identify the target column
def identify_target_column(train_path):
    potential_targets = ['Disease_Cases', 'Cases', 'Target', 'Outcome', 'Prediction', 'Label', 'Result']
    train = pd.read_csv(train_path)
    logging.info(f"Available columns in train dataset: {list(train.columns)}")

    for col in potential_targets:
        if col in train.columns:
            logging.info(f"Found target column: {col}")
            return col

    numeric_columns = train.select_dtypes(include=[np.number]).columns
    numeric_target_candidates = [
        col for col in numeric_columns if col not in ['ID', 'Latitude', 'Longitude']
    ]

    if numeric_target_candidates:
        target_column = numeric_target_candidates[0]
        logging.warning(f"No explicit target column found. Using: {target_column}")
        return target_column

    raise ValueError("No suitable target column found in the dataset.")

class ImprovedDiseasePredictionPipeline:
    def __init__(self, experiment_name, target_column):
        self.experiment_name = experiment_name
        self.target_column = target_column

    def load_and_preprocess_data(self, train_path, test_path, toilets_path, waste_path, water_path):
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        # Add any necessary preprocessing here
        return train, test

    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        xgb = XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.05, random_state=42)
        gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4, learning_rate=0.05, random_state=42)
        rf = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=42)

        model = VotingRegressor(estimators=[
            ('xgb', xgb),
            ('gbr', gbr),
            ('rf', rf)
        ])

        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        r2 = r2_score(y_val, y_pred)

        logging.info(f"Validation MAE: {mae}, RMSE: {rmse}, R2: {r2}")
        return mae, rmse, r2

    def predict(self, X_test):
        predictions = self.model.predict(X_test)
        return predictions

    def save_predictions(self, predictions, ids, output_path):
        output = pd.DataFrame({'ID': ids, 'Prediction': predictions})
        output.to_csv(output_path, index=False)
        logging.info(f"Predictions saved to {output_path}")

    def save_model(self, model_path):
        os.makedirs(model_path, exist_ok=True)
        # Save models here
        logging.info(f"Model saved to {model_path}")

def main():
    try:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('disease_prediction.log'),
                logging.StreamHandler()
            ]
        )

        data_paths = {
            'train': "Train.csv",
            'test': "Test.csv",
            'toilets': "toilets.csv",
            'waste': "waste_management.csv",
            'water': "water_sources.csv"
        }

        target_column = identify_target_column(data_paths['train'])

        pipeline = ImprovedDiseasePredictionPipeline(
            experiment_name="disease_prediction_experiment",
            target_column=target_column
        )

        train, test = pipeline.load_and_preprocess_data(
            data_paths['train'],
            data_paths['test'],
            data_paths['toilets'],
            data_paths['waste'],
            data_paths['water']
        )

        logging.info("Splitting data for training...")
        X_train = train.drop(['ID', target_column], axis=1)
        y_train = train[target_column]
        X_val, X_test, y_val, y_test = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        logging.info("Training hybrid model...")
        mae, rmse, r2 = pipeline.train_hybrid_model(X_train, y_train, X_val, y_val)

        logging.info("Generating predictions for the test set...")
        X_test = test.drop(['ID'], axis=1)
        predictions = pipeline.predict(X_test)

        output_path = f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        pipeline.save_predictions(predictions, test['ID'], output_path)

        pipeline.save_model("models")

        logging.info("Pipeline execution completed successfully!")

    except Exception as e:
        logging.error(f"Pipeline execution failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()


ERROR:root:Pipeline execution failed: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Location: object, Category_Health_Facility_UUID: object, Disease: object


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Location: object, Category_Health_Facility_UUID: object, Disease: object