# Model7

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np

# Custom wrapper for XGBRegressor to ensure compatibility
class CustomXGBRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn style facade that forwards every call to an XGBRegressor.

    The wrapped estimator lives in ``self.model``; fitting, prediction and
    parameter access are all delegated to it unchanged.
    """

    def __init__(self, **params):
        """Build the underlying XGBRegressor from the given keyword arguments."""
        self.model = XGBRegressor(**params)

    def fit(self, X, y):
        """Fit the wrapped regressor on ``X``/``y`` and return this wrapper."""
        wrapped = self.model
        wrapped.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions

    def get_params(self, deep=True):
        """Return the wrapped regressor's parameters (not the wrapper's own)."""
        wrapped = self.model
        return wrapped.get_params(deep)

    def set_params(self, **params):
        """Apply ``params`` to the wrapped regressor and return this wrapper."""
        wrapped = self.model
        wrapped.set_params(**params)
        return self

class DiseasePredictionPipeline:
    """Load the train/test CSVs, clean them, fit an XGBoost pipeline and write predictions.

    The training file must contain a ``Total`` target column; the test file
    must contain an ``ID`` column. Every other training column (except ``ID``)
    is used as a feature.

    Args:
        train_path: Path of the training CSV. Defaults to ``"train.csv"``.
        test_path: Path of the test CSV. Defaults to ``"test.csv"``.
        output_path: Where the submission CSV is written. Defaults to
            ``"predictions.csv"``.
    """

    def __init__(self, train_path="train.csv", test_path="test.csv",
                 output_path="predictions.csv"):
        self.train_path = train_path
        self.test_path = test_path
        self.output_path = output_path
        self.train = None            # training DataFrame, set by load_data()
        self.test = None             # test DataFrame, set by load_data()
        self.pipeline = None         # sklearn Pipeline, set by create_pipeline()
        self.feature_columns = None  # list of feature column names

    def load_data(self):
        """Read both CSVs, drop rows with a non-finite target and fill missing features.

        Populates ``self.train``, ``self.test`` and ``self.feature_columns``.

        Raises:
            ValueError: If a feature column of the training set is absent
                from the test set.
        """
        # Load datasets
        self.train = pd.read_csv(self.train_path)
        self.test = pd.read_csv(self.test_path)

        print("Columns in training dataset:", self.train.columns)
        print("Columns in testing dataset:", self.test.columns)

        # Clean target variable
        print("\nInitial target variable statistics:")
        print(self.train['Total'].describe())

        # Keep only rows whose target is finite (drops NaN and +/-inf).
        # The row count is captured first so the report below is accurate.
        rows_before = len(self.train)
        finite_target = np.isfinite(self.train['Total'])
        # .copy() gives an independent frame, so the column assignments done
        # in _handle_missing_values() do not write into a filtered slice.
        self.train = self.train[finite_target].copy()
        rows_removed = rows_before - len(self.train)

        print("\nTarget variable statistics after cleaning:")
        print(self.train['Total'].describe())
        print(f"\nRemoved {rows_removed} rows with invalid Total values")

        # Define feature columns (excluding 'Total' and 'ID')
        self.feature_columns = [col for col in self.train.columns
                                if col not in ['Total', 'ID']]

        # Verify all feature columns exist in test dataset
        missing_cols = [col for col in self.feature_columns
                        if col not in self.test.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in test dataset: {missing_cols}")

        # Handle missing values separately for train and test
        self._handle_missing_values(self.train)
        self._handle_missing_values(self.test)

    def _handle_missing_values(self, df):
        """Fill missing feature values of ``df`` in place.

        Numerical (float64/int64) features: +/-inf is turned into NaN, then
        NaN is replaced by the column median. Categorical (object) features:
        NaN is replaced by the most frequent value. A categorical column with
        no observed value at all is left untouched, since it has no mode.

        Args:
            df: DataFrame containing every column in ``self.feature_columns``.
        """
        features = df[self.feature_columns]

        # Numerical columns
        numerical_cols = features.select_dtypes(
            include=['float64', 'int64']).columns
        for col in numerical_cols:
            # Replace infinite values with NaN first so they do not skew the median
            cleaned = df[col].replace([np.inf, -np.inf], np.nan)
            # Assign the result back instead of a chained in-place fillna,
            # which is a no-op under pandas Copy-on-Write.
            df[col] = cleaned.fillna(cleaned.median())

        # Categorical columns
        categorical_cols = features.select_dtypes(
            include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            if modes.empty:
                # Entirely missing column: nothing sensible to fill with.
                continue
            df[col] = df[col].fillna(modes.iloc[0])

    def create_pipeline(self):
        """Build ``self.pipeline``: column-wise preprocessing followed by the regressor.

        Feature columns are routed by the dtype they have in ``self.train``:
        float64/int64 columns are median-imputed and standardized, object
        columns are mode-imputed and one-hot encoded. Columns of any other
        dtype are passed to neither transformer.
        """
        # Define preprocessing for numerical features
        numerical_features = [col for col in self.feature_columns
                              if self.train[col].dtype in ['float64', 'int64']]
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_features = [col for col in self.feature_columns
                                if self.train[col].dtype == 'object']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Define the pipeline
        # NOTE(review): 'count:poisson' presumably requires non-negative
        # targets; load_data() only removes non-finite ones -- confirm the data.
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', CustomXGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
                objective='count:poisson',  # Better for count data
                tree_method='hist',         # Faster training
                min_child_weight=1,         # Help with many zero values
                subsample=0.8,              # Prevent overfitting
                colsample_bytree=0.8        # Prevent overfitting
            ))
        ])

    def run_pipeline(self):
        """Run the whole workflow: load, split, fit, evaluate, cross-validate, predict.

        Prints the hold-out MAE and the 5-fold cross-validated MAE, then
        writes an ``ID``/``Total`` submission CSV to ``self.output_path``.

        Raises:
            Exception: Any error from the steps above is reported on stdout
                and then re-raised unchanged.
        """
        try:
            # Load and preprocess data
            self.load_data()

            # Prepare training data
            X = self.train[self.feature_columns]
            y = self.train['Total']

            # Split the training data
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Create and train the pipeline
            self.create_pipeline()
            self.pipeline.fit(X_train, y_train)

            # Evaluate on validation set
            val_predictions = self.pipeline.predict(X_val)
            mae = np.mean(np.abs(val_predictions - y_val))
            print(f"Validation MAE: {mae}")

            # Cross-validation for robustness
            cv_scores = cross_val_score(
                self.pipeline, X, y,
                cv=5,
                scoring='neg_mean_absolute_error'
            )
            # Scores are negated MAE values, hence the sign flip on the mean.
            print(f"Cross-validation MAE: {-np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

            # Predict on test set
            test_predictions = self.pipeline.predict(self.test[self.feature_columns])

            # Create submission dataframe
            submission = pd.DataFrame({
                'ID': self.test['ID'],
                'Total': test_predictions
            })
            submission.to_csv(self.output_path, index=False)
            print(f"Predictions saved to '{self.output_path}'")

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

# Script entry point: runs only when this code is executed as the main module.
if __name__ == "__main__":
    # Build the workflow object and run it end to end
    # (load data, fit, evaluate, write the predictions file).
    pipeline = DiseasePredictionPipeline()
    pipeline.run_pipeline()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\User\anaconda3\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\User\anaconda3\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\User\anaconda3\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\User\anaconda3\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "c:\Users\User\anaconda3\lib\site-packages\

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import