# Model test

In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from scipy.spatial import cKDTree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.impute import KNNImputer
import skfuzzy as fuzz
from deap import base, creator, tools, algorithms
import warnings
import logging
from datetime import datetime
import os

# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed for the whole process.
warnings.filterwarnings('ignore')
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

class ANFISLayer:
    """Forward-only five-layer ANFIS (adaptive neuro-fuzzy inference system).

    Parameters are initialised randomly and are expected to be tuned by an
    external optimiser that overwrites ``membership_params`` and
    ``consequence_params`` directly.

    Attributes:
        n_inputs: Number of input features.
        n_rules: Number of fuzzy rules.
        learning_rate: Stored for callers; not used by any method of this class.
        membership_params: Array of shape ``(n_inputs, n_rules, 3)`` holding
            ``(center, width, type)`` for every input/rule pair.
        consequence_params: Array of shape ``(n_rules, n_inputs + 1)`` holding
            the linear consequent of each rule (last column is the bias).
    """

    def __init__(self, n_inputs, n_rules, learning_rate=0.01):
        self.n_inputs = n_inputs
        self.n_rules = n_rules
        self.learning_rate = learning_rate
        self.membership_params = np.random.randn(n_inputs, n_rules, 3)  # center, width, type
        self.consequence_params = np.random.randn(n_rules, n_inputs + 1)

    def membership_function(self, x, params):
        """Evaluate one membership function on a 1-D array of inputs.

        Args:
            x: Values of a single input feature.
            params: ``(center, width, mf_type)`` triple.

        Returns:
            Membership degrees, one per element of ``x``. A Gaussian
            (``fuzz.gaussmf``) is used when ``mf_type > 0.5``; otherwise a
            generalised bell (``fuzz.gbellmf``) with slope 2.
        """
        center, width, mf_type = params
        if mf_type > 0.5:  # Gaussian
            return fuzz.gaussmf(x, center, width)
        else:  # Bell-shaped
            return fuzz.gbellmf(x, width, 2, center)

    def forward(self, X):
        """Compute the ANFIS output for every row of ``X``.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_inputs)``. pandas
                DataFrames are accepted and converted to a float ndarray.

        Returns:
            1-D ndarray of length ``n_samples``.

        Raises:
            ValueError: If ``X`` is not 2-D or its column count differs from
                ``n_inputs``.
        """
        # Positional slicing such as X[:, i] is not supported by DataFrames,
        # so normalise the input to a plain ndarray first.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_inputs:
            raise ValueError(
                f"X must have shape (n_samples, {self.n_inputs}); got {X.shape}"
            )

        # Layer 1: Fuzzification
        membership_values = np.zeros((X.shape[0], self.n_inputs, self.n_rules))
        for i in range(self.n_inputs):
            for j in range(self.n_rules):
                membership_values[:, i, j] = self.membership_function(
                    X[:, i], self.membership_params[i, j]
                )

        # Layer 2: Rules (product of the memberships over all inputs)
        rule_outputs = np.prod(membership_values, axis=1)

        # Layer 3: Normalization (epsilon avoids 0/0 when no rule fires)
        normalized_firing_strengths = rule_outputs / (
            np.sum(rule_outputs, axis=1, keepdims=True) + 1e-10
        )

        # Layer 4: Consequence (linear model per rule, bias via a ones column)
        extended_X = np.column_stack([X, np.ones(X.shape[0])])
        consequent_outputs = np.dot(extended_X, self.consequence_params.T)

        # Layer 5: Output (firing-strength-weighted sum of rule consequents)
        final_output = np.sum(normalized_firing_strengths * consequent_outputs, axis=1)
        return final_output

class GeneticOptimizer:
    """Tune an ANFIS layer's parameters with a DEAP genetic algorithm.

    Attributes:
        population_size: Number of individuals per generation.
        n_generations: Number of generations to evolve.
    """

    def __init__(self, population_size=50, n_generations=30):
        self.population_size = population_size
        self.n_generations = n_generations
        # DEAP's ``creator`` registers classes in a module-global namespace;
        # only create them once so repeated instantiation does not redefine
        # (and warn about) already-registered classes.
        if not hasattr(creator, "FitnessMin"):
            creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.FitnessMin)

    def optimize(self, anfis, X, y):
        """Evolve ``anfis`` parameters to minimise mean absolute error on ``(X, y)``.

        Each individual is the flat concatenation of ``anfis.membership_params``
        followed by ``anfis.consequence_params``. On return, ``anfis`` holds the
        parameters of the best individual seen during the whole run.

        Args:
            anfis: Object exposing ``membership_params``, ``consequence_params``
                (ndarrays) and ``forward(X)``.
            X: 2-D array-like of training features (DataFrames are converted).
            y: 1-D array-like of training targets.

        Returns:
            The best mean absolute error found, or ``None`` if no individual
            was evaluated (e.g. an empty population).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        membership_shape = anfis.membership_params.shape
        consequence_shape = anfis.consequence_params.shape
        n_membership = int(np.prod(membership_shape))
        n_params = n_membership + int(np.prod(consequence_shape))

        def apply_parameters(individual):
            # Split the flat genome back into the two parameter tensors.
            flat = np.asarray(individual, dtype=float)
            anfis.membership_params = flat[:n_membership].reshape(membership_shape)
            anfis.consequence_params = flat[n_membership:].reshape(consequence_shape)

        def evaluate(individual):
            apply_parameters(individual)
            predictions = anfis.forward(X)
            # A degenerate genome (e.g. zero width) can yield NaN/inf, which
            # the metric rejects; score it as the worst possible fitness.
            if not np.all(np.isfinite(predictions)):
                return (float("inf"),)
            return (mean_absolute_error(y, predictions),)

        toolbox = base.Toolbox()
        toolbox.register("attr_float", np.random.uniform, -1, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_float, n=n_params)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.1, indpb=0.1)
        toolbox.register("select", tools.selTournament, tournsize=3)

        # Evolution. ``evaluate`` mutates ``anfis`` on every call, so without
        # a hall of fame the model would end up with whichever genome was
        # evaluated last instead of the best one.
        population = toolbox.population(n=self.population_size)
        hall_of_fame = tools.HallOfFame(1)
        algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.3,
                            ngen=self.n_generations, halloffame=hall_of_fame,
                            verbose=False)

        if len(hall_of_fame) == 0:
            return None
        best = hall_of_fame[0]
        apply_parameters(best)
        return best.fitness.values[0]

class ImprovedDiseasePredictionPipeline:
    """End-to-end pipeline: preprocessing, hybrid model training, prediction.

    The final prediction blends the mean of the tree-based regressors with an
    ANFIS model whose parameters are tuned by a genetic algorithm.

    Attributes:
        experiment_name: MLflow experiment the runs are logged under.
        models: Mapping of model name to fitted regressor.
        scalers: Mapping of scaler name to scaler instance.
        label_encoders: Mapping of column name to its fitted ``LabelEncoder``.
        anfis: The trained ANFIS layer, or ``None`` before training.
    """

    # Blend weights applied to the tree-ensemble mean and the ANFIS output.
    ENSEMBLE_WEIGHT = 0.7
    ANFIS_WEIGHT = 0.3

    def __init__(self, experiment_name="disease_prediction"):
        self.experiment_name = experiment_name
        mlflow.set_experiment(experiment_name)
        self.models = {}
        self.scalers = {}
        self.label_encoders = {}
        self.anfis = None

    def load_and_preprocess_data(self, train_path, test_path, toilets_path,
                                 waste_path, water_path,
                                 target_col='Total', id_col='ID'):
        """Load the CSV files and return model-ready train/test frames.

        Imputation, scaling and label encoding are applied only to feature
        columns present in BOTH train and test; ``target_col`` and ``id_col``
        are left untouched so the target keeps its original scale and the
        identifiers can be written back to the submission file.

        Args:
            train_path: CSV with training rows (including ``target_col``).
            test_path: CSV with rows to predict.
            toilets_path: CSV of toilet locations.
            waste_path: CSV of waste-management locations.
            water_path: CSV of water-source locations.
            target_col: Name of the target column in the training data.
            id_col: Name of the row-identifier column.

        Returns:
            ``(train, test)`` as preprocessed DataFrames.
        """
        with mlflow.start_run(run_name="data_preprocessing"):
            # Load datasets
            train = pd.read_csv(train_path)
            test = pd.read_csv(test_path)
            auxiliary = {
                'toilets': pd.read_csv(toilets_path),
                'waste': pd.read_csv(waste_path),
                'water': pd.read_csv(water_path),
            }

            # Log data statistics
            mlflow.log_param("train_shape", train.shape)
            mlflow.log_param("test_shape", test.shape)

            # Clean and preprocess data
            for aux_df in auxiliary.values():
                aux_df.drop(columns=['Year', 'Month'], inplace=True, errors='ignore')

            # Advanced feature engineering
            for frame in (train, test):
                self._create_temporal_features(frame)
                self._create_spatial_features(frame, auxiliary)

            # Only transform columns that exist in both frames; the target is
            # absent from test, and ids must not be encoded or scaled.
            excluded = {target_col, id_col}
            shared = [col for col in train.columns
                      if col in test.columns and col not in excluded]

            numeric_columns = list(
                train[shared].select_dtypes(include=[np.number]).columns
            )

            # Always register the scalers, as the original did ('robust' is
            # created for callers but not applied here).
            self.scalers['standard'] = StandardScaler()
            self.scalers['robust'] = RobustScaler()

            if numeric_columns:
                # Handle missing values using KNN imputation
                imputer = KNNImputer(n_neighbors=5)
                train[numeric_columns] = imputer.fit_transform(train[numeric_columns])
                test[numeric_columns] = imputer.transform(test[numeric_columns])

                # Scale features
                scaler = self.scalers['standard']
                train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
                test[numeric_columns] = scaler.transform(test[numeric_columns])

            # Encode categorical variables. The encoder is fitted on the union
            # of train and test values so labels that appear only in test do
            # not raise; values are stringified so NaN becomes its own class.
            categorical_columns = train[shared].select_dtypes(include=['object']).columns
            for col in categorical_columns:
                encoder = LabelEncoder()
                train_values = train[col].astype(str)
                test_values = test[col].astype(str)
                encoder.fit(pd.concat([train_values, test_values], ignore_index=True))
                train[col] = encoder.transform(train_values)
                test[col] = encoder.transform(test_values)
                self.label_encoders[col] = encoder

            return train, test

    def _create_temporal_features(self, df):
        """Add ``Month``, ``Season`` and ``DayOfYear`` columns in place.

        Does nothing when ``df`` has no ``Date`` column. ``Season`` maps
        Dec-Feb to 1, Mar-May to 2, Jun-Aug to 3 and Sep-Nov to 4.
        NOTE(review): the parsed ``Date`` column is left in the frame as
        datetime; confirm downstream models can accept or drop it.
        """
        if 'Date' in df.columns:
            df['Date'] = pd.to_datetime(df['Date'])
            df['Month'] = df['Date'].dt.month
            df['Season'] = df['Date'].dt.month % 12 // 3 + 1
            df['DayOfYear'] = df['Date'].dt.dayofyear

    def _create_spatial_features(self, df, auxiliary_dfs):
        """Add nearest-neighbour distance features to ``df`` in place.

        For every auxiliary frame that has ``Latitude``/``Longitude`` columns,
        two columns are added: ``<name>_nearest_dist`` (distance to the closest
        point) and ``<name>_avg_3_nearest`` (mean distance to the up-to-3
        closest points). Distances are Euclidean in raw coordinate units.

        Args:
            df: Frame to augment; needs ``Latitude`` and ``Longitude`` columns.
            auxiliary_dfs: Either a dict mapping a feature-name prefix to a
                DataFrame, or an iterable of DataFrames (prefixed ``aux0``,
                ``aux1``, ... by position).
        """
        coord_cols = ['Latitude', 'Longitude']
        if not all(col in df.columns for col in coord_cols):
            logging.getLogger(__name__).warning(
                "Skipping spatial features: frame has no Latitude/Longitude columns"
            )
            return

        if isinstance(auxiliary_dfs, dict):
            named_frames = list(auxiliary_dfs.items())
        else:
            named_frames = [(f'aux{i}', aux) for i, aux in enumerate(auxiliary_dfs)]

        query_points = df[coord_cols].to_numpy(dtype=float)
        # The KD-tree rejects non-finite coordinates; such rows get NaN features.
        valid_rows = np.isfinite(query_points).all(axis=1)

        for name, aux_df in named_frames:
            if not all(col in aux_df.columns for col in coord_cols):
                continue

            reference = aux_df[coord_cols].to_numpy(dtype=float)
            reference = reference[np.isfinite(reference).all(axis=1)]

            nearest = np.full(len(df), np.nan)
            average = np.full(len(df), np.nan)
            if len(reference) and valid_rows.any():
                # Asking for more neighbours than exist yields infinite distances.
                k = min(3, len(reference))
                distances, _ = cKDTree(reference).query(query_points[valid_rows], k=k)
                # k == 1 returns a 1-D result; normalise to (n_rows, k).
                distances = np.asarray(distances).reshape(-1, k)
                nearest[valid_rows] = distances[:, 0]
                average[valid_rows] = distances.mean(axis=1)

            df[f'{name}_nearest_dist'] = nearest
            df[f'{name}_avg_3_nearest'] = average

    def _blend_predictions(self, X):
        """Return the weighted blend of the tree-ensemble mean and ANFIS output.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if not self.models or self.anfis is None:
            raise RuntimeError(
                "Pipeline is not trained; call train_hybrid_model() first"
            )
        ensemble_mean = np.mean(
            [np.asarray(model.predict(X), dtype=float) for model in self.models.values()],
            axis=0,
        )
        # ANFIS works on positional ndarrays, not DataFrames.
        anfis_predictions = self.anfis.forward(np.asarray(X, dtype=float))
        return self.ENSEMBLE_WEIGHT * ensemble_mean + self.ANFIS_WEIGHT * anfis_predictions

    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        """Fit the base regressors and the GA-tuned ANFIS, then validate.

        Per-model validation MAE and the blended MAE/RMSE/R2 are logged to
        MLflow. The trained ANFIS is stored on ``self.anfis`` so ``predict``
        can reuse it.

        Returns:
            ``(mae, rmse, r2)`` of the blended predictions on the validation set.
        """
        with mlflow.start_run(run_name="model_training"):
            # Initialize base models
            self.models['rf'] = RandomForestRegressor(n_estimators=200, max_depth=15)
            self.models['xgb'] = XGBRegressor(n_estimators=200, learning_rate=0.05)
            self.models['gbm'] = GradientBoostingRegressor(n_estimators=200)

            # Train base models
            for name, model in self.models.items():
                model.fit(X_train, y_train)
                val_pred = model.predict(X_val)
                mlflow.log_metric(f"{name}_mae", mean_absolute_error(y_val, val_pred))

            # Initialize and train ANFIS-GA hybrid on plain arrays
            train_array = np.asarray(X_train, dtype=float)
            target_array = np.asarray(y_train, dtype=float)
            anfis = ANFISLayer(n_inputs=train_array.shape[1], n_rules=5)
            GeneticOptimizer().optimize(anfis, train_array, target_array)
            self.anfis = anfis

            # Weighted combination of ensemble mean and ANFIS
            final_predictions = self._blend_predictions(X_val)

            # Log metrics
            mae = mean_absolute_error(y_val, final_predictions)
            rmse = np.sqrt(mean_squared_error(y_val, final_predictions))
            r2 = r2_score(y_val, final_predictions)

            mlflow.log_metrics({
                "final_mae": mae,
                "final_rmse": rmse,
                "final_r2": r2
            })

            return mae, rmse, r2

    def predict(self, X_test):
        """Return blended predictions for ``X_test``.

        Raises:
            RuntimeError: If ``train_hybrid_model`` has not been called.
        """
        return self._blend_predictions(X_test)

    def save_predictions(self, predictions, ids, output_path):
        """Write an ``ID``/``Target`` CSV to ``output_path`` and log it to MLflow."""
        submission = pd.DataFrame({
            'ID': ids,
            'Target': predictions
        })
        submission.to_csv(output_path, index=False)
        mlflow.log_artifact(output_path)

def main():
    """Run the whole workflow: preprocess, train, report, predict, export.

    Reads the five CSV files from the working directory, holds out 20% of the
    training rows for validation, prints the validation metrics, and writes a
    timestamped predictions CSV.
    """
    runner = ImprovedDiseasePredictionPipeline()

    # Build model-ready frames from the raw CSV files
    train, test = runner.load_and_preprocess_data(
        "Train.csv",
        "Test.csv",
        "toilets.csv",
        "waste_management.csv",
        "water_sources.csv",
    )

    # Separate features from the target, then carve out the validation split
    features = train.drop(columns=['ID', 'Total'])
    target = train['Total']
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Fit the hybrid model and report how it does on the held-out rows
    scores = runner.train_hybrid_model(X_train, y_train, X_val, y_val)
    mae, rmse, r2 = scores
    print(f"Validation MAE: {mae:.4f}")
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Validation R2: {r2:.4f}")

    # Score the test rows (identifier column is not a feature)
    test_features = test.drop(columns=['ID'])
    predictions = runner.predict(test_features)

    # Export under a name carrying the current local timestamp
    output_path = f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    runner.save_predictions(predictions, test['ID'], output_path)

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject