In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.base import BaseEstimator, RegressorMixin


class DataLoader:
    """Static helpers that read the three raw inputs of the pipeline.

    Every loader is best-effort: any exception is reported on stdout and
    ``None`` is returned instead of propagating the error.
    """

    @staticmethod
    def load_rainfall_data(file_path):
        """Read the rainfall CSV and average each rainfall column per ``YEAR``.

        Returns a DataFrame with one row per year (``YEAR`` as a regular
        column), or ``None`` if reading or aggregating fails.
        """
        averaged_columns = ('ANNUAL', 'June-September', 'Mar-May', 'Jan-Feb', 'Oct-Dec')
        try:
            raw_rainfall = pd.read_csv(file_path)
            per_year = raw_rainfall.groupby('YEAR').agg(dict.fromkeys(averaged_columns, 'mean'))
            yearly_rainfall = per_year.reset_index()
            print("Rainfall data loaded successfully")
            return yearly_rainfall
        except Exception as e:
            print(f"Error loading rainfall data: {e}")
            return None

    @staticmethod
    def load_canal_data(file_path):
        """Read the canal CSV and reduce it to a single row of summary statistics.

        Returns a one-row DataFrame (mean/std of canal flow plus means of the
        soil and aquifer columns), or ``None`` on any failure.
        """
        # (output column, source column, Series reduction to apply)
        stat_specs = (
            ('Canal_Flow_Mean', 'Canal Flow', 'mean'),
            ('Canal_Flow_Std', 'Canal Flow', 'std'),
            ('Soil_Moisture_Mean', 'Soil Moisture', 'mean'),
            ('Aquifer_Thickness_Mean', 'Aquifer Thickness', 'mean'),
            ('Hydraulic_Conductivity_Mean', 'Hydraulic Conductivity', 'mean'),
        )
        try:
            readings = pd.read_csv(file_path)
            summary = {}
            for out_name, source_name, reduction in stat_specs:
                summary[out_name] = [getattr(readings[source_name], reduction)()]
            canal_summary = pd.DataFrame(summary)
            print("Canal data loaded successfully")
            return canal_summary
        except Exception as e:
            print(f"Error loading canal data: {e}")
            return None

    @staticmethod
    def load_water_level_data(file_path):
        """Read the water-level workbook and reshape it to one row per well-year.

        For each year 2016-2023 the pre/post readings are coerced to numbers
        and averaged into ``Water_Level``; rows without a usable average are
        dropped. Returns the concatenated long-format DataFrame, or ``None``
        on failure (including the case where no year yields any rows).
        """
        # The post-season column is spelled "Post_" for these years and "Pst_" otherwise.
        post_overrides = {2022: 'Post_2022', 2023: 'Post_2023'}
        location_columns = ['Latitude', 'Longitude', 'Well_Depth']

        try:
            sheet = pd.read_excel(file_path)
            yearly_frames = []

            for year in range(2016, 2024):
                pre_col = f'Pre_{year}'
                post_col = post_overrides.get(year, f'Pst_{year}')

                if not (pre_col in sheet.columns and post_col in sheet.columns):
                    print(f"Warning: Missing data for year {year}")
                    continue

                readings = sheet[location_columns + [pre_col, post_col]].copy()

                # Blank cells are stored as a single space; treat them (and any
                # other non-numeric text) as missing.
                for reading_col in (pre_col, post_col):
                    cleaned = readings[reading_col].replace(' ', np.nan)
                    readings[reading_col] = pd.to_numeric(cleaned, errors='coerce')

                readings['Year'] = year
                readings['Water_Level'] = (readings[pre_col] + readings[post_col]) / 2
                readings = readings.dropna(subset=['Water_Level'])

                if len(readings) == 0:
                    continue
                yearly_frames.append(readings.drop(columns=[pre_col, post_col]))

            if not yearly_frames:
                raise ValueError("No valid data processed")

            combined = pd.concat(yearly_frames, ignore_index=True)
            print(f"Water level data loaded successfully. Total records: {len(combined)}")
            return combined

        except Exception as e:
            print(f"Error loading water level data: {e}")
            return None

class DataPreprocessor:
    """Join the loaded rainfall, canal and water-level tables into model inputs."""

    def __init__(self):
        # Not used by preprocess_data; kept so existing attribute access still works.
        self.scaler = StandardScaler()

    def preprocess_data(self, rainfall_df, canal_df, water_level_df):
        """Build the feature matrix ``X`` and target ``y``.

        Args:
            rainfall_df: Per-year rainfall table with a ``YEAR`` column and the
                five rainfall columns selected below.
            canal_df: Single-row table of canal statistics; the first row of
                every column is broadcast onto all samples.
            water_level_df: Long-format table with ``Year``, ``Latitude``,
                ``Longitude``, ``Well_Depth`` and ``Water_Level``.

        Returns:
            ``(X, y)`` on success, where ``X`` is a DataFrame of feature
            columns and ``y`` the ``Water_Level`` Series. ``(None, None)`` if
            any input is ``None`` or any step raises (the error is printed).
        """
        try:
            if rainfall_df is None or canal_df is None or water_level_df is None:
                raise ValueError("One or more input DataFrames are None")

            print("\nInitial shapes:")
            print(f"Rainfall data: {rainfall_df.shape}")
            print(f"Canal data: {canal_df.shape}")
            print(f"Water level data: {water_level_df.shape}")

            # reset_index returns new frames, so the casts below do not touch
            # the caller's objects.
            rainfall_df = rainfall_df.reset_index(drop=True)
            water_level_df = water_level_df.reset_index(drop=True)
            rainfall_df['YEAR'] = rainfall_df['YEAR'].astype(int)
            water_level_df['Year'] = water_level_df['Year'].astype(int)

            # Left join keeps every water-level sample even when there is no
            # rainfall record for its year.
            merged_data = pd.merge(
                water_level_df,
                rainfall_df[['YEAR', 'ANNUAL', 'June-September', 'Mar-May', 'Jan-Feb', 'Oct-Dec']],
                left_on='Year',
                right_on='YEAR',
                how='left'
            )

            for col in canal_df.columns:
                merged_data[col] = canal_df[col].iloc[0]

            # Fill gaps from neighbouring rows. DataFrame.ffill()/bfill() replace
            # fillna(method=...), which is deprecated in pandas 2.x.
            merged_data = merged_data.ffill().bfill()

            # Fixed calendar placeholders: every sample gets the same values.
            merged_data['MonthOfYear'] = 6
            merged_data['DayOfYear'] = 182

            # NOTE(review): the name filter picks up the Canal_* and Soil_*
            # statistics only; the Aquifer_* and Hydraulic_* columns merged in
            # above are not used as features. Confirm that is intended.
            feature_columns = [
                'Year', 'MonthOfYear', 'DayOfYear',
                'Latitude', 'Longitude', 'Well_Depth',
                'ANNUAL', 'June-September', 'Mar-May',
                'Jan-Feb', 'Oct-Dec'
            ] + [col for col in merged_data.columns if 'Canal' in col or 'Soil' in col]

            existing_columns = [col for col in feature_columns if col in merged_data.columns]

            X = merged_data[existing_columns].copy()
            y = merged_data['Water_Level'].copy()

            print("\nPreprocessing completed successfully")
            print(f"Final shapes - X: {X.shape}, y: {y.shape}")

            return X, y

        except Exception as e:
            print(f"Error in data preprocessing: {e}")
            return None, None

class HyperparameterTuner:
    """Small, fast hyperparameter searches for the two tree-based models.

    The selected parameter dicts are stored on the instance
    (``rf_best_params`` / ``xgb_best_params``) and returned by the tuning
    methods so they can be passed straight to the model constructors.
    """

    def __init__(self):
        self.rf_best_params = None
        self.xgb_best_params = None

    def tune_random_forest(self, X_train, y_train):
        """Grid-search a RandomForestRegressor with 3-fold cross-validation.

        Args:
            X_train: Training feature matrix.
            y_train: Training targets.

        Returns:
            dict of the best grid parameters plus ``random_state=42`` (the
            seed the search itself ran with), so a model built from the
            result is reproducible.
        """
        print("\nTuning Random Forest...")

        # Deliberately tiny grid: 2 x 2 combinations.
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2],
            'max_features': ['sqrt']
        }

        rf_model = RandomForestRegressor(random_state=42)

        grid_search = GridSearchCV(
            estimator=rf_model,
            param_grid=param_grid,
            cv=3,
            n_jobs=-1,
            verbose=1,
            scoring='neg_mean_squared_error'
        )

        grid_search.fit(X_train, y_train)
        self.rf_best_params = {**grid_search.best_params_, 'random_state': 42}

        print("Best RF Parameters:", self.rf_best_params)
        # best_score_ is negative MSE; report it as RMSE.
        print("Best RF Score:", np.sqrt(-grid_search.best_score_))

        return self.rf_best_params

    def tune_xgboost(self, X_train, y_train):
        """Pick the better of two fixed XGBoost configurations.

        Each candidate is fit on 80% of the data and scored by MSE on the
        remaining 20%, so a candidate is not rewarded for memorising the
        data it was trained on. With fewer than five samples there is not
        enough data to hold any out and scoring falls back to the training
        set.

        Args:
            X_train: Training feature matrix.
            y_train: Training targets.

        Returns:
            dict of constructor parameters for ``XGBRegressor`` (including
            ``random_state``). If every candidate fails, a default
            configuration is returned instead.
        """
        print("\nTuning XGBoost...")

        param_combinations = [
            {
                'n_estimators': 100,
                'max_depth': 3,
                'learning_rate': 0.01,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'min_child_weight': 1
            },
            {
                'n_estimators': 200,
                'max_depth': 5,
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'min_child_weight': 1
            }
        ]

        min_rows_for_holdout = 5
        if len(X_train) >= min_rows_for_holdout:
            X_fit, X_holdout, y_fit, y_holdout = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42
            )
        else:
            X_fit, X_holdout, y_fit, y_holdout = X_train, X_train, y_train, y_train

        best_score = float('inf')
        best_params = None

        for params in param_combinations:
            try:
                # Each candidate carries its own n_estimators; passing the
                # keyword a second time raises TypeError before any fitting.
                model = XGBRegressor(**params, random_state=42)
                model.fit(
                    X_fit, y_fit,
                    eval_set=[(X_holdout, y_holdout)],
                    verbose=False
                )

                score = mean_squared_error(y_holdout, model.predict(X_holdout))

                if score < best_score:
                    best_score = score
                    # Carry the seed along so the final model matches the
                    # configuration that was actually evaluated.
                    best_params = {**params, 'random_state': 42}

                print(f"Parameters: {params}")
                print(f"MSE Score: {score:.4f}")

            except Exception as e:
                # Best-effort search: report the failure and try the next candidate.
                print(f"Error with parameters {params}: {e}")
                continue

        if best_params is None:
            print("Warning: Using default XGBoost parameters")
            best_params = {
                'n_estimators': 100,
                'max_depth': 3,
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'min_child_weight': 1,
                'random_state': 42
            }

        print("\nBest XGBoost Parameters:", best_params)
        print("Best MSE Score:", best_score)

        self.xgb_best_params = best_params
        return self.xgb_best_params

    # def tune_lstm(self, X_train, y_train, X_val, y_val):
    #     """Quick LSTM tuning with fixed parameters"""
    #     print("\nConfiguring LSTM...")

    #     self.lstm_best_params = {
    #         'lstm_units_1': 64,
    #         'lstm_units_2': 32,
    #         'dense_units': 16,
    #         'dropout_1': 0.2,
    #         'dropout_2': 0.2,
    #         'learning_rate': 0.001,
    #         'batch_size': 32
    #     }

    #     print("Using default LSTM parameters:", self.lstm_best_params)
    #     return self.lstm_best_params


class ImprovedStackedEnsemble(BaseEstimator, RegressorMixin):
    """Two-level stacked regressor: Random Forest feeding XGBoost.

    Level 1 is a RandomForestRegressor fit on the raw features. Level 2 is
    an XGBRegressor fit on the same features with the level-1 prediction
    appended as one extra column. The level-2 output is the ensemble's
    prediction.

    NOTE: a third, LSTM-based level was prototyped for this class but is
    currently disabled; only the two tree-based levels are trained and used.

    NOTE(review): level 2 is trained on level-1 predictions made for the
    same rows level 1 was fit on. Out-of-fold predictions would avoid that
    optimism; left unchanged here.
    """

    def __init__(self, rf_params=None, xgb_params=None):
        """Create the two base models.

        Args:
            rf_params: Keyword arguments for ``RandomForestRegressor``.
                Defaults to ``{'random_state': 42}`` when empty or ``None``.
            xgb_params: Keyword arguments for ``XGBRegressor``. Same default.
        """
        self.rf_params = rf_params if rf_params else {'random_state': 42}
        self.xgb_params = xgb_params if xgb_params else {'random_state': 42}

        self.rf_model = RandomForestRegressor(**self.rf_params)
        self.xgb_model = XGBRegressor(**self.xgb_params)
        # Not used by any method of this class; kept for attribute compatibility.
        self.scaler = StandardScaler()

    def _stack_features(self, X, rf_predictions):
        """Append the level-1 predictions to X as one extra trailing column."""
        return np.column_stack([X, rf_predictions])

    def fit(self, X, y):
        """Train level 1 on ``X`` and level 2 on ``X`` plus level-1 output.

        Args:
            X: 2-D feature matrix.
            y: Target values, one per row of ``X``.

        Returns:
            self, to allow call chaining.
        """
        print("\nTraining Level 1 - Random Forest...")
        self.rf_model.fit(X, y)
        rf_predictions = self.rf_model.predict(X)

        print("Training Level 2 - XGBoost...")
        self.xgb_model.fit(self._stack_features(X, rf_predictions), y)

        return self

    def predict(self, X):
        """Return the level-2 (XGBoost) prediction for each row of ``X``.

        The Random Forest prediction is only an intermediate feature;
        returning it would bypass the second level entirely.
        """
        rf_predictions = self.rf_model.predict(X)
        return self.xgb_model.predict(self._stack_features(X, rf_predictions))

    def predict_future(self, X_last, years=5, start_year=2024, scaler=None):
        """Forecast one value per future year.

        For each of the ``years`` steps, the Year feature (column 0) of
        every row in ``X_last`` is set to the target year, all other
        features are held constant, and the mean prediction over the rows
        is recorded.

        Args:
            X_last: Feature rows to project forward, in the same
                representation the model was trained on. Not modified.
            years: Number of future years to forecast.
            start_year: Last observed year; forecasts cover
                ``start_year + 1`` through ``start_year + years``.
            scaler: Optional fitted scaler exposing ``mean_`` and ``scale_``
                arrays (e.g. a StandardScaler). When given, the year is
                standardised with column 0's statistics before being
                written, so it matches scaled training data. When omitted,
                the raw year is written.

        Returns:
            list of ``years`` floats, ordered by year. Empty if ``X_last``
            has no rows or ``years`` is not positive.

        NOTE: both levels are tree models, which cannot extrapolate beyond
        the range of years seen in training, so these forecasts are
        expected to be nearly flat.
        """
        projected = np.array(X_last, dtype=float, copy=True)
        if projected.ndim == 1:
            projected = projected.reshape(1, -1)
        if projected.shape[0] == 0 or years <= 0:
            return []

        future_predictions = []
        for step in range(1, years + 1):
            year_value = float(start_year + step)
            if scaler is not None:
                year_value = (year_value - scaler.mean_[0]) / scaler.scale_[0]
            projected[:, 0] = year_value
            future_predictions.append(float(np.mean(self.predict(projected))))
        return future_predictions

    def score(self, X, y):
        """Return the R² of ``predict(X)`` against ``y``."""
        return r2_score(y, self.predict(X))

    def evaluate(self, X_test, y_test):
        """Report MSE, R² and MAE for each level on a held-out set.

        Returns:
            dict with ``RF_*`` entries for level 1 alone and ``XGB_*``
            entries for level 2 (i.e. the ensemble output).
        """
        rf_predictions = self.rf_model.predict(X_test)
        xgb_predictions = self.xgb_model.predict(
            self._stack_features(X_test, rf_predictions)
        )

        return {
            'RF_MSE': mean_squared_error(y_test, rf_predictions),
            'RF_R2': r2_score(y_test, rf_predictions),
            'RF_MAE': mean_absolute_error(y_test, rf_predictions),
            'XGB_MSE': mean_squared_error(y_test, xgb_predictions),
            'XGB_R2': r2_score(y_test, xgb_predictions),
            'XGB_MAE': mean_absolute_error(y_test, xgb_predictions)
        }

def main(data_dir='/Users/aditya/Desktop/anoushka/test'):
    """Run the whole pipeline: load, preprocess, tune, train, evaluate, forecast.

    Args:
        data_dir: Directory containing ``Rainfall_Data_LL.csv``,
            ``CleanedCanalData.csv`` and ``Rajasthan_Water_Level.xlsx``.

    Any failure is reported on stdout with a traceback; nothing is raised
    and nothing is returned.
    """
    import os

    try:
        # Load data
        data_loader = DataLoader()
        print("\nLoading data...")
        rainfall_data = data_loader.load_rainfall_data(os.path.join(data_dir, 'Rainfall_Data_LL.csv'))
        canal_data = data_loader.load_canal_data(os.path.join(data_dir, 'CleanedCanalData.csv'))
        water_level_data = data_loader.load_water_level_data(os.path.join(data_dir, 'Rajasthan_Water_Level.xlsx'))

        if rainfall_data is None or canal_data is None or water_level_data is None:
            raise ValueError("One or more datasets failed to load")

        # Preprocess
        preprocessor = DataPreprocessor()
        print("\nPreprocessing data...")
        X, y = preprocessor.preprocess_data(rainfall_data, canal_data, water_level_data)
        # Validate before touching X/y: a failed preprocess returns (None, None)
        # and would otherwise surface as an unrelated AttributeError below.
        if X is None or y is None:
            raise ValueError("Preprocessing failed to produce valid output")
        print("\nPreprocessed Data (X):")
        print(X.head())
        print("\nPreprocessed Target (y):")
        print(y.head())

        # Work on at most 3000 rows to keep tuning fast; seeded so that
        # repeated runs use the same rows.
        sample_size = min(3000, len(X))
        rng = np.random.default_rng(42)
        indices = rng.choice(len(X), sample_size, replace=False)
        X_sample = X.iloc[indices]
        y_sample = y.iloc[indices]

        # Split data: 20% test, then 20% of the remainder set aside as a
        # validation fold (currently unused, but it keeps the training set
        # the same size as before).
        X_train, X_test, y_train, y_test = train_test_split(
            X_sample, y_sample, test_size=0.2, random_state=42
        )
        X_train, _X_val, y_train, _y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        # Scale features using statistics from the training rows only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Quick tuning
        print("\nStarting quick hyperparameter tuning...")
        tuner = HyperparameterTuner()
        rf_params = tuner.tune_random_forest(X_train_scaled, y_train)
        xgb_params = tuner.tune_xgboost(X_train_scaled, y_train)

        # Train final model
        print("\nTraining final model with tuned parameters...")
        model = ImprovedStackedEnsemble(rf_params, xgb_params)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        metrics = model.evaluate(X_test_scaled, y_test)
        print("\nModel Performance Metrics:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")

        # Generate predictions
        print("\nGenerating future predictions...")
        horizon = 5
        base_year = 2024
        raw_forecast = np.asarray(
            model.predict_future(X_test_scaled[-3:], years=horizon), dtype=float
        )
        # predict_future may yield several values per forecast year (one per
        # input row); average each year's group so exactly one figure is
        # printed per year.
        yearly_forecast = raw_forecast.reshape(horizon, -1).mean(axis=1)
        print("\nPredictions for next 5 years:")
        for offset, pred in enumerate(yearly_forecast, 1):
            print(f"Year {base_year + offset}: {pred:.2f}m")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the cell.
        print(f"Error in main execution: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading data...
Error loading rainfall data: [Errno 2] No such file or directory: '/Users/aditya/Desktop/anoushka/Rainfall_Data_LL.csv'
Error loading water level data: [Errno 2] No such file or directory: '/Users/aditya/Desktop/anoushka/Rajasthan_Water_Level.xlsx'
Error in main execution: One or more datasets failed to load


Traceback (most recent call last):
  File "/var/folders/nt/q7hgyc8x3gz788mgyt5hmh2h0000gn/T/ipykernel_23153/4204895364.py", line 454, in main
    raise ValueError("One or more datasets failed to load")
ValueError: One or more datasets failed to load
