In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
import os

class Epic2_3_GA_Range_RecommendationSystem:
    """Recommend process-parameter ranges per plant from one CSV of run data.

    Pipeline stages (see ``run``):

    1. ``validate_and_filter`` - check required columns, drop rows with nulls,
       record z-score outlier warnings, apply the optional safety flag.
    2. ``extract_optimal_ranges`` - per plant, take the rows whose yield is at
       least ``threshold_ratio`` of the plant's best yield and record the
       min/max of every process parameter over those rows.
    3. ``prepare_features`` - build a feature matrix (one-hot plant name plus
       whichever optional columns exist in the data).
    4. ``train_model`` - fit one XGBoost regressor per range bound and store
       its predictions on the training rows (in-sample predictions).
    5. ``build_output`` - format the predicted ranges and print them.
    """

    # Process parameters whose high-yield min/max bounds are extracted and modelled.
    PROCESS_PARAMS = ('time_min', 'temperature_C', 'pressure_bar_abs', 'pH_neutral_est')

    # Columns for which a row-to-row difference ("delta") feature is derived.
    DELTA_SOURCE_COLUMNS = ('temperature_C', 'pressure_bar_abs', 'pH_neutral_est')

    # Optional model features, used only when present in the filtered data.
    OPTIONAL_FEATURES = (
        'epsilon_r_est', 'Brix_No', 'ambient_temp', 'humidity', 'equipment_type',
        'delta_temperature_C', 'delta_pressure_bar_abs', 'delta_pH_neutral_est',
    )

    # Filename tokens dropped when deriving the plant name from the file path.
    FILLER_TOKENS = frozenset({'filled', 'root', 'data', 'csv'})

    # Raw column name -> standardized column name.
    COLUMN_RENAMES = {
        'GA_content_%': 'Yield',
        'epsilon_r_ext': 'epsilon_r_est',
        'pH_neutral_ext': 'pH_neutral_est',
        'Temperature °C': 'temperature_C',
        'Time min': 'time_min',
        'Time s': 'time_s',
        'concen tar dis': 'concentration_est',
        'option 1 est': 'option1_est',
        'rbd marker est': 'rbd_marker_est',
    }

    def __init__(self, path_raw):
        """Load ``path_raw`` (a CSV file) and prepare the raw frame.

        The plant name is derived from the filename and stored in a
        ``plant_name`` column (overwriting any such column in the CSV).
        Column names are then standardized and delta features are added.
        """
        # Initialize placeholders filled in by the later pipeline stages
        self.df_filtered = None
        self.df_ranges = None
        self.X = None
        self.y = None
        self.models = {}
        self.y_pred = None
        self.outlier_warnings = []

        # Load input CSV and extract plant name from filename
        self.df_raw = pd.read_csv(path_raw)
        self.df_raw['plant_name'] = self._extract_plant_name(path_raw)

        # Standardize column names and add temporal features
        self.standardize_columns()
        self.add_temporal_derivatives()

    def _extract_plant_name(self, path):
        """Return a plant name built from the filename of ``path``.

        The extension is removed, the stem is split on ``_`` and filler tokens
        (case-insensitive) are dropped; the rest is joined with spaces. If every
        token is filler, the unmodified stem is returned so the name is never
        empty.
        """
        stem = os.path.splitext(os.path.basename(path))[0]
        tokens = [t for t in stem.split('_') if t.lower() not in self.FILLER_TOKENS]
        name = ' '.join(tokens).strip()
        return name or stem

    def standardize_columns(self):
        """Rename known raw column names to their standardized form.

        A rename is skipped when its target name already exists, so the frame
        never ends up with two columns of the same name (for example when a
        file carries both ``GA_content_%`` and ``Yield``).

        Raises:
            ValueError: if the frame has no ``plant_name`` column.
        """
        taken = set(self.df_raw.columns)
        applicable = {}
        for old, new in self.COLUMN_RENAMES.items():
            if old in taken and new not in taken:
                applicable[old] = new
                taken.add(new)
        self.df_raw = self.df_raw.rename(columns=applicable)
        if 'plant_name' not in self.df_raw.columns:
            raise ValueError("Missing 'plant_name' column in input data.")

    def add_temporal_derivatives(self):
        """Add ``delta_<col>`` columns holding the row-to-row difference.

        The first row has no predecessor and gets 0. Columns absent from the
        data are skipped here; required columns are reported later by
        ``validate_and_filter`` with a descriptive error instead of a KeyError.
        """
        for col in self.DELTA_SOURCE_COLUMNS:
            if col not in self.df_raw.columns:
                continue
            self.df_raw[f'delta_{col}'] = self.df_raw[col].diff().fillna(0)

    def validate_and_filter(self, z_threshold=3.0):
        """Validate the raw data and populate ``self.df_filtered``.

        Args:
            z_threshold: absolute z-score above which a value is reported as an
                outlier. Outliers are only logged in ``self.outlier_warnings``;
                they are not removed.

        Raises:
            ValueError: if any required column is missing.
        """
        # Ensure all required columns are present
        required = ['plant_name', 'Yield', *self.PROCESS_PARAMS]
        missing = [col for col in required if col not in self.df_raw.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        # Drop rows with nulls in required fields
        null_rows = self.df_raw[required].isnull().any(axis=1)
        if null_rows.any():
            print(f"Warning: Dropping {null_rows.sum()} rows with nulls in required fields.")
            self.df_raw = self.df_raw[~null_rows].copy()

        # Start from a clean list so repeated validation does not duplicate warnings
        self.outlier_warnings = []

        # Identify and log outlier values using z-score threshold
        for col in ['Yield', *self.PROCESS_PARAMS]:
            values = self.df_raw[col]
            mean = values.mean()
            std = values.std(ddof=0)
            # A constant (std == 0) or empty (std is NaN) column has no outliers
            if not std > 0:
                continue
            z = (values - mean) / std
            flagged = values[z.abs() > z_threshold]
            if flagged.empty:
                continue
            flagged_values = flagged.round(2).tolist()
            warning = (
                f"{len(flagged_values)} rows in '{col}' had extreme values: {flagged_values}. "
                f"These deviate significantly from the mean ({mean:.2f}) and may indicate measurement anomalies."
            )
            self.outlier_warnings.append(warning)

        # Apply safety filter if available
        if 'subcritical_safe_flag' in self.df_raw.columns:
            safe = self.df_raw['subcritical_safe_flag'].eq(True)
            self.df_filtered = self.df_raw[safe].copy()
        else:
            self.df_filtered = self.df_raw.copy()

    def extract_optimal_ranges(self, threshold_ratio=0.85):
        """Populate ``self.df_ranges`` with one row of parameter bounds per plant.

        For each plant, rows with ``Yield >= threshold_ratio * max(Yield)`` are
        selected and the min and max of every process parameter over those rows
        are recorded. The result always has the full set of columns, even when
        no plant produced a row.
        """
        params = self.PROCESS_PARAMS
        columns = [
            'plant_name', 'Yield_high', 'Yield_threshold',
            *[f"{col}_min" for col in params],
            *[f"{col}_max" for col in params],
        ]
        rows = []
        for plant, group in self.df_filtered.groupby('plant_name'):
            yield_high = group['Yield'].max()
            yield_thresh = threshold_ratio * yield_high
            high_yield = group[group['Yield'] >= yield_thresh]
            # Empty when the threshold exceeds the maximum (e.g. a negative peak yield)
            if high_yield.empty:
                continue
            row = {
                'plant_name': plant,
                'Yield_high': yield_high,
                'Yield_threshold': yield_thresh,
            }
            for col in params:
                row[f"{col}_min"] = high_yield[col].min()
                row[f"{col}_max"] = high_yield[col].max()
            rows.append(row)
        self.df_ranges = pd.DataFrame(rows, columns=columns)

    def prepare_features(self):
        """Build the feature matrix ``self.X`` (one row per row of ``df_ranges``).

        Features are the one-hot encoded plant name plus every optional feature
        present in the filtered data, represented by its first non-null value
        per plant. Non-numeric optional features are one-hot encoded together
        with the plant name so the matrix stays fully numeric.

        Raises:
            ValueError: if ``df_ranges`` is empty (no plant had usable rows).
        """
        if self.df_ranges.empty:
            raise ValueError("No high-yield ranges available; cannot build model features.")

        available = [f for f in self.OPTIONAL_FEATURES if f in self.df_filtered.columns]
        feature_df = self.df_ranges[['plant_name']].copy()
        if available:
            # One groupby for all features; reindex aligns values to df_ranges row order
            first_values = (
                self.df_filtered.groupby('plant_name')[available]
                .first()
                .reindex(self.df_ranges['plant_name'])
            )
            for f in available:
                feature_df[f] = first_values[f].values

        categorical = [f for f in available if not pd.api.types.is_numeric_dtype(feature_df[f])]
        numeric = [f for f in available if f not in categorical]

        encoded_cols = ['plant_name', *categorical]
        encoder = OneHotEncoder(sparse_output=False)
        # astype(str) gives the encoder uniformly typed input even if a value is missing
        encoded = encoder.fit_transform(feature_df[encoded_cols].astype(str))
        self.X = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(encoded_cols))
        for f in numeric:
            self.X[f] = feature_df[f].values

    def train_model(self):
        """Fit one XGBoost regressor per range bound and store its predictions.

        Targets are ``<param>_min`` and ``<param>_max`` for every process
        parameter, in that order. Fitted models are stored in ``self.models``
        keyed by target name; ``self.y_pred`` holds each model's predictions on
        the training matrix, one column per target.
        """
        target_cols = [f"{col}_{ext}" for col in self.PROCESS_PARAMS for ext in ('min', 'max')]
        self.y = self.df_ranges[target_cols]
        # Explicit float buffer: inheriting an integer dtype from the targets
        # would silently truncate the fractional part of the predictions.
        self.y_pred = np.zeros(self.y.shape, dtype=float)
        for i, col in enumerate(target_cols):
            model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, objective='reg:squarederror', random_state=42)
            model.fit(self.X, self.y[col])
            self.models[col] = model
            self.y_pred[:, i] = model.predict(self.X)

    def build_output(self):
        """Format the predicted ranges, print them per plant and return them.

        Returns:
            DataFrame with ``plant_name``, ``Yield_high``, ``Yield_threshold``
            and one ``<param>_range`` string column per process parameter.
        """
        output = self.df_ranges[['plant_name', 'Yield_high', 'Yield_threshold']].copy()
        for i, name in enumerate(self.PROCESS_PARAMS):
            # y_pred columns are laid out as (min, max) pairs in PROCESS_PARAMS order
            lows = self.y_pred[:, 2 * i]
            highs = self.y_pred[:, 2 * i + 1]
            output[f'{name}_range'] = [f"{lo:.2f} – {hi:.2f}" for lo, hi in zip(lows, highs)]

        # df_ranges holds one row per plant, so row iteration visits each plant once
        for _, row in output.iterrows():
            plant = row['plant_name']
            print(f'\nProcess: "{plant}" - Recommended Settings:')
            print(f"  Yield_high: {row['Yield_high']:.2f}")
            print(f"  Yield_threshold: {row['Yield_threshold']:.2f}")
            for p in self.PROCESS_PARAMS:
                print(f"  {p}_range: {row[f'{p}_range']}")

        return output

    def run(self, show_warnings=False):
        """Execute the full pipeline and return the formatted output frame.

        Args:
            show_warnings: when True, print the outlier warnings collected
                during validation after the recommendations. Off by default.
        """
        self.validate_and_filter()
        self.extract_optimal_ranges()
        self.prepare_features()
        self.train_model()
        output = self.build_output()

        if show_warnings and self.outlier_warnings:
            print("\nData Quality Warnings:")
            for w in self.outlier_warnings:
                print(f"  - {w}")

        return output

def run_multiple_plants(file_paths):
    """Run the recommendation pipeline on each file and print a summary.

    Each path is processed by its own ``Epic2_3_GA_Range_RecommendationSystem``
    (which prints that plant's recommended settings). Afterwards the per-plant
    ranges are combined and the plant with the highest ``Yield_high`` is
    reported. Nothing is summarized when there are no files or no ranges.

    Args:
        file_paths: iterable of CSV file paths; any iterable is accepted,
            including a generator.
    """
    # Materialize once: the paths are traversed twice (names, then processing)
    file_paths = list(file_paths)
    filenames = [os.path.basename(p) for p in file_paths]
    print(f"\nProcessed files: {', '.join(filenames)}")

    range_frames = []
    for path in file_paths:
        pipeline = Epic2_3_GA_Range_RecommendationSystem(path)
        pipeline.run()
        range_frames.append(pipeline.df_ranges)

    # pd.concat raises on an empty list, so bail out before combining
    if not range_frames:
        return

    # Combine all plant results into one DataFrame
    all_ranges = pd.concat(range_frames, ignore_index=True)
    if all_ranges.empty:
        return

    # Print summary recommendation based on highest yield
    best = all_ranges.loc[all_ranges['Yield_high'].idxmax()]
    print("\nSummary Recommendation:")
    print(f"The most effective process was for '{best['plant_name']}', achieving the highest yield of {best['Yield_high']:.2f}%.")
    print("Consider prioritizing this extraction setup for future optimization.")


In [4]:
# Run the pipeline on two plant datasets and print the recommendations.
run_multiple_plants([
    r"C:\Users\Syeed\Desktop\Innovo\filled_Glycyrrhiza_Glabra_Root.csv",
    r"C:\Users\Syeed\Desktop\Innovo\filled_Ellagic_Acid_Peel.csv"
])
# NOTE: the paths above are machine-specific; replace them with the locations
# of the CSV files on your own system before running this cell.



Processed files: filled_Glycyrrhiza_Glabra_Root.csv, filled_Ellagic_Acid_Peel.csv

Process: "Glycyrrhiza Glabra" - Recommended Settings:
  Yield_high: 14.12
  Yield_threshold: 12.00
  time_min_range: 4.75 – 5.42
  temperature_C_range: 143.56 – 145.97
  pressure_bar_abs_range: 19.70 – 19.88
  pH_neutral_est_range: 5.74 – 5.77

Process: "Ellagic Acid Peel" - Recommended Settings:
  Yield_high: 4.64
  Yield_threshold: 3.94
  time_min_range: 17.00 – 19.00
  temperature_C_range: 128.24 – 132.07
  pressure_bar_abs_range: 9.09 – 9.80
  pH_neutral_est_range: 5.94 – 6.01

Summary Recommendation:
The most effective process was for 'Glycyrrhiza Glabra', achieving the highest yield of 14.12%.
Consider prioritizing this extraction setup for future optimization.
