In [37]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# --- Keyword-based Feature Engineering Function ---
def calculate_severity_score(text):
    """Enhanced keyword-based severity scoring"""
    score = 0
    text = str(text).lower()

    critical = {
        'arrest': 25, 'unconscious': 20, 'cpr': 25, 'hemorrhage': 18,
        'infarction': 18, 'stroke': 20, 'seizure': 16, 'anaphylaxis': 18,
        'shock': 20, 'paralysis': 18, 'hemiparesis': 16, 'code': 25
    }
    high = {
        'dyspnea': 12, 'chest pain': 14, 'hematemesis': 13, 'melena': 12,
        'syncope': 12, 'bleeding': 12, 'confusion': 13, 'acute': 11
    }
    moderate = {
        'fever': 6, 'vomiting': 6, 'headache': 5, 'dizzy': 5, 'diarrhea': 5,
        'laceration': 6, 'burn': 7, 'pain': 3, 'rash': 4
    }
    
    for keyword, value in {**critical, **high, **moderate}.items():
        if re.search(r'\b' + re.escape(keyword) + r'\b', text):
            score += value
    
    return score


def engineer_features(df):
    """Clinical feature engineering"""
    
    # Clean vitals
    for col in ['SBP', 'DBP', 'HR', 'RR', 'Saturation', 'BT']:
        if col in df.columns:
            df[col] = df[col].replace(0, np.nan).fillna(df[col].median())
    
    # Derived vitals
    df['shock_index'] = df['HR'] / (df['SBP'] + 1)
    df['map'] = (df['SBP'] + 2 * df['DBP']) / 3
    df['pulse_pressure'] = df['SBP'] - df['DBP']
    
    # Count abnormal vitals
    df['abnormal_vitals'] = (
        ((df['SBP'] < 90) | (df['SBP'] > 180)) +
        ((df['HR'] < 50) | (df['HR'] > 120)) +
        ((df['RR'] < 10) | (df['RR'] > 24)) +
        (df['Saturation'] < 92) +
        ((df['BT'] < 36) | (df['BT'] > 38.5))
    ).astype(int)
    
    # Interactions
    df['severity_x_pain'] = df['severity_score'] * np.log1p(df['NRS_pain'])
    df['severity_x_mental'] = df['severity_score'] * (df['Mental'] != 1).astype(int)
    df['age_x_severity'] = (df['Age'] / 100) * df['severity_score']
    
    # Age risk
    df['high_risk_age'] = ((df['Age'] < 2) | (df['Age'] > 70)).astype(int)
    
    # Pain categories
    df['high_pain'] = (df['NRS_pain'] >= 7).astype(int)
    
    return df

# --- Custom Scorer for GridSearchCV to focus on KTAS 1 recall ---
ktas1_recall_scorer = make_scorer(recall_score, labels=[0], average='macro', zero_division=0)


# --- Main ---
try:
    print("="*75)
    print("FINAL OPTIMIZED 4-CLASS ENSEMBLE CLASSIFIER")
    print("="*75)
    
    df = pd.read_csv(r"C:\DATASETS\triage\data.csv", delimiter=';', encoding='windows-1254')
    print(f"\nDataset: {df.shape[0]} rows × {df.shape[1]} columns")

    # Preprocessing
    df.replace('#BOŞ!', np.nan, inplace=True)
    
    for col in ['KTAS duration_min', 'Length of stay_min']:
        if col in df.columns and df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '.').astype(float)
    
    df['Chief_complain'] = df['Chief_complain'].fillna('')
    df['severity_score'] = df['Chief_complain'].apply(calculate_severity_score)
    
    for col in df.columns:
        if df[col].dtype == 'object' and col != 'Chief_complain':
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            df[col] = df[col].fillna(df[col].median())
    
    df.dropna(subset=['KTAS_expert'], inplace=True)

    # --- MERGE CLASS 4 AND 5 ---
    print("\nMerging KTAS classes 4 and 5 into a single class (4)...")
    df['KTAS_expert'] = df['KTAS_expert'].replace(5, 4)
    
    df = engineer_features(df)
    
    print("\nNew Class Distribution:")
    for cls in sorted(df['KTAS_expert'].unique()):
        count = (df['KTAS_expert'] == cls).sum()
        print(f"  KTAS {int(cls)}: {count:4d} ({count/len(df)*100:5.1f}%)")
    
    # Features
    numeric_features = [
        'Age', 'Sex', 'Group', 'Arrival mode', 'Injury', 'Mental', 
        'Pain', 'NRS_pain', 'SBP', 'DBP', 'HR', 'RR', 'BT', 'Saturation',
        'severity_score', 'shock_index', 'map', 'pulse_pressure',
        'abnormal_vitals', 'severity_x_pain', 'severity_x_mental',
        'age_x_severity', 'high_risk_age', 'high_pain'
    ]
    text_feature = 'Chief_complain'
    target = 'KTAS_expert'

    # Using a Pipeline to integrate preprocessing and classifier
    pipeline = Pipeline([
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('text', TfidfVectorizer(stop_words='english', max_features=250)),
            ])),
        ('classifier', XGBClassifier(
            random_state=42,
            use_label_encoder=False,
            eval_metric='mlogloss',
            n_jobs=-1))
    ])
    
    # Split Data
    X = df.drop(target, axis=1)
    y = (df[target].values - 1).astype(int) # 0-indexed for training (0,1,2,3)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Hyperparameter Grid for XGBoost
    param_grid = {
        'classifier__n_estimators': [400, 500],
        'classifier__max_depth': [5, 7],
        'classifier__learning_rate': [0.05],
        'classifier__subsample': [0.8, 1.0],
        'classifier__colsample_bytree': [0.8]
    }

    # --- Train Model 1: Generalist (for overall accuracy) ---
    print("\n" + "="*75)
    print("1. TUNING GENERALIST MODEL FOR BEST OVERALL F1-SCORE")
    print("="*75)
    grid_search_general = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring='f1_weighted', verbose=1)
    grid_search_general.fit(X_train, y_train)
    best_general_model = grid_search_general.best_estimator_
    print(f"\nBest generalist parameters found: {grid_search_general.best_params_}")

    # --- Train Model 2: Specialist (for KTAS 1 recall) ---
    print("\n" + "="*75)
    print("2. TUNING SPECIALIST MODEL TO MAXIMIZE KTAS 1 RECALL")
    print("="*75)
    grid_search_specialist = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring=ktas1_recall_scorer, verbose=1)
    grid_search_specialist.fit(X_train, y_train)
    best_specialist_model = grid_search_specialist.best_estimator_
    print(f"\nBest specialist parameters found: {grid_search_specialist.best_params_}")

    # --- Combine Predictions ---
    print("\n" + "="*75)
    print("3. COMBINING PREDICTIONS FROM GENERALIST & SPECIALIST MODELS")
    print("="*75)
    
    # Get standard predictions from the accurate generalist model
    y_pred_general = best_general_model.predict(X_test)
    
    # Get probabilities from the sensitive specialist model
    y_pred_proba_specialist = best_specialist_model.predict_proba(X_test)

    # Start with the generalist's predictions
    final_predictions = y_pred_general.copy()
    
    # If the specialist model is confident enough about KTAS 1, override the generalist
    threshold = 0.28
    print(f"\nApplying custom prediction threshold of {threshold} for KTAS 1...")
    override_count = 0
    for i in range(len(y_test)):
        if y_pred_proba_specialist[i][0] >= threshold:
            if final_predictions[i] != 0:
                override_count += 1
            final_predictions[i] = 0 # Predict KTAS 1
            
    print(f"Specialist model overrode the generalist's prediction for {override_count} case(s).")
    y_pred = final_predictions
    
    # --- Results Section ---
    print("\n" + "="*75)
    print("FINAL ENSEMBLE RESULTS")
    print("="*75)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    
    print(f"\n{'Metric':<25} {'Score':<15}")
    print("-"*40)
    print(f"{'Accuracy':<25} {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'F1-Score (Weighted)':<25} {f1_weighted:.4f}")
    print(f"{'F1-Score (Macro)':<25} {f1_macro:.4f}")
    
    print("\n" + "-"*75)
    print("CLASSIFICATION REPORT")
    print("-"*75)
    target_names = [f"KTAS {i}" for i in range(1, 5)] # Now 4 classes
    print(classification_report(y_test, y_pred, target_names=target_names, zero_division=0))
    
    print("\n" + "="*75)
    print("CONFUSION MATRIX")
    print("-"*75)
    cm = confusion_matrix(y_test, y_pred)
    classes = sorted(np.unique(y_test))
    
    print("\n          Predicted KTAS")
    print("Actual    ", "   ".join([f"{int(c)+1:3d}" for c in classes]))
    print("----------------------------------------")
    for i, cls in enumerate(classes):
        print(f" KTAS {int(cls)+1}  {cm[i]}")

except ModuleNotFoundError as e:
    if 'xgboost' in str(e):
        print("\nERROR: XGBoost not found. Please install it by running: pip install xgboost")
    else:
        print(f"An error occurred: {e}")
except Exception as e:
    print(f"\nAn error occurred: {e}")
    import traceback
    traceback.print_exc()



FINAL OPTIMIZED 4-CLASS ENSEMBLE CLASSIFIER

Dataset: 1267 rows × 24 columns

Merging KTAS classes 4 and 5 into a single class (4)...

New Class Distribution:
  KTAS 1:   26 (  2.1%)
  KTAS 2:  220 ( 17.4%)
  KTAS 3:  487 ( 38.4%)
  KTAS 4:  534 ( 42.1%)

1. TUNING GENERALIST MODEL FOR BEST OVERALL F1-SCORE
Fitting 3 folds for each of 8 candidates, totalling 24 fits

An error occurred: 
All the 24 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\base.py", line 

Traceback (most recent call last):
  File "C:\Users\sythe\AppData\Local\Temp\ipykernel_11344\2109808032.py", line 164, in <module>
    grid_search_general.fit(X_train, y_train)
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\model_selection\_search.py", line 1051, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\model_selection\_search.py", line 1605, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\model_selection\_search.py", line 1028, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "C:\Users\sythe\anaconda3\envs\tensor2\Lib\site-packages\sklearn\model_s

In [17]:
# Set pandas display options to show full content
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Display only the two columns you're interested in
print(df[['Chief_complain', 'KTAS_expert']])

                             Chief_complain  KTAS_expert
0                         right ocular pain            4
1                        right forearm burn            5
2                              arm pain, Lt            5
3                           ascites tapping            5
4                           distension, abd            5
5                                     fever            4
6                     With chest discomfort            3
7                               pain, chest            3
8                       LBP - Low back pain            5
9                            Eczema, Eyelid            5
10                    acute epigastric pain            4
11                                pain, leg            5
12                          epigastric pain            4
13                                 abd pain            4
14                                 headache            4
15                                 headache            4
16                             