In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

# We need a special Pipeline and SMOTE from the 'imbalanced-learn' library
# If you don't have it, run: pip install imbalanced-learn
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load the dataset with the 'windows-1254' encoding
try:
    df = pd.read_csv(r"C:\DATASETS\triage\data.csv", delimiter=';', encoding='windows-1254')

    # --- Data Preprocessing ---
    df.replace('#BOŞ!', np.nan, inplace=True)
    for col in ['KTAS duration_min', 'Length of stay_min']:
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '.').astype(float)
    
    df['Chief_complain'].fillna('', inplace=True)
            
    for col in df.columns:
        if df[col].dtype == 'object' and col != 'Chief_complain':
            df[col] = pd.to_numeric(df[col], errors='coerce')

    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            df[col].fillna(df[col].median(), inplace=True)
            
    df.dropna(subset=['KTAS_expert'], inplace=True)
    
    # --- Feature Engineering and Final Model Pipeline ---
    numeric_features = [
        'Group', 'Sex', 'Age', 'Patients number per hour', 'Arrival mode', 
        'Injury', 'Mental', 'Pain', 'NRS_pain', 'SBP', 'DBP', 'HR', 'RR', 
        'BT', 'Saturation'
    ]
    text_feature = 'Chief_complain'
    target = 'KTAS_expert'

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('text', TfidfVectorizer(), text_feature)
        ])

    # Create the final model pipeline with a SMOTE step
    final_model_with_smote = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(
            random_state=42,
            class_weight='balanced', 
            max_depth=None,
            min_samples_leaf=1,
            min_samples_split=5,
            n_estimators=100
        ))
    ])

    X = df.drop(target, axis=1)
    y = df[target]

    # --- Train and Evaluate the SMOTE Model ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the final, optimized model
    print("Training the final recommended model with SMOTE...")
    final_model_with_smote.fit(X_train, y_train)

    # Evaluate the final model
    y_pred = final_model_with_smote.predict(X_test)
    
    print("\n--- Final Recommended Model - Evaluation Results ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

except FileNotFoundError:
    print("Error: 'data.csv' not found. Please make sure the file is in the correct directory.")
except ModuleNotFoundError:
    print("Error: The 'imblearn' library is required. Please install it by running: pip install imbalanced-learn")
except Exception as e:
    print(f"An error occurred: {e}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Chief_complain'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

Training the final recommended model with SMOTE...

--- Final Recommended Model - Evaluation Results ---
Accuracy: 0.70

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         6
           2       0.76      0.71      0.74        49
           3       0.67      0.75      0.70        91
           4       0.69      0.78      0.73        83
           5       0.67      0.16      0.26        25

    accuracy                           0.70       254
   macro avg       0.76      0.68      0.69       254
weighted avg       0.70      0.70      0.68       254



In [6]:
df

Unnamed: 0,Group,Sex,Age,Patients number per hour,Arrival mode,Injury,Chief_complain,Mental,Pain,NRS_pain,...,BT,Saturation,KTAS_RN,Diagnosis in ED,Disposition,KTAS_expert,Error_group,Length of stay_min,KTAS duration_min,mistriage
0,2,2,71,3,3,2,right ocular pain,1,1,2.0,...,36.6,100.0,2,,1,4,2,86,5.00,1
1,1,1,56,12,3,2,right forearm burn,1,1,2.0,...,36.5,98.0,4,,1,5,4,64,3.95,1
2,2,1,68,8,2,2,"arm pain, Lt",1,1,2.0,...,36.6,98.0,4,,2,5,4,862,1.00,1
3,1,2,71,8,1,1,ascites tapping,1,1,3.0,...,36.5,98.0,4,,1,5,6,108,9.83,1
4,1,2,58,4,3,1,"distension, abd",1,1,3.0,...,36.5,98.0,4,,1,5,8,109,6.60,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,2,2,79,5,2,1,mental change,1,0,4.0,...,36.4,95.0,2,,2,2,0,1995,3.00,0
1263,2,2,81,2,3,1,dysuria,1,0,4.0,...,36.4,97.0,4,,1,4,0,1000,2.00,0
1264,2,2,81,17,2,1,dizziness,1,0,4.0,...,36.2,99.0,3,,1,3,0,310,4.00,0
1265,2,1,81,2,2,2,"Sensory, Decreased",1,0,4.0,...,36.6,98.0,3,,7,3,0,475,5.00,0
