TRAINING THE MODEL AND CHECKING THE ACCURACY

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Seed NumPy's legacy global random number generator so that any code drawing
# from np.random is repeatable across runs. The train/test split and the random
# forest further down pass random_state=42 explicitly.
np.random.seed(42)

In [3]:
# Load the dataset from a CSV file given by a relative path (resolved against
# the kernel's current working directory).
combined_df = pd.read_csv("combined_synthetic_dataset.csv")
# A bare expression on the last line of the cell makes the notebook render the frame.
combined_df

Unnamed: 0,patient_id,age,gender,diagnosis,medications,treatment_plan,heart_rate,blood_pressure_systolic,blood_pressure_diastolic,oxygen_saturation,...,cholesterol_level,hemoglobin,white_blood_cell_count,ventilator_setting,dialysis_machine,cardiac_monitor,level_of_consciousness,breathing_rate,pulse_rate,outcome
0,1,69,1,1,0,2,98,135,78,88,...,201,12.484757,8.765534,0,0,1,1,22,90,1
1,2,32,1,0,3,1,91,158,76,90,...,212,13.756085,5.239948,2,1,1,0,16,86,0
2,3,89,0,3,0,2,76,168,92,92,...,223,16.186200,6.545394,1,0,0,2,18,76,0
3,4,78,1,1,3,1,94,187,71,89,...,212,12.333985,8.614250,1,0,0,2,19,82,1
4,5,38,1,2,1,0,96,140,62,74,...,166,16.711718,12.952196,3,1,0,1,12,72,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,27,0,4,0,0,79,148,84,86,...,211,14.351245,6.895513,0,1,0,1,19,70,0
996,997,51,0,0,0,1,77,172,75,85,...,203,16.360149,12.666945,0,1,0,2,18,93,1
997,998,72,0,2,1,1,97,102,78,80,...,209,13.125202,9.766583,3,1,0,0,16,75,0
998,999,49,1,2,0,0,62,165,60,91,...,195,14.383976,7.367358,3,1,0,2,14,113,0


In [4]:
# Check the first 5 rows
combined_df.head()

Unnamed: 0,patient_id,age,gender,diagnosis,medications,treatment_plan,heart_rate,blood_pressure_systolic,blood_pressure_diastolic,oxygen_saturation,...,cholesterol_level,hemoglobin,white_blood_cell_count,ventilator_setting,dialysis_machine,cardiac_monitor,level_of_consciousness,breathing_rate,pulse_rate,outcome
0,1,69,1,1,0,2,98,135,78,88,...,201,12.484757,8.765534,0,0,1,1,22,90,1
1,2,32,1,0,3,1,91,158,76,90,...,212,13.756085,5.239948,2,1,1,0,16,86,0
2,3,89,0,3,0,2,76,168,92,92,...,223,16.1862,6.545394,1,0,0,2,18,76,0
3,4,78,1,1,3,1,94,187,71,89,...,212,12.333985,8.61425,1,0,0,2,19,82,1
4,5,38,1,2,1,0,96,140,62,74,...,166,16.711718,12.952196,3,1,0,1,12,72,1


In [5]:
# Check the last 5 rows
combined_df.tail()

Unnamed: 0,patient_id,age,gender,diagnosis,medications,treatment_plan,heart_rate,blood_pressure_systolic,blood_pressure_diastolic,oxygen_saturation,...,cholesterol_level,hemoglobin,white_blood_cell_count,ventilator_setting,dialysis_machine,cardiac_monitor,level_of_consciousness,breathing_rate,pulse_rate,outcome
995,996,27,0,4,0,0,79,148,84,86,...,211,14.351245,6.895513,0,1,0,1,19,70,0
996,997,51,0,0,0,1,77,172,75,85,...,203,16.360149,12.666945,0,1,0,2,18,93,1
997,998,72,0,2,1,1,97,102,78,80,...,209,13.125202,9.766583,3,1,0,0,16,75,0
998,999,49,1,2,0,0,62,165,60,91,...,195,14.383976,7.367358,3,1,0,2,14,113,0
999,1000,67,1,2,3,0,82,141,64,81,...,154,12.662911,8.718183,2,1,1,0,26,94,0


In [6]:
# PREPROCESSING

# Step 1: label-encode every object-dtype column of combined_df in place.
# Each encoder is stored under its column name so it stays available after
# the column has been replaced by integer codes.
label_encoders = {}
object_columns = combined_df.select_dtypes(include=['object']).columns
for column in object_columns:
    encoder = LabelEncoder()
    label_encoders[column] = encoder
    combined_df[column] = encoder.fit_transform(combined_df[column])

In [7]:
# STEP 2: SCALING NUMERICAL FEATURES
# The identifier and the target are excluded; every remaining column is passed
# through StandardScaler.
feature_frame = combined_df.drop(columns=['patient_id', 'outcome'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)
# fit_transform hands back a plain array, so re-wrap it with the original column names.
X = pd.DataFrame(scaled_features, columns=feature_frame.columns)
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test rows contribute to the scaling statistics. Fit it on
# the training split only if a scale-sensitive model replaces the random forest.


In [8]:
# STEP 3: TARGET VARIABLE
# The 'outcome' column is the label passed to the classifier as y.
y = combined_df['outcome']

In [9]:
# SPLITTING THE DATA INTO TRAINING AND TESTING SETS
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# keeps the split repeatable between runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
# TRAIN THE MODEL USING A RANDOM FOREST
# 100 trees; random_state is fixed for reproducibility.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# fit() is the last expression in the cell, so the notebook also displays its return value.
clf.fit(X_train, y_train)

In [11]:
# Predict the outcome class for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [12]:
# EVALUATING THE MODEL
# Score the held-out predictions and report the accuracy as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy : {accuracy * 100:.2f}%")

# Optional sanity check: the desired accuracy range is 80-95% (both bounds inclusive).
within_target_range = 80 <= accuracy * 100 <= 95
print(
    "The model meets the accuracy requirements"
    if within_target_range
    else "Model accuracy is outside the desired requirements"
)

Model accuracy : 91.50%
The model meets the accuracy requirements


In [13]:
from sklearn.metrics import classification_report,confusion_matrix

# PREDICTION
# Same call as the earlier prediction cell: clf and X_test are not modified in
# between, so this recomputes y_pred from the same model and inputs.
y_pred = clf.predict(X_test)

In [14]:
# Build the confusion matrix for the test-set predictions.
# scikit-learn puts the actual classes on the rows and the predicted classes
# on the columns.
conf_matrix = confusion_matrix(y_test, y_pred)
print("CONFUSION MATRIX", conf_matrix, sep="\n")

CONFUSION MATRIX
[[109   0]
 [ 17  74]]


In [15]:
# EXPLANATION OF THE ABOVE CONFUSION MATRIX (ROWS = ACTUAL CLASS, COLUMNS = PREDICTED CLASS):
# TRUE NEGATIVES(109) - NOT CRITICAL CASES CORRECTLY PREDICTED AS NOT CRITICAL
# FALSE POSITIVES(0) - NOT CRITICAL CASES INCORRECTLY PREDICTED AS CRITICAL
# FALSE NEGATIVES(17) - CRITICAL CASES INCORRECTLY PREDICTED AS NOT CRITICAL
# TRUE POSITIVES(74) - CRITICAL CASES CORRECTLY PREDICTED AS CRITICAL

In [16]:
# GENERATING CLASSIFICATION REPORT:
# Per-class precision, recall and F1 for the test-set predictions;
# target_names supplies the display labels used in the report rows.
class_names = ['Not Critical', 'Critical']
report = classification_report(y_test, y_pred, target_names=class_names)
print("CLASSIFICATION REPORT", report, sep="\n")

CLASSIFICATION REPORT
              precision    recall  f1-score   support

Not Critical       0.87      1.00      0.93       109
    Critical       1.00      0.81      0.90        91

    accuracy                           0.92       200
   macro avg       0.93      0.91      0.91       200
weighted avg       0.93      0.92      0.91       200



In [22]:
# HELPER FUNCTION THAT RETRIEVES ALL DATA FOR A GIVEN PATIENT ID

import pandas as pd

def get_patient_data(patient_id, df):
    """Return every row of ``df`` whose ``patient_id`` column equals ``patient_id``.

    Args:
        patient_id: Identifier to look up; compared with ``==`` against the column.
        df: DataFrame that has a ``patient_id`` column.

    Returns:
        A DataFrame holding the matching row(s), or the string
        ``"Patient ID <patient_id> not found."`` when no row matches.
    """
    id_matches = df['patient_id'] == patient_id
    rows = df.loc[id_matches]
    # Guard clause: callers receive a message string instead of an empty frame.
    if rows.empty:
        return f"Patient ID {patient_id} not found."
    return rows
# Interactive lookup: read a patient ID from the user, then fetch and print the matching row(s).
patient_id = int(input("Enter patient ID"))  # int() raises ValueError if the entry is not an integer
patient_info = get_patient_data(patient_id,combined_df)
# NOTE(review): the saved output for this cell shows patient_id 447 (row index 446)
# for an entry of 446, which the equality filter on 'patient_id' would not return.
# The output is presumably stale from an earlier version of the lookup; re-run to refresh.
print(patient_info)

Enter patient ID 446


     patient_id  age  gender  diagnosis  medications  treatment_plan  \
446         447   34       1          0            3               2   

     heart_rate  blood_pressure_systolic  blood_pressure_diastolic  \
446          70                      167                        98   

     oxygen_saturation  ...  cholesterol_level  hemoglobin  \
446                 72  ...                226    15.86652   

     white_blood_cell_count  ventilator_setting  dialysis_machine  \
446                6.793992                   0                 1   

     cardiac_monitor  level_of_consciousness  breathing_rate  pulse_rate  \
446                0                       0              26          71   

     outcome  
446        1  

[1 rows x 25 columns]
