## Import libraries

In [None]:
# 2. Import Libraries
import pandas as pd  # The "Excel" of Python
import numpy as np   # The "Calculator"

# Import AI Tools
from sklearn.impute import KNNImputer  # The smart tool to fill missing blanks
from sklearn.preprocessing import LabelEncoder  # Changes Text -> Numbers
from sklearn.model_selection import train_test_split  # Splits data for studying vs testing
from sklearn.preprocessing import StandardScaler  # Shrinks big numbers to be fair
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Grades the AI's exam

# Import the 4 AI Models (The Brains)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Mount Google Drive and load the data

In [None]:
# Mount Google Drive so that files stored under MyDrive can be read from
# this Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Path of the CBC report CSV inside the mounted Drive.
file_path = '/content/drive/MyDrive/Thesis/CBC Report.csv'
df = pd.read_csv(file_path)
# Only reached when read_csv returned without raising.
print('The data is loaded successfully')
# Last expression of the cell: shows the first rows of the raw table.
df.head()

Mounted at /content/drive
The data is loaded successfully


Unnamed: 0,Serial,Date,Gender,Age,Haemoglobin,ESR,WBC,Neutrophil,Lymphocyte,Monocyte,Eosinophil,Basophil,RBC,Platelets,Result
0,A2308164543,2/8/2023,Female,40.0,11.2,32.0,11.3,65,23.0,10.0,2.0,0.0,4.04,30,Positive
1,A2308164502,2/8/2023,Male,13.0,11.6,59.0,7.8,49,42.0,9.0,0.0,0.0,4.34,40,Positive
2,A2308164673,3/8/2023,Male,23.0,15.1,,3.85,65,25.0,9.0,1.0,0.0,5.43,30,Positive
3,A2308164685,3/8/2023,Male,58.0,8.5,,10.3,85,6.0,6.0,3.0,0.0,3.0,30,Positive
4,A2308164626,2/8/2023,Female,35.0,12.0,28.0,6.7,54,20.0,4.0,1.0,0.0,4.3,40,Positive


## Data cleaning and preprocessing

In [None]:
# Clean the raw CBC table and build `df_final`: an all-numeric frame whose
# missing feature values are filled by KNN imputation and whose `Result`
# column is integer-encoded.

# Identifier columns are not features; errors='ignore' keeps the cell
# re-runnable once they have already been removed.
df = df.drop(columns=['Serial', 'Date'], errors='ignore')

# A row without a label cannot be used for supervised learning. Dropping it
# here also stops `astype(str)` further down from turning NaN into a
# spurious "nan" class.
df = df.dropna(subset=['Result']).reset_index(drop=True)

# Encode Gender as integer codes, but keep missing entries as NaN so the
# imputer below fills them instead of "nan" becoming its own category.
len_gender = LabelEncoder()
gender_known = df['Gender'].notna()
gender_codes = pd.Series(np.nan, index=df.index, dtype=float)
if gender_known.any():
    gender_codes.loc[gender_known] = len_gender.fit_transform(
        df.loc[gender_known, 'Gender'].astype(str)
    )
df['Gender'] = gender_codes

cols_to_numeric = ['Age', 'Haemoglobin', 'ESR', 'WBC', 'Neutrophil',
                   'Lymphocyte', 'Monocyte', 'Eosinophil', 'Basophil',
                   'RBC', 'Platelets']

# Entries that cannot be parsed as numbers become NaN and are imputed below.
for col in cols_to_numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# KNN imputation: every missing value is filled from the 5 nearest rows.
# NOTE(review): the imputer is fitted on all rows before the train/test
# split done later in the notebook, so held-out rows influence the values
# imputed for training rows. Consider fitting it on the training split only.
X_temp = df.drop(columns=['Result'])
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed = knn_imputer.fit_transform(X_temp)

df_final = pd.DataFrame(X_imputed, columns=X_temp.columns)
# An imputed gender is an average of neighbour codes; snap it back to a
# valid integer code.
df_final['Gender'] = df_final['Gender'].round()
# Positional assignment is safe: df was re-indexed above and the imputer
# preserves row order.
df_final['Result'] = df['Result'].values

le_result = LabelEncoder()
df_final['Result'] = le_result.fit_transform(df_final['Result'].astype(str))

df_final.head()


Unnamed: 0,Gender,Age,Haemoglobin,ESR,WBC,Neutrophil,Lymphocyte,Monocyte,Eosinophil,Basophil,RBC,Platelets,Result
0,0.0,40.0,11.2,32.0,11.3,65.0,23.0,10.0,2.0,0.0,4.04,30.0,1
1,1.0,13.0,11.6,59.0,7.8,49.0,42.0,9.0,0.0,0.0,4.34,40.0,1
2,1.0,23.0,15.1,9.8,3.85,65.0,25.0,9.0,1.0,0.0,5.43,30.0,1
3,1.0,58.0,8.5,46.4,10.3,85.0,6.0,6.0,3.0,0.0,3.0,30.0,1
4,0.0,35.0,12.0,28.0,6.7,54.0,20.0,4.0,1.0,0.0,4.3,40.0,1


## 80/20 train/test split

In [None]:
# Separate the target column from the predictors.
y = df_final['Result']
X = df_final.drop(columns=['Result'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each part.
for label, part in (('Train set:', X_train), ('Test set:', X_test)):
    print(label, part.shape)


Train set: (240, 12)
Test set: (61, 12)


## Feature scaling (standardization)

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and are then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

## Model training and evaluation

In [None]:
# Define the 4 models.
# DecisionTreeClassifier permutes the candidate features at every split, so
# without a seed its metrics can change from run to run. It is seeded here
# like the random forest so the reported numbers are reproducible.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC()
}

print("--- Final Result ---\n")

# Train each model on the scaled training data and evaluate it on the
# scaled test data.
for name, model in models.items():
    # Train
    model.fit(X_train_scaled, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test_scaled)

    # Generate reports: overall accuracy, confusion matrix (rows are true
    # classes, columns are predicted classes) and per-class
    # precision/recall/F1.
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Print everything
    print(f"------ MODEL: {name} ------")
    print(f"Accuracy: {acc * 100:.2f}%")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nDetailed Report:")
    print(class_report)
    print("-" * 60) # Separator line
    print("\n")

--- Final Result ---

------ MODEL: Logistic Regression ------
Accuracy: 93.44%

Confusion Matrix:
[[21  0]
 [ 4 36]]

Detailed Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        21
           1       1.00      0.90      0.95        40

    accuracy                           0.93        61
   macro avg       0.92      0.95      0.93        61
weighted avg       0.94      0.93      0.94        61

------------------------------------------------------------


------ MODEL: Decision Tree ------
Accuracy: 85.25%

Confusion Matrix:
[[21  0]
 [ 9 31]]

Detailed Report:
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        21
           1       1.00      0.78      0.87        40

    accuracy                           0.85        61
   macro avg       0.85      0.89      0.85        61
weighted avg       0.90      0.85      0.86        61

---------------------------------------