In [1]:
#Required Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

#Classification Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
# Read the blood-sample CSV (path is relative to the notebook's working
# directory) and preview the first rows as the cell's output.
DATA_FILE = 'Blood_samples_dataset_balanced_2(f).csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein,Disease
0,0.739597,0.650198,0.713631,0.868491,0.687433,0.529895,0.290006,0.631045,0.001328,0.795829,...,0.502665,0.21556,0.512941,0.064187,0.610827,0.939485,0.095512,0.465957,0.76923,Healthy
1,0.121786,0.023058,0.944893,0.905372,0.507711,0.403033,0.164216,0.307553,0.207938,0.505562,...,0.85681,0.652465,0.106961,0.942549,0.344261,0.666368,0.65906,0.816982,0.401166,Diabetes
2,0.452539,0.116135,0.54456,0.40064,0.294538,0.382021,0.625267,0.295122,0.868369,0.026808,...,0.466795,0.387332,0.421763,0.007186,0.506918,0.431704,0.417295,0.799074,0.779208,Thalasse
3,0.136609,0.015605,0.419957,0.191487,0.081168,0.166214,0.073293,0.668719,0.125447,0.501051,...,0.016256,0.040137,0.826721,0.265415,0.594148,0.225756,0.490349,0.637061,0.354094,Anemia
4,0.176737,0.75222,0.971779,0.785286,0.44388,0.439851,0.894991,0.442159,0.257288,0.805987,...,0.429431,0.146294,0.221574,0.01528,0.567115,0.841412,0.15335,0.794008,0.09497,Thalasse


In [4]:
# Flag every row that exactly repeats an earlier row, and report how many there are
duplicate_mask = df.duplicated()
n_duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {n_duplicates}")

# Keep only the first occurrence of each distinct row
df = df[~duplicate_mask]
print(f"Shape after removing duplicates: {df.shape}")

# Show how many samples remain for each diagnosis label
print("\nClass distribution:")
print(df['Disease'].value_counts())

Number of duplicate rows: 2286
Shape after removing duplicates: (65, 25)

Class distribution:
Disease
Healthy     16
Diabetes    16
Anemia      16
Thalasse    14
Thromboc     3
Name: count, dtype: int64


In [5]:
# Every column except the label is used as a model input
target = 'Disease'
features = [name for name in df.columns if name != target]

print(features)

['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'White Blood Cells', 'Red Blood Cells', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Triglycerides', 'HbA1c', 'LDL Cholesterol', 'HDL Cholesterol', 'ALT', 'AST', 'Heart Rate', 'Creatinine', 'Troponin', 'C-reactive Protein']


In [6]:
# Separate the input columns from the label column
X, y = df[features], df[target]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Candidate classifiers, keyed by display name.
# Each one is wrapped in a Pipeline so the training loop can treat them
# uniformly via fit/predict. The two linear models get a StandardScaler step;
# the tree ensembles are used on the raw feature values.
models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        # The `multi_class` argument is deprecated since scikit-learn 1.5 and
        # is scheduled for removal. With the default lbfgs solver and more than
        # two classes the multinomial formulation is already what is used, so
        # omitting the argument keeps the same model.
        ('classifier', LogisticRegression(max_iter=1000))
    ]),

    'SGD Classifier': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SGDClassifier(max_iter=1000, random_state=42))
    ]),

    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42
        ))
    ]),

    'Gradient Boosting': Pipeline([
        ('classifier', GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ))
    ])
}

In [11]:
# Fit each candidate on the training split and report three things:
#   * accuracy on the held-out test split,
#   * mean accuracy over 5-fold cross-validation on the training split,
#   * a per-class precision/recall/F1 report on the test split.
# The pipelines stored in `models` are fitted in place, so they remain usable
# for prediction after this cell has run.
for name, pipeline in models.items():
    print(f"\nTraining {name}...")

    # Train on the training split and score on the held-out rows
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # cross_val_score fits clones of the pipeline on each fold, so the fitted
    # pipeline above is not modified by this call.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Cross-validation score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print("\nClassification Report:")
    # Some classes are never predicted, which leaves their precision undefined.
    # zero_division=0 reports those as 0.00 explicitly instead of relying on
    # the default warn-and-substitute behaviour (the warning is suppressed
    # globally in this notebook). The printed numbers are unchanged.
    print(classification_report(y_test, y_pred, zero_division=0))


Training Logistic Regression...
Accuracy: 0.1538
Cross-validation score: 0.5000 (+/- 0.2139)

Classification Report:
              precision    recall  f1-score   support

      Anemia       0.00      0.00      0.00         3
    Diabetes       0.50      0.33      0.40         3
     Healthy       0.17      0.33      0.22         3
    Thalasse       0.00      0.00      0.00         3
    Thromboc       0.00      0.00      0.00         1

    accuracy                           0.15        13
   macro avg       0.13      0.13      0.12        13
weighted avg       0.15      0.15      0.14        13


Training SGD Classifier...
Accuracy: 0.0769
Cross-validation score: 0.3636 (+/- 0.2015)

Classification Report:
              precision    recall  f1-score   support

      Anemia       0.00      0.00      0.00         3
    Diabetes       0.00      0.00      0.00         3
     Healthy       0.20      0.33      0.25         3
    Thalasse       0.00      0.00      0.00         3
    Throm

In [13]:
# Classify one hand-entered sample.
# The model is selected by name rather than through the `pipeline` variable
# left over from the training loop, so this cell does not depend on which
# model the loop happened to visit last (it was 'Gradient Boosting').
# NOTE(review): these values match row 2 of the df.head() preview (label
# 'Thalasse') to the displayed precision, so this looks like a row from the
# dataset itself rather than unseen data; confirm before treating the result
# as evidence of generalisation.
sample_values = [
    0.4525390487229299, 0.1161349392345196, 0.544559880646533,
    0.4006402542200498, 0.2945381391269565, 0.3820213504718714,
    0.6252668173594422, 0.2951217167649673, 0.8683693808899848,
    0.026807746004545, 0.582170303033687, 0.5774232519535167,
    0.0457824859332021, 0.6636424898368296, 0.0889399007845421,
    0.4667952615580137, 0.3873320586791837, 0.4217630924518363,
    0.0071863049879385, 0.5069181607878194, 0.4317036047706637,
    0.4172947249526202, 0.7990740722024966, 0.7792079136984322,
]

# Values are listed in the same order as `features`. Wrapping them in a
# one-row DataFrame gives the pipeline the column names it was fitted with,
# instead of relying on an unnamed positional list.
sample = pd.DataFrame([sample_values], columns=features)
print(models['Gradient Boosting'].predict(sample))

['Thalasse']
