In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/anemia-types-classification/diagnosed_cbc_data_v4.csv


In [2]:
import warnings
# Suppress the FutureWarning
warnings.filterwarnings('ignore', category=FutureWarning)

In [3]:
df = pd.read_csv('/kaggle/input/anemia-types-classification/diagnosed_cbc_data_v4.csv')


In [4]:
df.head()


Unnamed: 0,WBC,LYMp,NEUTp,LYMn,NEUTn,RBC,HGB,HCT,MCV,MCH,MCHC,PLT,PDW,PCT,Diagnosis
0,10.0,43.2,50.1,4.3,5.0,2.77,7.3,24.2,87.7,26.3,30.1,189.0,12.5,0.17,Normocytic hypochromic anemia
1,10.0,42.4,52.3,4.2,5.3,2.84,7.3,25.0,88.2,25.7,20.2,180.0,12.5,0.16,Normocytic hypochromic anemia
2,7.2,30.7,60.7,2.2,4.4,3.97,9.0,30.5,77.0,22.6,29.5,148.0,14.3,0.14,Iron deficiency anemia
3,6.0,30.2,63.5,1.8,3.8,4.22,3.8,32.8,77.9,23.2,29.8,143.0,11.3,0.12,Iron deficiency anemia
4,4.2,39.1,53.7,1.6,2.3,3.93,0.4,316.0,80.6,23.9,29.7,236.0,12.8,0.22,Normocytic hypochromic anemia


In [5]:
def get_df_info(df):
    print("\n\033[1mShape of DataFrame:\033[0m ", df.shape)
    print("\n\033[1mColumns in DataFrame:\033[0m ", df.columns.to_list())
    print("\n\033[1mData types of columns:\033[0m\n", df.dtypes)
    
    print("\n\033[1mInformation about DataFrame:\033[0m")
    df.info()
    
    print("\n\033[1mNumber of unique values in each column:\033[0m")
    for col in df.columns:
        print(f"\033[1m{col}\033[0m: {df[col].nunique()}")
        
    print("\n\033[1mNumber of null values in each column:\033[0m\n", df.isnull().sum())
    
    print("\n\033[1mNumber of duplicate rows:\033[0m ", df.duplicated().sum())
    
    print("\n\033[1mDescriptive statistics of DataFrame:\033[0m\n", df.describe().transpose())

# Call the function
get_df_info(df)


[1mShape of DataFrame:[0m  (1281, 15)

[1mColumns in DataFrame:[0m  ['WBC', 'LYMp', 'NEUTp', 'LYMn', 'NEUTn', 'RBC', 'HGB', 'HCT', 'MCV', 'MCH', 'MCHC', 'PLT', 'PDW', 'PCT', 'Diagnosis']

[1mData types of columns:[0m
 WBC          float64
LYMp         float64
NEUTp        float64
LYMn         float64
NEUTn        float64
RBC          float64
HGB          float64
HCT          float64
MCV          float64
MCH          float64
MCHC         float64
PLT          float64
PDW          float64
PCT          float64
Diagnosis     object
dtype: object

[1mInformation about DataFrame:[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   WBC        1281 non-null   float64
 1   LYMp       1281 non-null   float64
 2   NEUTp      1281 non-null   float64
 3   LYMn       1281 non-null   float64
 4   NEUTn      1281 non-null   float64
 5   RBC        1281 

In [6]:
df.drop_duplicates(inplace=True)


In [7]:
# Calculate the mean of the positive values in the 'HGB' column
mean_HGB = df[df['HGB'] > 0]['HGB'].mean()

# Replace negative values in the 'HGB' column with the mean
df['HGB'] = np.where(df['HGB'] < 0, mean_HGB, df['HGB'])

# Calculate the mean of the positive values in the 'MCV' column
mean_MCV = df[df['MCV'] > 0]['MCV'].mean()

# Replace negative values in the 'MCV' column with the mean
df['MCV'] = np.where(df['MCV'] < 0, mean_MCV, df['MCV'])

In [8]:
# Divide the dataframe into features (X) and target (y)
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

In [10]:
def apply_models(X, y):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Initialize the LabelEncoder
    le = LabelEncoder()

    # Fit the encoder on the entire dataset
    y = le.fit_transform(y)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training data and transform both training and test data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the models
    models = {
        'LogisticRegression': OneVsRestClassifier(LogisticRegression()),
        'SVC': OneVsRestClassifier(SVC()),
        'DecisionTree': OneVsRestClassifier(DecisionTreeClassifier()),
        'RandomForest': OneVsRestClassifier(RandomForestClassifier()),
        'ExtraTrees': OneVsRestClassifier(ExtraTreesClassifier()),
        'AdaBoost': OneVsRestClassifier(AdaBoostClassifier()),
        'GradientBoost': OneVsRestClassifier(GradientBoostingClassifier()),
        'XGBoost': OneVsRestClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        'LightGBM': OneVsRestClassifier(LGBMClassifier(verbose=-1)),
        'CatBoost': OneVsRestClassifier(CatBoostClassifier(verbose=0))  # added CatBoost here
    }

    # Initialize a dictionary to hold the performance of each model
    model_performance = {}

    # Apply each model
    for model_name, model in models.items():
        print(f"\n\033[1mClassification with {model_name}:\033[0m\n{'-' * 30}")
        
        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = model.predict(X_test)

        # Convert the numerical predictions back to the original categorical names
        y_test_orig = le.inverse_transform(y_test)
        y_pred_orig = le.inverse_transform(y_pred)

        # Calculate the accuracy and f1 score
        accuracy = accuracy_score(y_test_orig, y_pred_orig)
        f1 = f1_score(y_test_orig, y_pred_orig, average='weighted')

        # Store the performance in the dictionary
        model_performance[model_name] = (accuracy, f1)

        # Print the accuracy score
        print("\033[1m**Accuracy**:\033[0m\n", accuracy)

        # Print the confusion matrix
        print("\n\033[1m**Confusion Matrix**:\033[0m\n", confusion_matrix(y_test_orig, y_pred_orig))

        # Print the classification report
        print("\n\033[1m**Classification Report**:\033[0m\n", classification_report(y_test_orig, y_pred_orig))

    # Sort the models based on f1 score and pick the top 3
    top_3_models = sorted(model_performance.items(), key=lambda x: x[1][1], reverse=True)[:3]
    print("\n\033[1mTop 3 Models based on Accuracy & F1 Score:\033[0m\n", top_3_models)

In [11]:
apply_models(X, y)



[1mClassification with LogisticRegression:[0m
------------------------------
[1m**Accuracy**:[0m
 0.6396761133603239

[1m**Confusion Matrix**:[0m
 [[54  0  0  0  0  7  0  0  0]
 [ 1 34  0  0  0  2  1  0  0]
 [ 3  1  2  0  0  1  0  0  0]
 [ 0  0  0  0  0  3  0  0  1]
 [ 0  0  0  0  0  1  2  0  0]
 [ 6  2  0  0  0 33  9  0  3]
 [17  0  1  0  0  2 29  0  4]
 [ 1  1  0  0  0  1  6  0  0]
 [ 7  0  0  0  0  5  1  0  6]]

[1m**Classification Report**:[0m
                                 precision    recall  f1-score   support

                       Healthy       0.61      0.89      0.72        61
        Iron deficiency anemia       0.89      0.89      0.89        38
                      Leukemia       0.67      0.29      0.40         7
Leukemia with thrombocytopenia       0.00      0.00      0.00         4
             Macrocytic anemia       0.00      0.00      0.00         3
 Normocytic hypochromic anemia       0.60      0.62      0.61        53
Normocytic normochromic anemia   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1m**Accuracy**:[0m
 0.757085020242915

[1m**Confusion Matrix**:[0m
 [[57  0  0  0  0  3  1  0  0]
 [ 0 33  0  0  0  4  1  0  0]
 [ 2  1  4  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  0  0  2]
 [ 0  0  0  0  0  3  0  0  0]
 [ 2  3  0  0  0 41  7  0  0]
 [ 0  0  2  0  0  7 41  0  3]
 [ 0  0  0  0  0  1  8  0  0]
 [ 4  0  0  0  0  5  0  0 10]]

[1m**Classification Report**:[0m
                                 precision    recall  f1-score   support

                       Healthy       0.88      0.93      0.90        61
        Iron deficiency anemia       0.89      0.87      0.88        38
                      Leukemia       0.57      0.57      0.57         7
Leukemia with thrombocytopenia       1.00      0.25      0.40         4
             Macrocytic anemia       0.00      0.00      0.00         3
 Normocytic hypochromic anemia       0.64      0.77      0.70        53
Normocytic normochromic anemia       0.71      0.77      0.74        53
       Other microcytic anemia       0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1m**Accuracy**:[0m
 0.979757085020243

[1m**Confusion Matrix**:[0m
 [[61  0  0  0  0  0  0  0  0]
 [ 0 38  0  0  0  0  0  0  0]
 [ 0  0  7  0  0  0  0  0  0]
 [ 0  0  2  2  0  0  0  0  0]
 [ 0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0 52  1  0  0]
 [ 0  0  1  0  0  0 52  0  0]
 [ 0  0  0  0  0  0  0  9  0]
 [ 1  0  0  0  0  0  0  0 18]]

[1m**Classification Report**:[0m
                                 precision    recall  f1-score   support

                       Healthy       0.98      1.00      0.99        61
        Iron deficiency anemia       1.00      1.00      1.00        38
                      Leukemia       0.70      1.00      0.82         7
Leukemia with thrombocytopenia       1.00      0.50      0.67         4
             Macrocytic anemia       1.00      1.00      1.00         3
 Normocytic hypochromic anemia       1.00      0.98      0.99        53
Normocytic normochromic anemia       0.98      0.98      0.98        53
       Other microcytic anemia       1.00 