In [2]:
# COVID X-ray Classification Project Workflow (ML + DL)

# 1. Load & Preprocess Data
# ---------------------------------------------------
# - Load X-ray images and labels
# - Resize to fixed shape (e.g., 299x299)
# - Apply CLAHE, subtract mask
# - Normalize pixel values (0-1)
# - Convert to numpy arrays

In [13]:

# 2. Encode Labels
#
# Executes the helper notebook encode.ipynb through IPython's %run magic.
# The text recorded below this cell (a preview of the summary table and the
# info of the `label` column) is printed by that notebook.

# input for this cell:    ../data/processed/image_stats_summary.csv        (must exist before running this cell)
# output of this cell is: ../data/processed/image_stats_summary_enc.csv    (is created when running this cell)

# NOTE(review): all paths are relative, so presumably the kernel's working directory
# has to be the folder that contains this notebook and encode.ipynb -- confirm.
%run encode.ipynb


Unnamed: 0,label,file,min_pixel,max_pixel,mean_pixel,std_pixel,blank_image
0,COVID,COVID-1.png,0,255,145.896847,51.816632,False
1,COVID,COVID-2.png,32,238,150.64053,48.867698,False
2,COVID,COVID-3.png,0,255,140.598617,50.200679,False
3,COVID,COVID-4.png,1,255,116.714679,57.719773,False
4,COVID,COVID-5.png,34,246,167.835796,48.413412,False


<class 'pandas.core.series.Series'>
RangeIndex: 21165 entries, 0 to 21164
Series name: label
Non-Null Count  Dtype 
--------------  ----- 
21165 non-null  object
dtypes: object(1)
memory usage: 165.5+ KB


In [14]:
# 3. Build the DataFrame for further work

# input for this cell:    ../data/processed/image_stats_summary_enc.csv               (must exist before running this cell)
# output of this cell is: ../data/processed/df_xray_processed_normed_enc.csv          (is created when running this cell)
# NOTE(review): the extension used to be spelled ".cvs" in this comment; step 6 reads the derived files as ".csv" --
# confirm the exact file name written by create_dataframe.ipynb.

# The resulting DataFrame contains information (path, filename) about the processed and normalized images,
# together with their labels and encoded labels.
# The following processing has been applied to the images referenced in this DataFrame:
# - convert images and masks to grayscale
# - resize masks
# - convert masks to binary images
# - apply Gaussian Blur to images
# - apply CLAHE to images
# - add masks to previously changed images (grayscale, Gaussian Blur, CLAHE)
# - normalize images

# It refers to the images that were previously stored in the folder ../data/processed/normalized_xrays

# NOTE(review): the run recorded below emitted a pandas SettingWithCopyWarning for the assignment to
# df_processed['path'] inside create_dataframe.ipynb. Presumably df_processed is a slice of another DataFrame;
# taking an explicit .copy() before adding the column should silence it -- confirm in that notebook.
%run create_dataframe.ipynb




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed['path'] = df_processed['label'].apply(lambda x: os.path.join(dir_processed_xray, x))


In [15]:
# 4. Train-Test Split

# input for this cell:    ../data/processed/df_xray_processed_normed_enc.csv          (must exist before running this cell)
# output of this cell is: ../data/processed/df_xray_processed_normed_enc_train.csv    (is created when running this cell)
#                         ../data/processed/df_xray_processed_normed_enc_test.csv     (is created when running this cell)

# This cell splits the DataFrame that describes the preprocessed and normalized images (paths, labels, encoded labels) into a train and a test set.
# The label (target) columns are not separated from the other columns here.
# The target variable is separated later, because it is still needed by the following data augmentation step.
#
# The output recorded below shows 16932 train rows and 4233 test rows (out of 21165),
# with nearly identical label proportions in both sets.

%run train_test_split.ipynb


label
Normal             0.481550
Lung_Opacity       0.284054
COVID              0.170848
Viral Pneumonia    0.063548
Name: proportion, dtype: float64
label_enc
2    0.481550
1    0.284054
0    0.170848
3    0.063548
Name: proportion, dtype: float64
processed data size:  21165
train data size:  16932
test data size:  4233
----------------------------------------
distribution of labels for train set:
label_enc
2    0.481573
1    0.284018
0    0.170860
3    0.063548
Name: proportion, dtype: float64
----------------------------------------
distribution of labels for test set:
label_enc
2    0.481455
1    0.284196
0    0.170801
3    0.063548
Name: proportion, dtype: float64


In [None]:
# 5. Class Imbalance Fix (only on training data)

# input for this cell:    ../data/processed/df_xray_processed_normed_enc_train.csv         (must exist before running this cell)
# output of this cell is: ../data/processed/df_xray_train_norm_plus_augmented.csv          (is created when running this cell)


# Augmented data is created in the folder data/processed/augmented for the minority classes: ["COVID", "Viral Pneumonia", "Lung_Opacity"].
# Additionally, a CSV file 'df_xray_train_norm_plus_augmented.csv' is created that lists all images belonging to the training set
# (including the augmented ones) with their labels and encoded labels.

# Info: on the author's computer this cell took about 1:20 min to run.

%run data_augmentation.ipynb


# Optionally, run the following notebook to display examples of the processed & normalized images and their augmented versions:

# data_augmentation_viz.ipynb

  original_init(self, **validated_kwargs)


In [None]:
# 6. Feature Extraction for Train and Test Sets
# ---------------------------------------------------
# - Extract deep features (ResNet, VGG16) and traditional features (HOG, GLCM)
# - Convert processed X-ray images into feature vectors
# - Save features and labels as .npy files for both train and test sets

# Input for this step:
# - Train CSV: ../data/processed/df_xray_train_norm_plus_augmented.csv         # (from Step 5)
# - Test CSV : ../data/processed/df_xray_processed_normed_enc_test.csv         # (from Step 4)

# Output of this step:
# - labels_train.npy
# - labels_test.npy
# - ResNet_HOG.npy
# - ResNet_HOG_test.npy
# - ResNet_HOG_GLCM.npy
# - ResNet_HOG_GLCM_test.npy
# - combined_hog_vgg.npy
# - combined_hog_vgg_test.npy

# Note: These features are saved and also available in the linked Google Drive folders:
# - Train Features: https://drive.google.com/drive/folders/1tEVfGVhBFkIAV_ZpImKqU5HtjrJPQtgr?usp=drive_link
# - Test Features : https://drive.google.com/drive/folders/1dnpbcx8M8pjGWIvnXDOq7TqO1O8CKQ4m?usp=drive_link
# - Label Files   : 
#    - labels_train.npy: https://drive.google.com/file/d/1IcdiNVn529CIGXCPdv7PYEQcbXQScGWy/view?usp=drive_link
#    - labels_test.npy : https://drive.google.com/file/d/12adEjCtiv-OJfmlcmIckPN9m4qAzXELz/view?usp=drive_link

# Notebooks for this step:
%run feature_extraction.ipynb
%run feature_extraction_test.ipynb

# Load results (used in model training later):
X_train = np.load("ResNet_HOG.npy")
y_train = np.load("labels_train.npy")
X_test  = np.load("ResNet_HOG_test.npy")
y_test  = np.load("labels_test.npy")


In [None]:

# 7. ML Modeling + Hyperparameter Tuning
#
# Sets up three hyperparameter searches (grid, random, Bayesian) for an SVC.
# The searches are only constructed here; the fit calls further down are
# left commented out.
from scipy.stats import loguniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.svm import SVC

# Fixed seed so that the stochastic searches (random / Bayesian) are repeatable.
SEED = 42

model = SVC()

# GridSearch: exhaustive evaluation of all 3 x 2 = 6 combinations.
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# RandomSearch: C is drawn from a continuous log-uniform distribution over the
# same range as the grid. With a list-only space of 6 combinations and n_iter=6,
# sampling is done without replacement and would visit every combination,
# i.e. it would merely repeat the grid search.
param_dist = {'C': loguniform(0.1, 10), 'kernel': ['linear', 'rbf']}
random_search = RandomizedSearchCV(
    model, param_dist, cv=5, n_iter=6, scoring='accuracy', random_state=SEED
)

# BayesSearch: C is a real-valued dimension between 0.1 and 50.0.
search_spaces = {'C': (0.1, 50.0), 'kernel': ['linear', 'rbf']}
bayes_search = BayesSearchCV(
    model, search_spaces, n_iter=10, cv=5, scoring='accuracy', random_state=SEED
)

# Fit
# X_train / y_train are the arrays loaded in step 6; they are built from the
# augmented training CSV of step 5.
# NOTE(review): augmented variants of one source image may end up in different
# CV folds, which could make the cross-validated accuracy optimistic -- confirm.
# grid_search.fit(X_train, y_train)
# random_search.fit(X_train, y_train)
# bayes_search.fit(X_train, y_train)

# 8. Evaluate Models
# print(grid_search.best_params_)
# print(random_search.best_params_)
# print(bayes_search.best_params_)
# Predict, Confusion Matrix, Accuracy

# 9. Deep Learning Path (Instead of ML)
# - Use data augmentation on minority classes
# - Fine-tune pretrained CNN (EfficientNet, ResNet, etc.)
# - Evaluate on test set
# - Compare results to ML

# 10. Save Best Model
# from joblib import dump
# dump(grid_search.best_estimator_, 'best_model.joblib')

# OR (DL)
# `dl_model` stands for the fine-tuned network from step 9, not the SVC `model` above.
# dl_model.save("best_dl_model.h5")
