In [1]:
# importing all the required Libraries
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from keras.applications import InceptionV3, DenseNet121, MobileNetV2,Xception
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import GlobalAveragePooling2D
from keras.models import Model

# Data Loading and Preprocessing

In [2]:
# Define the dataset_path
dataset_dir = 'cell_images'

In [3]:
# List all subdirectories (each representing a class)
classes = os.listdir(dataset_dir)

# Initialize a dictionary to store the count of images in each class
class_count = {class_name: len(os.listdir(os.path.join(dataset_dir, class_name))) for class_name in classes}

# Print the results
for class_name, num_images in class_count.items():
    print(f"Number of {class_name} Images: {num_images}")

# Print the total number of classes and images
total_classes = len(classes)
total_images = sum(class_count.values())
print(f"\nTotal Classes: {total_classes}, Total Images: {total_images}")

Number of Parasitized Images: 13779
Number of Uninfected Images: 13779

Total Classes: 2, Total Images: 27558


In [4]:
# Define the paths to the folders containing the images
uninfected_dir = os.path.join(dataset_dir, 'Uninfected')
parasitized_dir = os.path.join(dataset_dir, 'Parasitized')

# Function to load images and labels
def load_images(folder, label):
    return [(os.path.join(folder, filename), label) for filename in os.listdir(folder) if filename.endswith(".png")]

# Load images and labels from both folders
file_paths_labels = load_images(uninfected_dir, "Uninfected") + load_images(parasitized_dir, "Parasitized")

# Create a DataFrame from the collected data
data = pd.DataFrame(file_paths_labels, columns=["file_path", "label"])

# Display the first few rows of the DataFrame
print(data.head())

# Optionally, you can save the DataFrame to a CSV file
data.to_csv("image_data.csv", index=False)

                                           file_path       label
0  cell_images\Uninfected\C100P61ThinF_IMG_201509...  Uninfected
1  cell_images\Uninfected\C100P61ThinF_IMG_201509...  Uninfected
2  cell_images\Uninfected\C100P61ThinF_IMG_201509...  Uninfected
3  cell_images\Uninfected\C100P61ThinF_IMG_201509...  Uninfected
4  cell_images\Uninfected\C100P61ThinF_IMG_201509...  Uninfected


In [5]:
data['label'].value_counts()

label
Uninfected     13779
Parasitized    13779
Name: count, dtype: int64

In [6]:
# Load the dataset
data = pd.read_csv("image_data.csv")

# Encode labels
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

In [7]:
# Splitting the data into training and testing sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])

In [8]:
# Define function for creating data generator
def create_data_generator(preprocessing_function, dataframe, x_col, y_col, batch_size=64, input_shape=(224, 224)):
    datagen = ImageDataGenerator(preprocessing_function=preprocessing_function)
    generator = datagen.flow_from_dataframe(
        dataframe=dataframe,
        x_col=x_col,
        y_col=y_col,
        target_size=input_shape,
        batch_size=batch_size,
        class_mode='binary',
        shuffle=False
    )
    return generator

# Define function for feature extraction
def extract_features(model, generator):
    features = model.predict(generator, verbose=1)
    labels = generator.classes
    return features, labels

def train_and_evaluate_model(model, train_features, train_labels, test_features, test_labels):
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    report = classification_report(test_labels, predictions)
    return accuracy, report

# Classifiers & Hyperparameter Tuning

In [9]:
# Define classifiers with default parameters
classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "k-NN": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

# Define hyperparameters grid for each classifier
param_grids = {
    "SVM": {'C': [0.1, 1, 10]},
    "Random Forest": {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    "XGBoost": {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]},
    "k-NN": {'n_neighbors': [3, 5, 10]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
}

# Model Definition, Feature Extraction, Model Training and Evaluation

## MobileNetV2 for feature extraction

In [10]:
# Define MobileNetV2 model
mobilenetv2_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Feature extraction for MobileNetV2
mobilenetv2_model.trainable = False
gap_mobilenetv2 = GlobalAveragePooling2D()(mobilenetv2_model.output)
feature_extractor_mobilenetv2 = Model(inputs=mobilenetv2_model.input, outputs=gap_mobilenetv2)

In [11]:
# Create data generators
train_generator_mobilenetv2 = create_data_generator(preprocess_input, train_df, "file_path", "label", 64, (224, 224))
test_generator_mobilenetv2 = create_data_generator(preprocess_input, test_df, "file_path", "label", 64, (224, 224))

Found 22046 validated image filenames belonging to 2 classes.
Found 5512 validated image filenames belonging to 2 classes.


In [12]:
# Extract features
train_features_mobilenetv2, train_labels_mobilenetv2 = extract_features(feature_extractor_mobilenetv2, train_generator_mobilenetv2)
test_features_mobilenetv2, test_labels_mobilenetv2 = extract_features(feature_extractor_mobilenetv2, test_generator_mobilenetv2)



In [13]:
# Loop through each model
for model_name, (train_features, train_labels, test_features, test_labels) in {
    "MobileNetV2": (train_features_mobilenetv2, train_labels_mobilenetv2, test_features_mobilenetv2, test_labels_mobilenetv2),
    }.items():
    print(f"Model: {model_name}")

    # Perform hyperparameter tuning for each classifier
    for classifier_name, clf in classifiers.items():
        print(f"Tuning hyperparameters for {classifier_name}...")
        param_grid = param_grids[classifier_name]
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(train_features, train_labels)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        print(f"Best parameters: {best_params}")
        print(f"Best cross-validation accuracy: {best_score:.2f}")

        # Train classifier with best parameters
        best_clf = grid_search.best_estimator_
        best_clf.fit(train_features, train_labels)

        # Evaluate classifier
        test_accuracy, report = train_and_evaluate_model(best_clf, train_features, train_labels, test_features, test_labels)
        print(f"Test accuracy: {test_accuracy:.2f}")

        # Print classification report
        print(f"Classification Report for {classifier_name}:\n{report}\n")

Model: MobileNetV2
Tuning hyperparameters for SVM...
Best parameters: {'C': 10}
Best cross-validation accuracy: 0.95
Test accuracy: 0.95
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      2756
           1       0.94      0.96      0.95      2756

    accuracy                           0.95      5512
   macro avg       0.95      0.95      0.95      5512
weighted avg       0.95      0.95      0.95      5512


Tuning hyperparameters for Random Forest...
Best parameters: {'max_depth': 20, 'n_estimators': 200}
Best cross-validation accuracy: 0.93
Test accuracy: 0.93
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      2756
           1       0.92      0.93      0.93      2756

    accuracy                           0.93      5512
   macro avg       0.93      0.93      0.93      5512
weighted avg       0.93      0.93 

## DenseNet121 as a Feature Extractor

In [10]:
# Define DenseNet121 model
densenet121_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Feature extraction for DenseNet121 model
densenet121_model.trainable = False
gap_densenet121 = GlobalAveragePooling2D()(densenet121_model.output)
feature_extractor_densenet121 = Model(inputs=densenet121_model.input, outputs=gap_densenet121)

In [11]:
# Create data generators
train_generator_densenet121 = create_data_generator(preprocess_input, train_df, "file_path", "label", 64, (224, 224))
test_generator_densenet121 = create_data_generator(preprocess_input, test_df, "file_path", "label", 64, (224, 224))

Found 22046 validated image filenames belonging to 2 classes.
Found 5512 validated image filenames belonging to 2 classes.


In [12]:
# Extract features
train_features_densenet121, train_labels_densenet121 = extract_features(feature_extractor_densenet121, train_generator_densenet121)
test_features_densenet121, test_labels_densenet121 = extract_features(feature_extractor_densenet121, test_generator_densenet121)



In [13]:
# Loop through each model
for model_name, (train_features, train_labels, test_features, test_labels) in {
    "DenseNet121": (train_features_densenet121, train_labels_densenet121, test_features_densenet121, test_labels_densenet121)
    }.items():
    print(f"Model: {model_name}")

    # Perform hyperparameter tuning for each classifier
    for classifier_name, clf in classifiers.items():
        print(f"Tuning hyperparameters for {classifier_name}...")
        param_grid = param_grids[classifier_name]
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(train_features, train_labels)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        print(f"Best parameters: {best_params}")
        print(f"Best cross-validation accuracy: {best_score:.2f}")

        # Train classifier with best parameters
        best_clf = grid_search.best_estimator_
        best_clf.fit(train_features, train_labels)

        # Evaluate classifier
        test_accuracy, report = train_and_evaluate_model(best_clf, train_features, train_labels, test_features, test_labels)
        print(f"Test accuracy: {test_accuracy:.2f}")

        # Print classification report
        print(f"Classification Report for {classifier_name}:\n{report}\n")

Model: DenseNet121
Tuning hyperparameters for SVM...
Best parameters: {'C': 10}
Best cross-validation accuracy: 0.95
Test accuracy: 0.95
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      2756
           1       0.94      0.96      0.95      2756

    accuracy                           0.95      5512
   macro avg       0.95      0.95      0.95      5512
weighted avg       0.95      0.95      0.95      5512


Tuning hyperparameters for Random Forest...
Best parameters: {'max_depth': None, 'n_estimators': 200}
Best cross-validation accuracy: 0.93
Test accuracy: 0.93
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      2756
           1       0.93      0.93      0.93      2756

    accuracy                           0.93      5512
   macro avg       0.93      0.93      0.93      5512
weighted avg       0.93      0.9

## InceptionV3 for feature extraction

In [14]:
# Define InceptionV3 model
inceptionv3_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(299, 299, 3))

# Feature extraction for InceptionV3
inceptionv3_model.trainable = False
gap_inceptionv3 = GlobalAveragePooling2D()(inceptionv3_model.output)
feature_extractor_inceptionv3 = Model(inputs=inceptionv3_model.input, outputs=gap_inceptionv3)

In [15]:
# Create data generators
train_generator_inceptionv3 = create_data_generator(preprocess_input, train_df, "file_path", "label", 64, (299, 299))
test_generator_inceptionv3 = create_data_generator(preprocess_input, test_df, "file_path", "label", 64, (299, 299))

Found 22046 validated image filenames belonging to 2 classes.
Found 5512 validated image filenames belonging to 2 classes.


In [16]:
# Extract features
train_features_inceptionv3, train_labels_inceptionv3 = extract_features(feature_extractor_inceptionv3, train_generator_inceptionv3)
test_features_inceptionv3, test_labels_inceptionv3 = extract_features(feature_extractor_inceptionv3, test_generator_inceptionv3)



In [17]:
# Loop through each model
for model_name, (train_features, train_labels, test_features, test_labels) in {
    "InceptionV3": (train_features_inceptionv3, train_labels_inceptionv3, test_features_inceptionv3, test_labels_inceptionv3)
    }.items():
    print(f"Model: {model_name}")

    # Perform hyperparameter tuning for each classifier
    for classifier_name, clf in classifiers.items():
        print(f"Tuning hyperparameters for {classifier_name}...")
        param_grid = param_grids[classifier_name]
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(train_features, train_labels)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        print(f"Best parameters: {best_params}")
        print(f"Best cross-validation accuracy: {best_score:.2f}")

        # Train classifier with best parameters
        best_clf = grid_search.best_estimator_
        best_clf.fit(train_features, train_labels)

        # Evaluate classifier
        test_accuracy, report = train_and_evaluate_model(best_clf, train_features, train_labels, test_features, test_labels)
        print(f"Test accuracy: {test_accuracy:.2f}")

        # Print classification report
        print(f"Classification Report for {classifier_name}:\n{report}\n")

Model: InceptionV3
Tuning hyperparameters for SVM...
Best parameters: {'C': 10}
Best cross-validation accuracy: 0.95
Test accuracy: 0.94
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2756
           1       0.93      0.96      0.94      2756

    accuracy                           0.94      5512
   macro avg       0.94      0.94      0.94      5512
weighted avg       0.94      0.94      0.94      5512


Tuning hyperparameters for Random Forest...
Best parameters: {'max_depth': 20, 'n_estimators': 200}
Best cross-validation accuracy: 0.93
Test accuracy: 0.92
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      2756
           1       0.91      0.93      0.92      2756

    accuracy                           0.92      5512
   macro avg       0.92      0.92      0.92      5512
weighted avg       0.92      0.92 

## Xception as a Feature Extractor

In [18]:
# Define Xception model
xception_model = Xception(weights='imagenet', include_top=False, input_shape=(299, 299, 3))

# Feature extraction for Xception model
xception_model.trainable = False
gap_xception = GlobalAveragePooling2D()(xception_model.output)
feature_extractor_xception = Model(inputs=xception_model.input, outputs=gap_xception)

In [19]:
# Create data generators
train_generator_xception = create_data_generator(preprocess_input, train_df, "file_path", "label", 64, (299, 299))
test_generator_xception = create_data_generator(preprocess_input, test_df, "file_path", "label", 64, (299, 299))

Found 22046 validated image filenames belonging to 2 classes.
Found 5512 validated image filenames belonging to 2 classes.


In [20]:
# Extract features
train_features_xception, train_labels_xception = extract_features(feature_extractor_xception, train_generator_xception)
test_features_xception, test_labels_xception = extract_features(feature_extractor_xception, test_generator_xception)



In [21]:
# Loop through each model
for model_name, (train_features, train_labels, test_features, test_labels) in {
    "Xception": (train_features_xception, train_labels_xception, test_features_xception, test_labels_xception)
    }.items():
    print(f"Model: {model_name}")

    # Perform hyperparameter tuning for each classifier
    for classifier_name, clf in classifiers.items():
        print(f"Tuning hyperparameters for {classifier_name}...")
        param_grid = param_grids[classifier_name]
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(train_features, train_labels)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        print(f"Best parameters: {best_params}")
        print(f"Best cross-validation accuracy: {best_score:.2f}")

        # Train classifier with best parameters
        best_clf = grid_search.best_estimator_
        best_clf.fit(train_features, train_labels)

        # Evaluate classifier
        test_accuracy, report = train_and_evaluate_model(best_clf, train_features, train_labels, test_features, test_labels)
        print(f"Test accuracy: {test_accuracy:.2f}")

        # Print classification report
        print(f"Classification Report for {classifier_name}:\n{report}\n")

Model: Xception
Tuning hyperparameters for SVM...
Best parameters: {'C': 1}
Best cross-validation accuracy: 0.95
Test accuracy: 0.95
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      2756
           1       0.94      0.97      0.95      2756

    accuracy                           0.95      5512
   macro avg       0.95      0.95      0.95      5512
weighted avg       0.95      0.95      0.95      5512


Tuning hyperparameters for Random Forest...
Best parameters: {'max_depth': None, 'n_estimators': 200}
Best cross-validation accuracy: 0.93
Test accuracy: 0.92
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      2756
           1       0.92      0.93      0.93      2756

    accuracy                           0.92      5512
   macro avg       0.92      0.92      0.92      5512
weighted avg       0.92      0.92   