In [2]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import hog
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import joblib
import json
import random
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold
from torchvision.transforms import ToPILImage
from skimage.feature import hog

In [5]:
# Paths
root_path = os.path.join('..', 'dataset', 'malaria')
train_json_path = os.path.join(root_path, 'training.json')
test_json_path = os.path.join(root_path, 'test.json')
image_path = os.path.join(root_path, 'images')
print("Root Path:", root_path)
print("Train JSON Path:", train_json_path)
print("Test JSON Path:", test_json_path)
print("Image Path:", image_path)

Root Path: ..\dataset\malaria
Train JSON Path: ..\dataset\malaria\training.json
Test JSON Path: ..\dataset\malaria\test.json
Image Path: ..\dataset\malaria\images


In [6]:
# paths for individual images
images_by_class = os.path.join(root_path, 'resized_images_by_classes')
rbcs_path = os.path.join(images_by_class, 'red_blood_cell')

In [19]:
# Dynamically detect classes from subdirectories
classes = [d for d in os.listdir(images_by_class) if os.path.isdir(os.path.join(images_by_class, d))]
print("Detected classes:", classes)

# Collect image paths and labels
new_data_for_df = []
for dirpath, dirnames, filenames in os.walk(images_by_class):
    label = os.path.basename(dirpath)
    if label in classes:  # only keep valid class dirs
        for filename in filenames:
            if filename.lower().endswith(('.png', '.jpg', '.jpeg')):  # safeguard
                full_image_path = os.path.join(dirpath, filename)
                new_data_for_df.append({'image_path': full_image_path, 'label': label})

# Create DataFrame
df = pd.DataFrame(new_data_for_df)
print(df.head(), "\nTotal images:", len(df))
print("Class distribution:\n", df['label'].value_counts())


Detected classes: ['difficult', 'gametocyte', 'leukocyte', 'red_blood_cell', 'ring', 'schizont', 'trophozoite']
                                          image_path      label
0  ..\dataset\malaria\resized_images_by_classes\d...  difficult
1  ..\dataset\malaria\resized_images_by_classes\d...  difficult
2  ..\dataset\malaria\resized_images_by_classes\d...  difficult
3  ..\dataset\malaria\resized_images_by_classes\d...  difficult
4  ..\dataset\malaria\resized_images_by_classes\d...  difficult 
Total images: 80111
Class distribution:
 label
red_blood_cell    77418
trophozoite        1473
difficult           441
ring                353
schizont            179
gametocyte          144
leukocyte           103
Name: count, dtype: int64


In [7]:
# --- Feature extractors ---
def extract_color_histogram(image_path, bins=(8, 8, 8)):
    image = cv2.imread(image_path)
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

def extract_hog_features(image_path, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    image = cv2.imread(image_path)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized_image = cv2.resize(gray_image, (128, 128))
    features = hog(resized_image, pixels_per_cell=pixels_per_cell,
                   cells_per_block=cells_per_block, visualize=False)
    return features

In [8]:
# --- Choose one ---
feature_extractor = extract_hog_features

features_list, labels_list = [], []

In [None]:

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    # print("row is:", row)
    current_path = os.path.normpath(row['image_path'])
    # print("current path is:", current_path)
    if not os.path.isfile(current_path):  # safety
        continue
    label = row['label']
    features = feature_extractor(current_path)
    features_list.append(features)
    labels_list.append(label)

X = np.array(features_list)
y = np.array(labels_list)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42, stratify=y_encoded
)

# --- SVM ---
print("\n--- Training SVM ---")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm_model = SVC(kernel='rbf', C=1.0, random_state=42, class_weight='balanced', probability=True)
svm_model.fit(X_train_scaled, y_train)
svm_preds = svm_model.predict(X_test_scaled)
print("\n--- SVM Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, svm_preds):.4f}")
print(classification_report(y_test, svm_preds, target_names=le.classes_))


  0%|          | 0/80111 [00:00<?, ?it/s]

100%|██████████| 80111/80111 [28:46<00:00, 46.39it/s]  



--- Training SVM ---

--- SVM Evaluation ---
Accuracy: 0.9733
                precision    recall  f1-score   support

     difficult       0.32      0.16      0.22       110
    gametocyte       0.00      0.00      0.00        36
     leukocyte       0.82      0.54      0.65        26
red_blood_cell       0.99      0.99      0.99     19355
          ring       0.62      0.18      0.28        88
      schizont       0.33      0.02      0.04        45
   trophozoite       0.51      0.64      0.57       368

      accuracy                           0.97     20028
     macro avg       0.51      0.36      0.39     20028
  weighted avg       0.97      0.97      0.97     20028



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Save model + scaler + label encoder
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [14]:
import joblib

# Load saved objects
svm_model = joblib.load("svm_model.pkl")
scaler = joblib.load("scaler.pkl")
le = joblib.load("label_encoder.pkl")

# Example: predicting on new images
new_im = os.path.join(image_path, '0a7bfa8a-ee52-4f7a-b9c5-2919ecfa93ef.png')
print(root_path)
new_im = os.path.join(root_path,'resized_images_by_classes', 'gametocyte', '0d3c2521-da55-4df5-bf06-56b3bd4fb2fc_cell_29_gametocyte.png')
features = feature_extractor(new_im)
features_scaled = scaler.transform([features])
pred = svm_model.predict(features_scaled)
pred_label = le.inverse_transform(pred)

print("Predicted class:", pred_label[0])


..\dataset\malaria
Predicted class: gametocyte


In [None]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from tqdm import tqdm
import pandas as pd

# Assume df is a pre-loaded DataFrame with 'image_path' and 'label' columns
# df = pd.read_csv('your_data.csv') 

def extract_color_histogram(image_path, bins=(8, 8, 8)):
    # ... (function body as you provided) ...
    image = cv2.imread(image_path)
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

def extract_hog_features(image_path, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    # ... (function body as you provided) ...
    image = cv2.imread(image_path)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized_image = cv2.resize(gray_image, (128, 128))
    features = hog(resized_image, pixels_per_cell=pixels_per_cell,
                   cells_per_block=cells_per_block, visualize=False)
    return features

# Choose a feature extractor
feature_extractor = extract_hog_features

features_list = []
labels_list = []

# Loop over the image paths in the DataFrame
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    current_path = os.path.normpath(row['image_path']) # Use 'image_path' or the correct column name
    if os.path.isdir(current_path):
        continue
    label = row['label']
    features = feature_extractor(current_path)
    features_list.append(features)
    labels_list.append(label)

# Convert lists to NumPy arrays
X = np.array(features_list)
y = np.array(labels_list)

In [10]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.svm import SVC

# Create an SVM classifier
svm_model = SVC(kernel='linear') # 'linear' is a good starting point

# Train the SVM model on the training data
svm_model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report

# Predict the labels on the test set
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Print a full classification report
print(classification_report(y_test, y_pred))


NameError: name 'X' is not defined

In [None]:
# Step 1: Encode string labels into numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42, stratify=y_encoded)

# --- Support Vector Machine (SVM) ---
print("\n--- Training SVM ---")
# SVMs are scale-sensitive, so scaling is recommended
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model = SVC(kernel='rbf', C=1.0, random_state=42, class_weight='balanced', probability=True)
svm_model.fit(X_train_scaled, y_train)
svm_preds = svm_model.predict(X_test_scaled)

print("\n--- SVM Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, svm_preds):.4f}")
print(classification_report(y_test, svm_preds, target_names=le.classes_))

NameError: name 'LabelEncoder' is not defined