# FEATURE EXTRACTION FOR TRAIN SET

In [13]:
# Step-by-step plan (after augmentation):
# 1. Start with the augmented training set.
#    It includes the original + augmented images (especially from the minority class).
#    The test set remains untouched.
# 2. Extract features from the augmented training set.
#    We use:
#    - Deep features: e.g. from ResNet, VGG
#    - Handcrafted features: e.g. HOG, GLCM
#    - Or a combination of both (e.g. ResNet + HOG + GLCM).

import pandas as pd
import numpy as np
import cv2
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from skimage.feature import hog
import umap
import tensorflow as tf
import os
import matplotlib.pyplot as plt  # Add the missing import

# Show the working directory: the relative paths below are resolved against it.
print(os.getcwd())

# Data folder layout: the processed files live in <base_path>/processed.
base_path     = "../data"
base_path_out = os.path.join(base_path, "processed")

# Absolute location of the training CSV (normalised + augmented images,
# going by the file name).
train_csv_path = os.path.abspath(
    os.path.join(base_path_out, "df_xray_train_norm_plus_augmented.csv")
)
print("Loading images from:", train_csv_path)

# 1) Load the CSV file
df = pd.read_csv(train_csv_path)

# Build each image's location by joining its folder (`path`) with its
# file name (`file`).
df['image_path'] = [
    os.path.join(folder, filename)
    for folder, filename in zip(df['path'], df['file'])
]
df.head()

c:\Users\arfin\Downloads\covid xray\mar25-bds_analysis-of-covid-19-chest-x-rays\notebooks
Loading images from: c:\Users\arfin\Downloads\covid xray\mar25-bds_analysis-of-covid-19-chest-x-rays\data\processed\df_xray_train_norm_plus_augmented.csv


Unnamed: 0,index,label,file,label_enc,path,image_path
0,0,Normal,NORMAL-7970.png,2,..\\data\\processed\normalized_xrays\Normal,..\\data\\processed\normalized_xrays\Normal\NO...
1,1,COVID,COVID-1372.png,0,..\\data\\processed\normalized_xrays\COVID,..\\data\\processed\normalized_xrays\COVID\COV...
2,2,Normal,NORMAL-354.png,2,..\\data\\processed\normalized_xrays\Normal,..\\data\\processed\normalized_xrays\Normal\NO...
3,3,Viral Pneumonia,Viral Pneumonia-569.png,3,..\\data\\processed\normalized_xrays\Viral Pne...,..\\data\\processed\normalized_xrays\Viral Pne...
4,4,Normal,NORMAL-4703.png,2,..\\data\\processed\normalized_xrays\Normal,..\\data\\processed\normalized_xrays\Normal\NO...


In [14]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# 2) Load the images listed in `df` and build aligned image / label arrays.

# Step 1: Use the current working directory as the base for the relative
# folders stored in the CSV.
project_base_dir = os.getcwd()

# Step 2: Build the full path of every image from
#   project_base_dir (base path),
#   row['path']      (relative folder, e.g. data/processed/normalized_xrays/COVID),
#   row['file']      (e.g. COVID-1372.png),
# and clean it up with os.path.normpath. The result overwrites df['image_path'].
df['image_path'] = df.apply(
    lambda row: os.path.normpath(os.path.join(project_base_dir, row['path'], row['file'])),
    axis=1,
)

# Step 3: Load every image as single-channel grayscale.
# cv2.imread returns None when a file cannot be read; such rows are skipped,
# and `loaded_mask` remembers which rows actually produced an image so the
# labels can be filtered the same way below.
image_data = []
loaded_mask = np.zeros(len(df), dtype=bool)
for row_pos, path in enumerate(tqdm(df['image_path'], desc="Loading images")):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is not None:
        image_data.append(img)
        loaded_mask[row_pos] = True
    else:
        print(f"Failed to load image: {path}")

# Step 4: Stack the images into one uint8 array (requires all images to
# share the same shape).
X_gray = np.array(image_data, dtype=np.uint8)

# Step 5: Extract labels, keeping only the rows whose image was loaded.
# Without this filter a single unreadable file would leave y_labels longer
# than X_gray and shift every following label onto the wrong image.
y_labels = df['label_enc'].to_numpy()[loaded_mask]

# Step 6: Inspect results
failed_count = int((~loaded_mask).sum())
if failed_count:
    print(f"Skipped {failed_count} unreadable image(s); their labels were dropped too.")
print("X_gray shape:", X_gray.shape)
print("y_labels shape:", y_labels.shape)


Loading images: 100%|██████████| 35059/35059 [01:12<00:00, 481.83it/s]


X_gray shape: (35059, 299, 299)
y_labels shape: (35059,)


In [19]:
# Write the encoded training labels to disk, then read them back to confirm
# the saved array has the expected shape.
labels_file = "labels_train.npy"
np.save(labels_file, y_labels)
labels_train = np.load(labels_file)
print("Shape of labels_train:", labels_train.shape)


Shape of labels_train: (35059,)


# VGG, HOG, COMBINED_VGG_HOG = VGG + HOG

In [4]:
# 3) VGG16 backbone without the classifier head; global average pooling
#    yields one 512-d vector per image.
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    pooling='avg',
    input_shape=(224, 224, 3),
)

# 4) VGG16 features, computed batch by batch so that only one batch of
#    float32 3-channel images exists at a time.
batch_size = 32
vgg_chunks = []

for lo in range(0, len(X_gray), batch_size):
    # Slicing stops at the end of the array, so the last batch may be shorter.
    gray_chunk = X_gray[lo:lo + batch_size]

    # Resize every grayscale image to the network input size and replicate
    # it over three channels: (batch, 224, 224, 3), float32.
    rgb_chunk = np.stack([
        cv2.cvtColor(cv2.resize(gray, (224, 224)), cv2.COLOR_GRAY2BGR)
        for gray in gray_chunk
    ]).astype('float32')

    # Apply the VGG16 preprocessing, then run the network on this batch only.
    vgg_chunks.append(vgg.predict(preprocess_input(rgb_chunk), verbose=0))  # (batch, 512)

vgg_feats = np.vstack(vgg_chunks)  # (n_images, 512)
print("VGG16 features:", vgg_feats.shape)

# 5) HOG descriptor of every image, computed on a 128x128 resize.
hog_feats = np.array([
    hog(
        cv2.resize(gray, (128, 128)),
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        feature_vector=True,
    )
    for gray in X_gray
])
print("HOG features:", hog_feats.shape)

# 6) Combine: VGG16 and HOG vectors side by side, one row per image.
combined_vgg_hog = np.concatenate((vgg_feats, hog_feats), axis=1)
print("Combined features:", combined_vgg_hog.shape)



VGG16 features: (35059, 512)
HOG features: (35059, 8100)
Combined features: (35059, 8612)


In [5]:
# Save each feature matrix to its own .npy file in the working directory.
feature_files = {
    "vgg_features.npy": vgg_feats,
    "hog_features.npy": hog_feats,
    "combined_features.npy": combined_vgg_hog,
}
for filename, features in feature_files.items():
    np.save(filename, features)

print("Features saved to disk!")


Features saved to disk!


# RESNET-50 FEATURES

In [6]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
import numpy as np
import cv2

# 1) ResNet50 backbone without the classifier head; global average pooling
#    yields one 2048-d vector per image.
resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    pooling='avg',
    input_shape=(224, 224, 3),
)

# 2) Feature extraction, one batch at a time so that only one batch of
#    float32 3-channel images exists at a time.
batch_size = 32
resnet_chunks = []

for lo in range(0, len(X_gray), batch_size):
    # Slicing stops at the end of the array, so the last batch may be shorter.
    gray_chunk = X_gray[lo:lo + batch_size]

    # Resize every grayscale image to the network input size and replicate
    # it over three channels: (batch, 224, 224, 3), float32.
    rgb_chunk = np.stack([
        cv2.cvtColor(cv2.resize(gray, (224, 224)), cv2.COLOR_GRAY2BGR)
        for gray in gray_chunk
    ]).astype('float32')

    # Apply the ResNet50 preprocessing, then run the network on this batch only.
    resnet_chunks.append(resnet.predict(resnet_preprocess(rgb_chunk), verbose=0))  # (batch, 2048)

resnet_feats = np.vstack(resnet_chunks)  # (N, 2048)
print("ResNet-50 features shape:", resnet_feats.shape)
np.save('resnet_feats.npy', resnet_feats)

ResNet-50 features shape: (35059, 2048)


# GLCM FEATURES

In [7]:
from skimage.feature import graycomatrix, graycoprops
from skimage import exposure

# GLCM settings: 256 gray levels (matches 8-bit images), three pixel
# distances and four directions.
levels = 256
distances = [1, 2, 3]
angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]

# Texture properties read from each co-occurrence matrix, in output column order.
glcm_props = ('contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation')

glcm_rows = []
for gray in X_gray:
    # Stretch the image's own intensity range onto 0..levels-1 and store it
    # as uint8 for graycomatrix.
    stretched = exposure.rescale_intensity(
        gray, in_range='image', out_range=(0, levels - 1)
    ).astype(np.uint8)

    cooc = graycomatrix(
        stretched,
        distances=distances,
        angles=angles,
        levels=levels,
        symmetric=True,
        normed=True,
    )

    # One number per property: the mean of the values graycoprops returns
    # for the configured distances and angles.
    glcm_rows.append([graycoprops(cooc, prop).mean() for prop in glcm_props])

glcm_features = np.array(glcm_rows)
print("GLCM features:", glcm_features.shape)


GLCM features: (35059, 5)


In [8]:
# Persist the GLCM feature array to disk so it can be reloaded without recomputation.
np.save('glcm_features.npy', glcm_features)

# COMBINED 1 = ResNet + HOG, COMBINED 2 = ResNet + HOG + GLCM

In [10]:
# Option 1: ResNet + HOG — place the two 2-D feature matrices side by side
# (column-wise) and store the result.
combined_1 = np.hstack((resnet_feats, hog_feats))
np.save('ResNet_HOG.npy', combined_1)


In [9]:
# Option 2: ResNet + HOG + GLCM — place the three 2-D feature matrices side
# by side (column-wise) and store the result.
combined_2 = np.hstack((resnet_feats, hog_feats, glcm_features))
np.save('ResNet_HOG_GLCM.npy', combined_2)

# COMBINED ALL

In [11]:
# Merge features
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Sanity check: both feature arrays must still be in the kernel namespace.
print("vgg_feats" in locals())  # Will return True if the variable is present
print("hog_feats" in locals())  # Will return True if the variable is present

import numpy as np
# Concatenate every feature family column-wise: VGG16 + HOG + GLCM + ResNet-50.
# All four arrays must have one row per image, in the same order.
combined_all = np.concatenate([vgg_feats, hog_feats, glcm_features, resnet_feats], axis=1)
print("Combined feature shape:", combined_all.shape)
# Save with a proper .npy name. The previous name 'combined_all.py' carried a
# Python-source extension, and because np.save appends ".npy" when it is
# missing, the file on disk ended up as 'combined_all.py.npy'.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# the combined array is large, so this step may be slow — confirm it completes.
np.save('combined_all.npy', combined_all)

True
True


KeyboardInterrupt: 

In [None]:
#load features
#import numpy as np
#vgg_feats = np.load("vgg_features.npy")
#hog_feats = np.load("hog_features.npy")
#combined_features = np.load("combined_features.npy")

# The feature extraction methods (VGG, ResNet, HOG) produce 1D feature vectors by design.
# 1. Extract features from train and test sets
# train_feats = features from augmented training set
# test_feats = features from test set (no augmentation)

# 2. Scale both
#scaler = StandardScaler()
#train_scaled = scaler.fit_transform(train_feats)    # fit on train
#test_scaled = scaler.transform(test_feats)          # transform test using same scaling

# 3. PCA (or UMAP)(Optional)
#pca = PCA(n_components=100)
#train_pca = pca.fit_transform(train_scaled)         # fit PCA on train
#test_pca = pca.transform(test_scaled)               # apply same PCA to test

# 4. Train traditional ML models (SVM, Random Forest, XGBoost, Logistic Regression, etc.) on the feature vectors extracted from the training data.
# Evaluate on the test set:
# use the model trained above to predict on the test features,
# then compare accuracy, F1, etc.