In [1]:
import pandas as pd
import cv2 as cv
import matplotlib.pyplot as plt

from PIL import Image
import numpy as np
import os
from tqdm import tqdm # for loading progress

import seaborn as sns #  for plotting
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC, LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

#from KOC.koc.src.elpv_reader import load_dataset

from keras.applications.vgg19 import VGG19, preprocess_input
from keras.preprocessing import image
from keras.models import Model



# 0. Setup

In [4]:
# Reduce the dataset while maintaining correct distributions of samples from the original
def reduce_dataset(df, fraction=1.0):
    """Return a stratified subsample of ``df``.

    Rows are grouped by their ``(type, proba)`` pair and
    ``int(group_size * fraction)`` rows are drawn from every group with
    ``random_state=1``, so the reduced frame keeps approximately the same
    group proportions as the input.

    Args:
        df: DataFrame containing at least the columns ``'type'`` and ``'proba'``.
        fraction: Share of each group to keep. ``1.0`` returns ``df`` itself
            (the same object, not a copy).

    Returns:
        A DataFrame with the same columns as ``df`` and the original index
        labels of the selected rows. Groups whose scaled size truncates to
        zero contribute no rows; if that happens for every group an empty
        frame with ``df``'s columns is returned.
    """
    if fraction == 1.0:
        return df

    # Collect the per-group samples and concatenate once at the end. Growing a
    # frame with pd.concat inside the loop is quadratic, and starting from an
    # empty (object-dtype) frame can degrade the column dtypes of the result.
    sampled_groups = []
    for _, group_df in df.groupby(['type', 'proba']):
        size = int(len(group_df) * fraction)  # truncate, like .astype(int)
        if size > 0:
            sampled_groups.append(group_df.sample(n=size, random_state=1))

    if not sampled_groups:
        # Nothing was sampled: keep the schema but no rows.
        return df.iloc[0:0].copy()

    return pd.concat(sampled_groups)

In [5]:
# Make the dataset's helper module importable. The path is relative to the
# notebook's working directory.
ELPV_UTILS_DIR = '../../elpv-dataset/utils'
if ELPV_UTILS_DIR not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(ELPV_UTILS_DIR)
from elpv_reader import load_dataset

# load_dataset() yields three parallel collections; downstream cells use
# images.shape[0] as the sample count and put `types` / `probas` in a frame.
images, probas, types = load_dataset()

In [6]:


# One integer id per image: the position of the image in `images`.
img_ids = np.arange(images.shape[0])
# Metadata frame used for stratified reduction and, later, for the labels.
df = pd.DataFrame({'img_id': img_ids, 'type': types, 'proba': probas})
reduced_df = reduce_dataset(df, 1.0) # fraction=1.0 keeps the full dataset (pass e.g. 0.5 to keep ~50%)

# 1. Feature Extraction

## 1.1 Keypoint Detection

### 1.1.1 Dense Sampling

In [7]:
class DenseSampler:
    """Dense keypoint sampler: one keypoint at the centre of every grid cell.

    Args:
        grid_size: Side length of a square grid cell in pixels. It is also
            passed to ``cv.KeyPoint`` as the keypoint size.
    """

    def __init__(self, grid_size=55):
        self.grid_size = grid_size

    def detect(self, image):
        """Lay a regular grid over ``image`` and return the cell centres.

        Only cells that fit completely inside the image are used (integer
        division of each image dimension by ``grid_size``).

        Args:
            image: Array whose first two axes are (rows, columns).

        Returns:
            Tuple of ``cv.KeyPoint`` in row-major order (left to right, then
            top to bottom). Empty if the image is smaller than one cell.
        """
        assert image is not None
        # Rows and columns are handled separately so that non-square images
        # are sampled correctly; for square images this equals using shape[0]
        # for both axes.
        height, width = image.shape[:2]
        step = self.grid_size
        half = step / 2
        n_cells_x = width // step
        n_cells_y = height // step

        # Calculate the centers of each grid cell as the keypoint
        centers = tuple(cv.KeyPoint(x * step + half, y * step + half, step)
                        for y in range(n_cells_y) for x in range(n_cells_x))

        return centers # cv.KeyPoint

In [8]:
dsampler = DenseSampler(grid_size=60)  # grid size reported as ideal in the paper

# Sample keypoints for every image id of the (possibly reduced) dataset ...
image_keypoints = {}
for img_id in tqdm(reduced_df['img_id'], desc="Densley sampling image keypoints"):
    image_keypoints[img_id] = dsampler.detect(images[img_id])
# ... and drop the images for which no keypoints were produced.
image_keypoints = {img_id: kps for img_id, kps in image_keypoints.items() if kps}

# For each image with keypoints, record (first_row, row_count) of its slice in
# the flat descriptor matrix; slices are laid out in ascending img_id order.
image_kp_offsets = {}
offset = 0
for img_id in img_ids:
    if img_id not in image_keypoints:
        continue
    num_kps = len(image_keypoints[img_id])
    image_kp_offsets[img_id] = (offset, num_kps)
    offset += num_kps
total_descriptors = offset

Densley sampling image keypoints: 100%|██████████| 2624/2624 [00:00<00:00, 92046.82it/s]


## 1.2 Feature description

### 1.2.1 SIFT descriptor

In [9]:
sift = cv.SIFT_create()

descriptor_size = sift.descriptorSize()
print('total #descriptors:',total_descriptors, 'descriptor size:', descriptor_size)

# Flat matrix holding the descriptors of all images back to back; the slice of
# each image is given by image_kp_offsets[img_id] = (first_row, row_count).
# The width comes from the extractor instead of a hard-coded 128.
all_descriptors = np.empty((total_descriptors, descriptor_size), dtype=np.float64)

for img_id, keypoints in image_keypoints.items():
    start, num_desc = image_kp_offsets[img_id]
    end = start + num_desc
    _, img_descriptors = sift.compute(images[img_id], keypoints)
    # compute() may return fewer descriptors than keypoints; without this
    # check a single returned row would be silently broadcast over the slice.
    if img_descriptors is None or len(img_descriptors) != num_desc:
        raise ValueError(
            f'image {img_id}: expected {num_desc} descriptors, got '
            f'{0 if img_descriptors is None else len(img_descriptors)}')
    all_descriptors[start: end, :] = img_descriptors

print('all_desc.shape', all_descriptors.shape)

total #descriptors: 65600 descriptor size: 128
all_desc.shape (65600, 128)


# 2. Encoding

In [10]:
# Codebook creation

def encode_descriptors(all_descriptors, num_subsets=5, k=20, subset_divisor=4,
                       power=0.5, random_state=None):
    """Create the VLAD codebooks by subset clustering and return an encoder.

    ``num_subsets`` random subsets of ``all_descriptors`` are drawn (each of
    size ``len(all_descriptors) // subset_divisor``, without replacement) and a
    separate ``MiniBatchKMeans`` with ``k`` clusters is fitted on each one.
    Descriptors of training samples only should be passed in, otherwise the
    codebooks are fitted on data that is later used for evaluation.

    Args:
        all_descriptors: 2-D array, one local descriptor per row.
        num_subsets: Number of independently clustered subsets (codebooks).
        k: Number of clusters per codebook.
        subset_divisor: Each subset holds ``len(all_descriptors) // subset_divisor``
            descriptors.
        power: Exponent of the signed power normalisation applied to the
            accumulated residuals.
        random_state: Seed for drawing the subsets. ``None`` (default) uses
            NumPy's global random state, as before.

    Returns:
        ``vlad_encode(descriptors)``: a function mapping the descriptors of a
        single image to an array of shape ``(1, num_subsets * k * d)``, where
        ``d`` is the descriptor length.
    """
    subset_size = len(all_descriptors) // subset_divisor
    # np.random (module) and RandomState expose the same .choice API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    print(all_descriptors.shape)

    codebooks = []
    for i in range(num_subsets):
        subset_indices = rng.choice(len(all_descriptors), subset_size, replace=False)
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=i, batch_size=256*24) # 256*#cores
        kmeans.fit(all_descriptors[subset_indices])
        codebooks.append(kmeans)

    def vlad_encode(descriptors):
        """Return the VLAD encoding for the descriptors of a single image."""
        per_codebook = []

        for kmeans_ in codebooks:
            cluster_assignments = kmeans_.predict(descriptors)

            # Sum, per cluster, the residuals between each descriptor and the
            # centre it was assigned to. np.add.at accumulates correctly when
            # the same cluster index occurs more than once.
            residuals = descriptors - kmeans_.cluster_centers_[cluster_assignments]
            vlad_vector_ = np.zeros((k, descriptors.shape[1]))
            np.add.at(vlad_vector_, cluster_assignments, residuals)

            # Signed power normalisation followed by L2 normalisation.
            vlad_vector_ = np.sign(vlad_vector_) * np.abs(vlad_vector_) ** power
            vlad_vector_ = normalize(vlad_vector_.reshape(1, -1), axis=1, norm='l2') # (1, Kd)
            per_codebook.append(vlad_vector_)

        vlad_vector = np.hstack(per_codebook) # (1, mKd)

        # NOTE(review): the PCA is fitted on this single vector reshaped to
        # (mKd, 1), i.e. one feature with mKd samples, so no projection is
        # learned across images. Fitting it once on the training-set VLAD
        # vectors is presumably what was intended - confirm before changing,
        # since that would alter the reported results.
        pca = PCA(whiten=True)
        vlad_vector = pca.fit_transform(vlad_vector.reshape(-1, 1))
        vlad_vector = normalize(vlad_vector.reshape(1, -1), axis=1, norm='l2')

        return vlad_vector

    return vlad_encode

class Descriptors:
    """Access per-image descriptors stored in one flat matrix and VLAD-encode them.

    Args:
        desc_offsets: Mapping ``img_id -> (first_row, row_count)`` locating the
            descriptors of an image inside ``all_descriptors``.
        all_descriptors: 2-D array with the descriptors of all images.
        encoder: Factory that takes the stacked training descriptors and
            returns a function encoding the descriptors of one image.
    """

    def __init__(self, desc_offsets, all_descriptors, encoder):
        self.desc_offsets = desc_offsets
        self.all_descriptors = all_descriptors
        # Keep the factory separately so init_encoder() can be called again
        # (e.g. with a different training split) without breaking the object.
        self._encoder_factory = encoder
        self.encoder = encoder
        self.encoder_is_init = False

    def get_by_id(self, img_id):
        """Return the descriptor rows belonging to ``img_id``."""
        off, num_desc = self.desc_offsets[img_id]
        end = off + num_desc
        return self.all_descriptors[off: end, :]

    def get_by_ids(self, img_ids):
        """Returns:
            descriptors grouped by the order of id list
        """
        return [self.get_by_id(img_id) for img_id in img_ids]

    def init_encoder(self, all_train_descriptors):
        """Build the encoder from the training descriptors.

        May be called repeatedly; each call rebuilds the encoder with the
        factory passed to the constructor.
        """
        self.encoder = self._encoder_factory(all_train_descriptors)
        self.encoder_is_init = True

    def _get_vlad_vector(self, id):
        """Encode the descriptors of a single image.

        Raises:
            RuntimeError: if ``init_encoder`` has not been called yet.
        """
        if not self.encoder_is_init:
            raise RuntimeError("initiliase encoder first.")

        descriptors = self.get_by_id(id)
        vlad_vector = self.encoder(descriptors)
        return vlad_vector


    def get_all_vlad_vectors(self, img_ids):
        """Return the encodings of ``img_ids`` stacked row-wise, in input order."""
        return np.vstack([self._get_vlad_vector(img_id) for img_id in img_ids])

# 3. Classification

## 3.1 SVM

In [11]:
# Run the SVM classifier on the VLAD-encoded descriptors.

# X holds image ids (not feature vectors); the VLAD vectors are built from the
# ids after the train/test split so the codebook only sees training data.
X = sorted(image_keypoints)
# Look the labels up by img_id in exactly the order of X, so the alignment does
# not depend on the frame index happening to coincide with img_id.
y = reduced_df.set_index('img_id').loc[X, 'proba'].to_numpy(dtype=float)

y_binary = np.where(y > 0, 1, 0)

# weights are:
# 1.0         (class 0) functional
# 0.33,0.67,1 (class 1) defective
# This is a binary classification problem
sample_weights = np.where((y == 0) | (y == 1), 1, y)

X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y_binary,
    sample_weights,   # the certainty of the class label
    test_size=0.25,   # from the paper 75/25 split
    random_state=42,
    stratify=y_binary # ensure balanced functional/defective samples
  )

# Inverse proportion heuristic derived from King and Zeng (2001) from the paper:
# n_samples / (n_classes * n_samples_in_class). The parentheses matter:
# `n / 2*count` evaluates as `(n / 2) * count`.
# Kept for reference only; the classifier below uses class_weight='balanced'.
class_weights = {
    0: y_train.size / (2 * np.sum(y_train == 0)),
    1: y_train.size / (2 * np.sum(y_train == 1))
}

descriptors = Descriptors(image_kp_offsets, all_descriptors, encode_descriptors)
X_train_descriptors = np.vstack(descriptors.get_by_ids(X_train))

# ===============================================================================
# Create the VLAD codebook with subsets of the descriptors training set
# ===============================================================================
descriptors.init_encoder(X_train_descriptors)

# ===============================================================================
# Create VLAD vector for each image in the training set
# with its descriptors using the created codebook
# ===============================================================================
all_vlad_vectors = descriptors.get_all_vlad_vectors(X_train)

# ===============================================================================
# Initialize and train the SVM classifier using the vlad vectors created
#  for each image
# ===============================================================================
clf = LinearSVC(class_weight='balanced', C=1)#, max_iter=10000)
clf.fit(all_vlad_vectors, y_train, sample_weight=sw_train)

# ===============================================================================
# Create VLAD vector for each image in the test set
# ===============================================================================
all_vlad_vectors = descriptors.get_all_vlad_vectors(X_test)

y_pred = clf.predict(all_vlad_vectors)

# Print the classification report
print(classification_report(y_test, y_pred, sample_weight=sw_test))

(49200, 128)
              precision    recall  f1-score   support

           0       0.74      0.83      0.79     377.0
           1       0.64      0.51      0.56 219.00000000000006

    accuracy                           0.71     596.0
   macro avg       0.69      0.67      0.68     596.0
weighted avg       0.71      0.71      0.70     596.0



## 3.2 Random Forest Classifier

In [12]:
# X holds image ids (not feature vectors); the VLAD vectors are built from the
# ids after the train/test split so the codebook only sees training data.
X = sorted(image_keypoints)
# Look the labels up by img_id in exactly the order of X, so the alignment does
# not depend on the frame index happening to coincide with img_id.
y = reduced_df.set_index('img_id').loc[X, 'proba'].to_numpy(dtype=float)

y_binary = np.where(y > 0, 1, 0)

# weights are:
# 1.0         (class 0) functional
# 0.33,0.67,1 (class 1) defective
# This is a binary classification problem
sample_weights = np.where((y == 0) | (y == 1), 1, y)

X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y_binary,
    sample_weights,   # the certainty of the class label
    test_size=0.25,   # from the paper 75/25 split
    random_state=42,
    stratify=y_binary # ensure balanced functional/defective samples
  )

# Inverse proportion heuristic derived from King and Zeng (2001) from the paper:
# n_samples / (n_classes * n_samples_in_class). The parentheses matter:
# `n / 2*count` evaluates as `(n / 2) * count`.
# Kept for reference only; the classifier below uses class_weight='balanced'.
class_weights = {
    0: y_train.size / (2 * np.sum(y_train == 0)),
    1: y_train.size / (2 * np.sum(y_train == 1))
}

descriptors = Descriptors(image_kp_offsets, all_descriptors, encode_descriptors)
X_train_descriptors = np.vstack(descriptors.get_by_ids(X_train))

# ===============================================================================
# Create the VLAD codebook with subsets of the descriptors training set
# ===============================================================================
descriptors.init_encoder(X_train_descriptors)

# ===============================================================================
# Create VLAD vector for each image in the training set
# with its descriptors using the created codebook
# ===============================================================================
all_vlad_vectors = descriptors.get_all_vlad_vectors(X_train)

# ===============================================================================
# Initialize and train the Random Forest classifier using the vlad vectors
#  created for each image
# ===============================================================================
# random_state is fixed so the forest itself is reproducible between runs.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(all_vlad_vectors, y_train, sample_weight=sw_train)


# ===============================================================================
# Create VLAD vector for each image in the test set
# ===============================================================================
all_vlad_vectors = descriptors.get_all_vlad_vectors(X_test)

y_pred = clf.predict(all_vlad_vectors)

# Print the classification report
print(classification_report(y_test, y_pred, sample_weight=sw_test))

(49200, 128)
              precision    recall  f1-score   support

           0       0.82      0.86      0.84     377.0
           1       0.74      0.67      0.70 219.00000000000006

    accuracy                           0.79     596.0
   macro avg       0.78      0.77      0.77     596.0
weighted avg       0.79      0.79      0.79     596.0

