# EDA using SIFT

SIFT identifies keypoints that are distinctive across an image’s width, height, and most importantly, scale. By considering scale, we can identify keypoints that will remain stable (to an extent) even when the template of interest changes size, when the image quality becomes better or worse, or when the template undergoes changes in viewpoint or aspect ratio. Moreover, each keypoint has an associated orientation that makes SIFT features invariant to template rotations. Finally, SIFT will generate a descriptor for each keypoint, a 128-length vector that allows keypoints to be compared. These descriptors are nothing more than a histogram of gradients computed within the keypoint’s neighborhood.

# Let's have a look at some images and their Keypoints

In [None]:
import pathlib
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import timeit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Locate the project root two levels above the notebook's working directory
curr_dir = os.getcwd()
parent_dir = pathlib.Path(curr_dir).parents[1]

# Both the annotation CSV and the image folder live under data/data_original
data_root = f"{parent_dir}/data/data_original"
filename = f"{data_root}/train.csv"
data_dir = f"{data_root}/train"

# Keep only the first row per image id, then preview the result
df = pd.read_csv(filename)
df_cell_compressed = df.drop_duplicates(subset='id')
df_cell_compressed.head(2)

# celltype shsy5y

In [None]:
img_path = '../../data/data_original/train/0030fd0e6378.png'

# cv2.imread reports an unreadable file by returning None rather than raising,
# so fail early with a clear message instead of a cryptic cvtColor error.
img1 = cv2.imread(img_path)
if img1 is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

# Detect keypoints and compute their descriptors with the default SIFT settings
sift = cv2.xfeatures2d.SIFT_create()
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)

# Render the keypoints once into a fresh array (outImage=None) so that img1
# keeps the untouched input and does not have to be read from disk again.
imag_1 = cv2.drawKeypoints(gray1, keypoints_1, None)

# Left: input image, right: greyscale image with the detected keypoints
fig, arr = plt.subplots(1, 2, figsize=(30, 30))
arr[0].imshow(img1)
arr[1].imshow(imag_1)

# celltype astrocytes

In [None]:
# SIFT settings for this experiment (passed positionally to SIFT_create below)
nfeatures = 500
nOctaveLayers = 42
contrastThreshold = 0.05
edgeThreshold = 0.025
sigma = 0.98

img_path = '../../data/data_original/train/0140b3c8f445.png'

# cv2.imread reports an unreadable file by returning None rather than raising,
# so fail early with a clear message instead of a cryptic cvtColor error.
img1 = cv2.imread(img_path)
if img1 is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

# Detect keypoints and compute their descriptors with the custom settings
sift = cv2.xfeatures2d.SIFT_create(nfeatures, nOctaveLayers, contrastThreshold, edgeThreshold, sigma)
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)

# Render the keypoints once into a fresh array (outImage=None) so that img1
# keeps the untouched input and does not have to be read from disk again.
imag_1 = cv2.drawKeypoints(gray1, keypoints_1, None)

# Left: input image, right: greyscale image with the detected keypoints
fig, arr = plt.subplots(1, 2, figsize=(30, 30))
arr[0].imshow(img1)
arr[1].imshow(imag_1)

# celltype cort

In [None]:
img_path = '../../data/data_original/train/01ae5a43a2ab.png'

# cv2.imread reports an unreadable file by returning None rather than raising,
# so fail early with a clear message instead of a cryptic cvtColor error.
img1 = cv2.imread(img_path)
if img1 is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

# Detect keypoints and compute their descriptors with the default SIFT settings
sift = cv2.xfeatures2d.SIFT_create()
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)

# Render the keypoints once into a fresh array (outImage=None) so that img1
# keeps the untouched input and does not have to be read from disk again.
imag_1 = cv2.drawKeypoints(gray1, keypoints_1, None)

# Left: input image, right: greyscale image with the detected keypoints
fig, arr = plt.subplots(1, 2, figsize=(30, 30))
arr[0].imshow(img1)
arr[1].imshow(imag_1)

# Feature Matching comparing images of different cell types

* We can compare images of different cell types and check whether they share matching keypoints.

In [None]:
# Image pair to compare. Other candidates from the train folder:
#   first image:  01ae5a43a2ab.png (cort)
#   second image: 01ae5a43a2ab.png (same cort), 026b3c2c4b32.png (cort),
#                 0140b3c8f445.png (astrocytes)
img_path = '../../data/data_original/train/085eb8fec206.png'  # astro
img_path2 = '../../data/data_original/train/0030fd0e6378.png'  # shsy5y

# Load both images and reduce them to a single grey channel
img1 = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2GRAY)
img2 = cv2.cvtColor(cv2.imread(img_path2), cv2.COLOR_BGR2GRAY)

# Keypoints and descriptors for each image (default SIFT settings)
sift = cv2.xfeatures2d.SIFT_create()
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)
keypoints_2, descriptors_2 = sift.detectAndCompute(img2, None)

# Brute-force matching with L1 distance; crossCheck keeps mutual best matches only
bf = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)
matches = sorted(bf.match(descriptors_1, descriptors_2), key=lambda m: m.distance)

# Draw the 10 closest matches side by side
img3 = cv2.drawMatches(img1, keypoints_1, img2, keypoints_2, matches[:10], img2, flags=2)
fig = plt.figure(figsize=(30, 60))
plt.imshow(img3), plt.show()

# Feature matching loop

* we go a step further and search for keypoint matches by looping over different images

In [None]:
# Query image whose keypoints are matched against every image in the train folder.
# Alternative query: '../../data/data_original/train/01ae5a43a2ab.png' (cort)
img_path = '../../data/data_original/train/085eb8fec206.png'  # astro

# Folder that is scanned for comparison images
img_path2 = '../../data/data_original/train/'

# Read the query image once and compute its descriptors
img1 = cv2.imread(img_path)
if img1 is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
sift = cv2.xfeatures2d.SIFT_create()
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)

# match() receives both descriptor sets on every call, so one matcher is reused
bf = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)

match_lst = []  # number of keypoint matches per successfully processed file
id_lst = []     # file names, aligned with match_lst

for item in os.listdir(img_path2):
    imgi = cv2.imread(img_path2 + item)
    if imgi is None:
        # Not a readable image: report the file name and move on
        print(item)
        continue
    try:
        imgi = cv2.cvtColor(imgi, cv2.COLOR_BGR2GRAY)
        keypoints_i, descriptors_i = sift.detectAndCompute(imgi, None)
        matches = bf.match(descriptors_1, descriptors_i)
    except cv2.error:
        # Only OpenCV failures are skipped (best effort); unlike a bare
        # `except`, this still lets KeyboardInterrupt stop the long loop.
        print(item)
        continue
    # Only the count is used, so the matches do not need to be sorted
    match_lst.append(len(matches))
    id_lst.append(item)

# Playing with parameters of SIFT


In [None]:
# SIFT settings to experiment with (passed positionally to SIFT_create below)
nfeatures = 2000
nOctaveLayers = 30
contrastThreshold = 0.1
edgeThreshold = 0.2
sigma = 0.88

# Image pair to compare. Other candidates from the train folder:
#   first image:  0140b3c8f445.png
#   second image: 01ae5a43a2ab.png (same cort), 026b3c2c4b32.png (cort),
#                 0030fd0e6378.png (shsy5y)
img_path = '../../data/data_original/train/085eb8fec206.png'  # astro
img_path2 = '../../data/data_original/train/0140b3c8f445.png'  # astrocytes

# Load both images and reduce them to a single grey channel
img1 = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2GRAY)
img2 = cv2.cvtColor(cv2.imread(img_path2), cv2.COLOR_BGR2GRAY)

# Keypoints and descriptors for each image with the settings above
sift = cv2.xfeatures2d.SIFT_create(nfeatures, nOctaveLayers, contrastThreshold, edgeThreshold, sigma)
keypoints_1, descriptors_1 = sift.detectAndCompute(img1, None)
keypoints_2, descriptors_2 = sift.detectAndCompute(img2, None)

# Brute-force matching with L1 distance; crossCheck keeps mutual best matches only
bf = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)
matches = sorted(bf.match(descriptors_1, descriptors_2), key=lambda m: m.distance)

# Draw up to 500 of the closest matches and report the total number found
img3 = cv2.drawMatches(img1, keypoints_1, img2, keypoints_2, matches[:500], img2, flags=2)
fig = plt.figure(figsize=(30, 60))
plt.imshow(img3)
plt.show()
print(len(matches))

# Classification with SIFT feature matching

As you can see in the dataframe above, which compares the number of keypoint matches (NoKM) for each image pair, NoKM could be useful for classification (spoiler: it indeed turns out to be the case).

### 1. First, we calculate keypoints / descriptors for each img as before

* nice to know: sift.detect finds keypoints, sift.compute computes the descriptors from the keypoint we have found. the command used here (sift.detectAndCompute), does both at the same time

### 1.1 Train and test splitting

In [None]:
# Fixed random seed so that results are reproducible
RSEED: int = 42

In [None]:
# Work on an explicit copy: assigning a column to a slice of df_cell_compressed
# would trigger pandas' SettingWithCopyWarning.
df_cell_compressed2 = df_cell_compressed[['id', 'cell_type']].copy()

# Turn every image id into the relative path of its PNG file
df_cell_compressed2['id'] = "../../data/data_original/train/" + df_cell_compressed2['id'] + ".png"

# Hold out 30% of the images (test_size=0.3) as test data; the fixed seed
# makes the split repeatable.
train, test, y_train, y_test = train_test_split(df_cell_compressed2,
                                                df_cell_compressed2.cell_type,
                                                test_size=0.3, random_state=RSEED)

# Persist both splits without header or index; columns are (image path, cell type)
train.to_csv('cells_train.csv', header=False, index=False)
test.to_csv('cells_test.csv', header=False, index=False)


In [None]:
# The split files were written without a header row, so name the columns here
split_columns = ["img_path", "cell_type"]
df_train = pd.read_csv('cells_train.csv', names=split_columns)
df_test = pd.read_csv('cells_test.csv', names=split_columns)

### 2.1 keypoint detection for the train set

In [None]:
# SIFT settings for the classification experiment (passed positionally below)
nfeatures = 500
nOctaveLayers = 42
contrastThreshold = 0.05
edgeThreshold = 0.1
sigma = 1

sift = cv2.xfeatures2d.SIFT_create(nfeatures, nOctaveLayers, contrastThreshold, edgeThreshold, sigma)

# One entry per train image, all four lists stay aligned by position
train_img_lst, train_cell_lst = [], []
train_keypoint_lst, train_descriptor_lst = [], []

for path_i, cell_i in zip(df_train.img_path, df_train.cell_type):
    # Load the image as greyscale and run SIFT on it
    gray_i = cv2.cvtColor(cv2.imread(path_i), cv2.COLOR_BGR2GRAY)
    kp_i, desc_i = sift.detectAndCompute(gray_i, None)

    train_img_lst.append(path_i)
    train_cell_lst.append(cell_i)
    train_keypoint_lst.append(kp_i)
    train_descriptor_lst.append(desc_i)

# Collect everything in one dataframe: path, label, keypoints, descriptors
key_train_df = pd.DataFrame()
key_train_df['img'] = train_img_lst
key_train_df['cell_type'] = train_cell_lst
key_train_df['keypoints'] = train_keypoint_lst
key_train_df['descriptors'] = train_descriptor_lst

In [None]:
# Show the first row to inspect the collected columns
key_train_df.head(1)

### 2.2 keypoint detection for the test set

In [None]:
# One entry per test image, all four lists stay aligned by position.
# Reuses the `sift` detector created for the train set.
test_img_lst, test_cell_lst = [], []
test_keypoint_lst, test_descriptor_lst = [], []

for path_i, cell_i in zip(df_test.img_path, df_test.cell_type):
    # Load the image as greyscale and run SIFT on it
    gray_i = cv2.cvtColor(cv2.imread(path_i), cv2.COLOR_BGR2GRAY)
    kp_i, desc_i = sift.detectAndCompute(gray_i, None)

    test_img_lst.append(path_i)
    test_cell_lst.append(cell_i)
    test_keypoint_lst.append(kp_i)
    test_descriptor_lst.append(desc_i)

# Collect everything in one dataframe: path, label, keypoints, descriptors
key_test_df = pd.DataFrame()
key_test_df['img'] = test_img_lst
key_test_df['cell_type'] = test_cell_lst
key_test_df['keypoints'] = test_keypoint_lst
key_test_df['descriptors'] = test_descriptor_lst

In [None]:
# Show the first row to inspect the collected columns
key_test_df.head(1)

### 3. Calculating NoKM for each image from the test set with every image of the train set

* How do we use that to classify:
Let's say image "x" (from the test set) has 200 keypoints. For each image in the train set, we check whether it has matching keypoints, count those keypoint matches, and put the count into a list. After iterating through all images in the train set, we sort the list so that the highest NoKM is at the top of a dataframe, in which the corresponding cell type of the train image is also saved.

* Now, we hold a vote among the top 29 images. The cell type that occurs most often defines the label of the test image.



In [None]:
# Classify every test image by a majority vote over the train images that
# share the most keypoint matches with it.
y_pred = []    # per test image: list of the most frequent cell type(s) among the top matches
y_actual = []  # per test image: true cell type
j_start = 0
j_end = key_test_df.shape[0]

# match() receives both descriptor sets on every call, so one matcher is reused
# instead of being rebuilt for every image pair.
bf = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)

for j in range(j_start, j_end):
    descriptors_j = key_test_df.descriptors[j]
    start = timeit.default_timer()

    match_lst = []
    cell_lst = []
    for i in range(0, key_train_df.shape[0]):
        # Only the number of matches is needed, so they are not sorted
        matches = bf.match(descriptors_j, key_train_df.descriptors[i])
        match_lst.append(len(matches))
        cell_lst.append(df_train.cell_type[i])

    # Rank the train images by number of matches and keep the 29 best
    new_df = pd.DataFrame()
    new_df['keypoint_matches'] = match_lst
    new_df['cell_type'] = cell_lst
    new_df = new_df.sort_values('keypoint_matches', ascending = False).head(29)

    # mode() yields every cell type tied for the highest count, so the
    # prediction is stored as a list (more than one entry means a tie).
    voted = list(new_df.cell_type.mode())
    y_pred.append(voted)
    y_actual.append(df_test.cell_type[j])

    # Progress "<current> von <total>" ('von' = 'of'); the total is j_end
    print(j+1, 'von', j_end)
    print('actual cell_type:', df_test.cell_type[j])
    print('predicted cell_type:', voted)
    # Time measurement for this single prediction
    stop = timeit.default_timer()
    print('This prediction took', stop - start, 'seconds')
    print('---'*10)

# Summary of actual versus predicted labels for all test images
sum_df = pd.DataFrame()
sum_df['actual'] = y_actual
sum_df['predicted'] = y_pred
sum_df

## 3. corresponding Confusion Matrix

In [None]:
# Keep only rows with a distinct vote. `predicted` holds the list returned by
# mode(); more than one entry means a tie. Filtering on the list length
# replaces hard-coded row positions, which only fit one particular run.
a = sum_df[sum_df.predicted.map(len) == 1]
y_test = list(a.actual)
# Unwrap the single-element lists so the metrics receive plain labels
y_pred = [votes[0] for votes in a.predicted]

cfm = confusion_matrix(y_test, y_pred)
sns.heatmap(cfm, cmap='YlGnBu', annot=True, fmt='d', linewidths=.5);

report_dt = classification_report(y_test, y_pred)
print(report_dt)

In [None]:
# Store the evaluated predictions (with a header row, without the index)
a.to_csv('summary_sift_classification.csv', index=False)

# Summary

The classification accuracy of this (very straightforward) method is surprisingly high (0.89). Unfortunately, we misclassify 54% of the astrocytes as shsy5y. This seems to be a limitation of this method when it comes to these microscopic images. Other SIFT parameters lead to worse results.
The drawback of this method is the running time of the algorithm; especially the calculation of the keypoints takes several minutes on just 600 images. Therefore, we skip using this method for an increased dataset.