# Required libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%run -i ../scripts/fyp2021p3_group08_functions.py
import math
from skimage import transform
from skimage import morphology
import skimage
import seaborn as sns

## Loading in an image

In [None]:
# Load one example lesion photo and its segmentation mask
im = plt.imread('../data/example_image/ISIC_0001769.jpg')
seg = plt.imread('../data/example_segmentation/ISIC_0001769_segmentation.png')
# Print the array dimensions of the photo
print(im.shape)
# Display the mask in grayscale
plt.imshow(seg, cmap = "gray")

In [None]:
# Show only the channel at index 1 of the photo, rendered in grayscale
plt.imshow(im[:,:,1], cmap = "gray")

# How many images are we dealing with

In [None]:
# Read the ground-truth labels of the example images and display the table
table = pd.read_csv("../data/example_ground_truth.csv")
table

### Separating melanoma, keratosis and healthy images

In [None]:
# Image ids whose "melanoma" label is 1.0
id_melanoma = table["image_id"][table["melanoma"] == 1.0]
# Image ids whose "seborrheic_keratosis" label is 1.0
id_keratosis = table["image_id"][table["seborrheic_keratosis"] == 1.0]
# Image ids where both labels are 0.0 (neither melanoma nor keratosis)
id_healthy = table["image_id"][(table["melanoma"] == 0.0) & (table["seborrheic_keratosis"] == 0.0)]

In [None]:
# Number of images per group, printed in the order keratosis, melanoma, healthy
print(len(id_keratosis))
print(len(id_melanoma))
print(len(id_healthy))

### Showing pictures from the lists of either sick or healthy lesions

In [None]:
# Load the photo and mask of the healthy lesion at position 60 of id_healthy
img3 = plt.imread(f'../data/example_image/{list(id_healthy)[60]}.jpg')
#img3 = plt.imread(f'../data/example_image/ISIC_0015243.jpg')
seg3 = plt.imread(f'../data/example_segmentation/{list(id_healthy)[60]}_segmentation.png')
#seg3 = plt.imread(f'../data/example_segmentation/ISIC_0015243_segmentation.png')
# Show the photo (left) next to its mask (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))
axes[0].imshow(img3)
axes[1].imshow(seg3, cmap='gray')

## Zooming in on the lesion

In [None]:
def crop_to_lesion(picture, segment):
    """
    "Zooms" in on the lesion by cropping to the bounding box of the mask.

    A pixel counts as lesion when its mask value equals exactly 1.0. The
    bounding box is inclusive: the last row and the last column that contain
    a lesion pixel are part of the crop.

    Takes a picture and its corresponding mask/segment (a 2-D array)
    Returns the tuple (cropped picture, cropped segment). If the mask has no
    pixel equal to 1.0, both crops are empty (0 x 0 slices).
    """
    lesion = np.asarray(segment) == 1.0

    # Indices of the columns / rows that hold at least one lesion pixel
    cols = np.flatnonzero(lesion.any(axis=0))
    rows = np.flatnonzero(lesion.any(axis=1))

    # No lesion pixel at all: there is nothing to crop to
    if cols.size == 0:
        return (picture[0:0, 0:0], segment[0:0, 0:0])

    x_1, x_2 = cols[0], cols[-1]
    y_1, y_2 = rows[0], rows[-1]

    # Slice ends are exclusive, so add 1 to keep the last lesion row/column
    img_part = picture[y_1:y_2 + 1, x_1:x_2 + 1]
    segment_part = segment[y_1:y_2 + 1, x_1:x_2 + 1]

    return (img_part, segment_part)

In [None]:
# Load the photo and mask of the healthy lesion at position 50 of id_healthy
img3 = plt.imread(f'../data/example_image/{list(id_healthy)[50]}.jpg')
seg3 = plt.imread(f'../data/example_segmentation/{list(id_healthy)[50]}_segmentation.png')
# Crop both to the lesion and show the results side by side
cropped_img, cropped_seg = crop_to_lesion(img3, seg3)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))
axes[0].imshow(cropped_img)
axes[1].imshow(cropped_seg, cmap='gray')

## Manipulating the picture

In [None]:
def length(cropped_picture, xy1, xy2):
    """
    Calculates the straight-line distance between two points on the lesion. (Pythagoras)

    Takes a cropped mask and two coordinate tuples; each tuple is used directly
    as an index into the mask array
    Returns the Euclidean distance between the two coordinates if the mask value
    at both of them equals 1.0, otherwise an explanatory message string
    """

    img_np = np.array(cropped_picture)

    if img_np[xy1] == 1.0 and img_np[xy2] == 1.0:
        # Differences along the second and the first tuple component
        delta_second = xy2[1] - xy1[1]
        delta_first = xy2[0] - xy1[0]
        return (delta_second**2+delta_first**2)**0.5

    return "Insert valid coordinates on the lesion"
    
# Example call on the cropped mask produced by the crop_to_lesion cell
length(cropped_seg, (300,100), (400,200))

In [None]:
def edge_points(segment):
    """
    Collects the coordinates of every pixel on the border of the lesion.

    The mask is eroded with a disk of radius 1 and the eroded mask is
    subtracted from the original mask; pixels where the difference equals
    exactly 1.0 are reported as border pixels.

    Takes a segment
    Returns a list of (column, row) tuples, ordered column by column
    """
    footprint = morphology.disk(1)
    shrunk = morphology.binary_erosion(segment, footprint)
    border = np.asarray(segment - shrunk)

    # Transposing makes argwhere walk the columns in the outer loop and the
    # rows in the inner loop, so every hit comes out as (column, row).
    hits = np.argwhere(border.T == 1.0)

    return [(int(col), int(row)) for col, row in hits]

In [None]:
def rescale_img(segment, scale):
    """
    Resizes an image by the factor scale / (current width).

    Takes a segment/mask and the wanted width
    Returns the rescaled segment/mask (rescaled without anti-aliasing)
    """
    factor = scale / segment.shape[1]
    return skimage.transform.rescale(segment, factor, anti_aliasing=False)

In [None]:
def longest_diameter(segment):
    """
    Calculates the longest distance among all the edge points. Takes quadratic
    time since every edge point is compared with every edge point, but only the
    current best candidate is kept in memory.

    Takes a segment
    Returns the tuple (distance rounded to 2 decimals, (point_a, point_b)) for
    the pair of edge points that are farthest apart
    Raises ValueError if the segment has no edge points
    """
    edge_coords = edge_points(segment)
    if not edge_coords:
        raise ValueError("segment has no edge points, cannot compute a diameter")

    # Candidates are compared as whole (distance, (point_a, point_b)) tuples,
    # so pairs with the same rounded distance are ordered by the point pair.
    return max(
        (round(((a[1] - b[1])**2 + (a[0] - b[0])**2)**0.5, 2), (a, b))
        for a in edge_coords
        for b in edge_coords
    )

In [None]:
def show_longest_diameter(img, segment):
    """
    Draws the longest diameter of the lesion on top of the picture and the mask.

    Both the picture and the mask are rescaled to a width of 500 so that the
    diameter, which is computed on the rescaled mask, lines up with both plots.

    Takes a picture and its corresponding segment
    Returns the matplotlib figure with the two plots
    """
    # NOTE(review): rescale_img passes the array straight to
    # skimage.transform.rescale; for a colour picture this may also rescale
    # the channel axis depending on the skimage version - confirm.
    cropped_img = rescale_img(img, 500)
    cropped_segment = rescale_img(segment, 500)

    max_dia = longest_diameter(cropped_segment)
    # Each end point is used as (x, y) for plotting
    (x_a, y_a), (x_b, y_b) = max_dia[1]
    x = [x_a, x_b]
    y = [y_a, y_b]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))
    axes[0].imshow(cropped_img)
    axes[0].plot(x, y, color="red", linewidth=1)
    axes[1].imshow(cropped_segment, cmap='gray')
    axes[1].plot(x, y, color="red", linewidth=1)
    return fig

### Test

In [None]:
# Rescale the current mask to a width of 500 and compute its longest diameter
scaled_img = rescale_img(seg3, 500)
longest_diameter(scaled_img)

In [None]:
# Load the photo and mask of the melanoma lesion at position 23 of id_melanoma
img3 = plt.imread(f'../data/example_image/{list(id_melanoma)[23]}.jpg')
#img3 = plt.imread(f'../data/example_image/ISIC_0015243.jpg')
seg3 = plt.imread(f'../data/example_segmentation/{list(id_melanoma)[23]}_segmentation.png')
#seg3 = plt.imread(f'../data/example_segmentation/ISIC_0015243_segmentation.png')
# Show the photo (left) next to its mask (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))
axes[0].imshow(img3)
axes[1].imshow(seg3, cmap='gray')

# Getting a feature

## Compactness

In [None]:
def compactness(segment):
    """
    Calculates the compactness of a lesion as perimeter**2 / (4 * pi * area).

    Takes a segment
    Returns the compactness value
    """
    # NOTE(review): measure_area_perimeter is not defined in this notebook;
    # presumably it comes from the functions script loaded with %run - confirm.
    A, l = measure_area_perimeter(segment)
    C = l**2/(4*math.pi*A)
    return C

In [None]:
# #Collects the area and the perimeter of every melanoma lesion and every healthy lesion

# melanoma = [[],[]]
# not_melanoma = [[],[]]

# for i in range(len(id_melanoma)):
#     seg = plt.imread(f'../data/example_segmentation/{list(id_melanoma)[i]}_segmentation.png')
#     A, l = measure_area_perimeter(seg)
#     melanoma[0].append(A)
#     melanoma[1].append(l)
    
# for i in range(len(id_healthy)):
#     seg = plt.imread(f'../data/example_segmentation/{list(id_healthy)[i]}_segmentation.png')
#     A, l = measure_area_perimeter(seg)
#     not_melanoma[0].append(A)
#     not_melanoma[1].append(l)
    
# print(melanoma, not_melanoma)

In [None]:
# NOTE: `melanoma` and `not_melanoma` are only filled by the commented-out
# loop in the previous cell; that loop has to be run before this plot works.
# Index 0 holds the areas and index 1 the perimeters.
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.scatter(melanoma[0], melanoma[1], color='r')
ax.scatter(not_melanoma[0], not_melanoma[1], color='b')
ax.set_xlabel('Area')
ax.set_ylabel('Perimeter')
ax.set_title('scatter plot')
plt.show()

## Symmetry

In [None]:
def tilt_image(segment):
    """
    Rotates a segment so that its longest diameter ends up vertical.
    Slow on a full-size mask; rescale the mask before calling this.

    Takes a segment
    Returns the rotated segment
    """
    _, (xy1, xy2) = longest_diameter(segment)

    # Vector along the longest diameter and a reference vector along the
    # second coordinate axis
    diameter = (xy2[0]-xy1[0], xy2[1]-xy1[1])
    reference = (0,100)

    dot = diameter[0]*reference[0]+diameter[1]*reference[1]
    norms = math.sqrt(diameter[0]**2+diameter[1]**2)*math.sqrt(reference[0]**2+reference[1]**2)

    # Angle between the two vectors, taken from the cosine formula
    angle = 180-math.degrees(math.acos(dot/norms))

    return transform.rotate(segment, 360-angle)

In [None]:
#Calls tilt_image with a mask that has been rescaled to width 500, and the result is then cropped to the lesion only
# NOTE(review): dim_of_mark is not defined in this notebook; presumably it is
# provided by the %run functions script or is an older name of crop_to_lesion - confirm.
plt.imshow(dim_of_mark(img3, tilt_image(rescale_img(seg3, 500)))[1], cmap = "gray")

In [None]:
def symmetric(seg):
    """
    Scores how "symmetric" a segment is in two orientations.

    The mask is rescaled to width 500, tilted with tilt_image and cropped to
    the lesion. The crop is rotated by 180 degrees, its columns are reversed
    and the result is subtracted from the crop; pixels where the difference is
    exactly 1.0 or -1.0 count as asymmetric. The same is repeated after
    turning the tilted mask by a further 90 degrees. Each count is divided by
    the number of 1.0 pixels in the first crop.

    Takes a segment
    Returns the tuple (vertical symmetry, horisontal symmetry); each value is
    the absolute value of 1 - asymmetric / total
    """

    def mismatch_count(mask):
        # Rotating by 180 degrees and then reversing the columns mirrors the
        # mask top-to-bottom; count the pixels that differ from the original.
        mirrored = transform.rotate(mask, 180)[:, ::-1]
        difference = mask - mirrored
        return len(difference[difference == -1.0]) + len(difference[difference == 1.0])

    upright = tilt_image(rescale_img(seg, 500))

    # First orientation: the tilted mask cropped to the lesion
    vertical_seg = crop_to_lesion(seg, upright)[1]
    vertical_asymmetry = mismatch_count(vertical_seg)

    # Second orientation: the tilted mask turned by 90 degrees, then cropped
    sideways_seg = crop_to_lesion(seg, transform.rotate(upright, 90))[1]
    horisontal_asymmetry = mismatch_count(sideways_seg)

    # Both scores are normalised by the lesion size of the first crop
    total = len(vertical_seg[vertical_seg == 1.0])

    # A negative score is reported with its sign flipped
    vertical_symmetry = abs(1 - (vertical_asymmetry/total))
    horisontal_symmetry = abs(1 - (horisontal_asymmetry/total))

    return (vertical_symmetry, horisontal_symmetry)

In [None]:
# Symmetry scores (vertical, horisontal) of the current example mask
asy = symmetric(seg3)
print(asy)

In [None]:
# #Runs through all melanoma lesions and all healthy lesions and calculates their symmetry; takes roughly 1 second per picture
# sym_melanoma = [[],[]]
# sym_not_melanoma = [[],[]]

# for i in range(len(id_melanoma)):
#     seg = plt.imread(f'../data/example_segmentation/{list(id_melanoma)[i]}_segmentation.png')
#     V, H = symmetric(seg)
#     sym_melanoma[0].append(V)
#     sym_melanoma[1].append(H)
    
# for i in range(len(id_healthy)):
#     seg = plt.imread(f'../data/example_segmentation/{list(id_healthy)[i]}_segmentation.png')
#     V, H = symmetric(seg)
#     sym_not_melanoma[0].append(V)
#     sym_not_melanoma[1].append(H)

In [None]:
# NOTE: `sym_melanoma` and `sym_not_melanoma` are only filled by the
# commented-out loop in the previous cell; run that loop first.
# Red/blue dots are the individual melanoma/healthy lesions, the orange and
# green dots are the respective group means.
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.scatter(sym_melanoma[0], sym_melanoma[1], color='r')
ax.scatter(np.mean(sym_melanoma[0]), np.mean(sym_melanoma[1]), color='orange')
ax.scatter(sym_not_melanoma[0], sym_not_melanoma[1], color='b')
ax.scatter(np.mean(sym_not_melanoma[0]), np.mean(sym_not_melanoma[1]), color='green')
ax.set_xlabel('Vertical')
ax.set_ylabel('Horisontal')
ax.set_title('scatter plot')
plt.show()

In [None]:
# Mean vertical and horisontal symmetry: melanoma first, then healthy
print(np.mean(sym_melanoma[0]), np.mean(sym_melanoma[1]))
print(np.mean(sym_not_melanoma[0]), np.mean(sym_not_melanoma[1]))

# Loading MORE data

In [None]:
#local path to the 2017 data set
# Folder prefixes for the pictures and for the segmentation masks
path_img = "../../../../../../documents/2017/ISIC-2017_Training_Data/ISIC-2017_Training_Data/"
path_segment = "../../../../../../documents/2017/ISIC-2017_Training_Part1_GroundTruth/ISIC-2017_Training_Part1_GroundTruth/"

# Ground-truth table of the 2017 data set
table_2017 = pd.read_csv("../../../../../../documents/2017/ISIC-2017_Training_Part3_GroundTruth.csv")

In [None]:
# Image ids labelled melanoma, and image ids where both labels are 0.0
melanoma_2017 = list(table_2017["image_id"][table_2017["melanoma"] == 1.0])
healthy_2017 = list(table_2017["image_id"][(table_2017["melanoma"] == 0.0) & (table_2017["seborrheic_keratosis"] == 0.0)])
# Size of both groups
print(len(melanoma_2017),len(healthy_2017))

In [None]:
# Load the photo and mask of the melanoma lesion at position 100 of melanoma_2017
img = plt.imread(f"{path_img}{melanoma_2017[100]}.jpg")
seg = plt.imread(f"{path_segment}{melanoma_2017[100]}_segmentation.png")
# Show the photo (left) next to its mask (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))
axes[0].imshow(img)
axes[1].imshow(seg, cmap='gray')

### Going through the whole 2017 data set (Has already been done, don't run this code again)

In [None]:
# symmetry_melanoma = [[],[]]
# symmetry_healthy = [[],[]]

# for i in range(len(melanoma_2017)):
#     seg = plt.imread(f"{path_segment}{melanoma_2017[i]}_segmentation.png")
#     V, H = symmetric(seg)
#     symmetry_melanoma[0].append(V)
#     symmetry_melanoma[1].append(H)
    
# for i in range(len(healthy_2017)):
#     seg = plt.imread(f"{path_segment}{healthy_2017[i]}_segmentation.png")
#     V, H = symmetric(seg)
#     symmetry_healthy[0].append(V)
#     symmetry_healthy[1].append(H)

In [None]:
# NOTE: `symmetry_healthy` and `symmetry_melanoma` are only filled by the
# commented-out loop in the previous cell.
# Blue/red dots are the individual healthy/melanoma lesions, the orange and
# green dots are the melanoma and healthy group means.
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.scatter(symmetry_healthy[0], symmetry_healthy[1], color='b')
ax.scatter(symmetry_melanoma[0], symmetry_melanoma[1], color='r')
ax.scatter(np.mean(symmetry_melanoma[0]), np.mean(symmetry_melanoma[1]), color='orange')
ax.scatter(np.mean(symmetry_healthy[0]), np.mean(symmetry_healthy[1]), color='green')
ax.set_xlabel('Vertical')
ax.set_ylabel('Horisontal')
ax.set_title('scatter plot')
# Save the figure to disk before showing it
plt.savefig("../figures/scatter_symmetry")
plt.show()

In [None]:
# Read the processed feature table and display it; the commented-out lines
# are earlier clean-up steps that are kept for reference
big_table = pd.read_csv("../data/processed/final_balanced.csv")
#big_table = big_table.drop("seborrheic_keratosis", axis = 1)
#big_table = big_table.drop("Unnamed: 0", axis = 1)
#big_table["symmetry"] = (big_table["vertical_symmetry"]+big_table["horisontal_symmetry"])/2
#big_table = big_table.rename(columns = {"diagosis_sum": "diagnosis_sum"})
big_table

In [None]:
tablei = pd.read_csv("../data/sym_col.csv")
# Three consecutive slices by row position, each int(len(tablei)/3) rows long;
# rows beyond 3*int(len(tablei)/3) are not part of any slice.
# NOTE: `train` and `test` are assigned again in the "Splitting the dataset" cell.
train = tablei[0:int(len(tablei)/3)]
test = tablei[int(len(tablei)/3):int(len(tablei)/3)*2]
val = tablei[int(len(tablei)/3)*2:int(len(tablei)/3)*3]
# Display the column names
tablei.columns

In [None]:
#big_table.to_csv('../data/melanoma_healthy_sym.csv', index=False)

# Classifiers
### Splitting the dataset

We split the dataset into three sets of (approximately) equal length: a training set, a test set and a verification set. The verification set will not be touched until we are done with our research.

In [None]:
def _split_ids_in_thirds(ids):
    """Split a list of ids into three consecutive parts: train, test, verification."""
    third = int(round(len(ids)/3, 0))
    # The last part takes everything that is left, so no id is dropped
    return ids[0:third], ids[third:third*2], ids[third*2:]


def _rows_with_ids(table, ids):
    """Return the rows of table whose "image_id" is one of ids."""
    return table[table["image_id"].isin(ids)]


#Id's for the melanoma lesions
train_id, test_id, ver_id = _split_ids_in_thirds(melanoma_2017)

#Creating the data frames for each melanoma set
train_melanoma = _rows_with_ids(big_table, train_id)
test_melanoma = _rows_with_ids(big_table, test_id)
ver_melanoma = _rows_with_ids(big_table, ver_id)


#Id's for the healthy lesions
train_healthy_id, test_healthy_id, ver_healthy_id = _split_ids_in_thirds(healthy_2017)

#Creating the data frames for each healthy set
train_healthy = _rows_with_ids(big_table, train_healthy_id)
test_healthy = _rows_with_ids(big_table, test_healthy_id)
ver_healthy = _rows_with_ids(big_table, ver_healthy_id)


#concat the dataframes
train = pd.concat([train_melanoma, train_healthy])
test = pd.concat([test_melanoma, test_healthy])
ver = pd.concat([ver_melanoma, ver_healthy])

In [None]:
# Display the combined training table
train

## Plotting the differences

### Plotting the area to the perimeter (compactness)

In [None]:
# One panel per data split: blue/red dots are the healthy/melanoma lesions,
# the green and orange dots are the respective group means.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

area_panels = [
    (ax[0], train_healthy, train_melanoma, 'scatter plot train'),
    (ax[1], test_healthy, test_melanoma, 'scatter plot test'),
]
for axis, healthy_df, melanoma_df, panel_title in area_panels:
    axis.scatter(healthy_df["area"], healthy_df["perimeter"], color='b')
    axis.scatter(melanoma_df["area"], melanoma_df["perimeter"], color='r')
    axis.scatter(np.mean(healthy_df["area"]), np.mean(healthy_df["perimeter"]), color='green')
    axis.scatter(np.mean(melanoma_df["area"]), np.mean(melanoma_df["perimeter"]), color='orange')
    axis.set_xlabel('Area')
    axis.set_ylabel("Perimeter")
    axis.set_title(panel_title)
plt.show()

In [None]:
# Kernel density estimate of the "symmetry" column, one curve per "diagnosis_sum" value
sns.displot(list(big_table["symmetry"]), hue = list(big_table["diagnosis_sum"]) ,kind = "kde")

In [None]:
# Vertical against horisontal symmetry from tablei, coloured by diagnosis
#sns.scatterplot(data = tablei, x = "relative_color_variance", y= "average_std", hue = "diagnosis_sum")
sns.scatterplot(data = tablei, x = "vertical_symmetry", y= "horisontal_symmetry", hue = "diagnosis_sum")

### Plotting the vertical symmetry to the horisontal symmetry (symmetry)

In [None]:
# One panel per data split: blue/red dots are the healthy/melanoma lesions,
# the green and orange dots are the respective group means.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

symmetry_panels = [
    (ax[0], train_healthy, train_melanoma, 'scatter plot train'),
    (ax[1], test_healthy, test_melanoma, 'scatter plot test'),
]
for axis, healthy_df, melanoma_df, panel_title in symmetry_panels:
    axis.scatter(healthy_df["vertical_symmetry"], healthy_df["horisontal_symmetry"], color='b')
    axis.scatter(melanoma_df["vertical_symmetry"], melanoma_df["horisontal_symmetry"], color='r')
    axis.scatter(np.mean(healthy_df["vertical_symmetry"]), np.mean(healthy_df["horisontal_symmetry"]), color='green')
    axis.scatter(np.mean(melanoma_df["vertical_symmetry"]), np.mean(melanoma_df["horisontal_symmetry"]), color='orange')
    # Both panels carry the same axis labels
    axis.set_xlabel("Vertical")
    axis.set_ylabel("Horisontal")
    axis.set_title(panel_title)
plt.show()

### Plotting compactness to symmetry

In [None]:
# One panel per data split: blue/red dots are the healthy/melanoma lesions,
# the green and orange dots are the respective group means.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

compactness_panels = [
    (ax[0], train_healthy, train_melanoma, 'Symmetry', 'scatter plot train'),
    (ax[1], test_healthy, test_melanoma, "Symmetry", 'scatter plot test'),
]
for axis, healthy_df, melanoma_df, x_label, panel_title in compactness_panels:
    axis.scatter(healthy_df["symmetry"], healthy_df["compactness"], color='b')
    axis.scatter(melanoma_df["symmetry"], melanoma_df["compactness"], color='r')
    axis.scatter(np.mean(healthy_df["symmetry"]), np.mean(healthy_df["compactness"]), color='green')
    axis.scatter(np.mean(melanoma_df["symmetry"]), np.mean(melanoma_df["compactness"]), color='orange')
    axis.set_xlabel(x_label)
    axis.set_ylabel("Compactness")
    axis.set_title(panel_title)
plt.show()

## Nearest mean classifier

In [None]:
def nearest_mean(data, feature, melanoma_reference=None, healthy_reference=None):
    """
    Nearest mean classifier: labels every lesion in data with the class whose
    mean feature vector is closest, and prints the share of correct and wrong labels.

    Takes a data frame with the columns "compactness", "symmetry" and
    "diagnosis_sum" ("cancerous" / "not cancerous"), and the feature to use:
    "symmetry", "compactness" or "all" (both features, Euclidean distance).
    melanoma_reference / healthy_reference are the frames the class means are
    taken from; when left out, the notebook variables train_melanoma and
    train_healthy are used.
    Returns nothing, the result is printed
    Raises ValueError for an unknown feature
    """
    if melanoma_reference is None:
        melanoma_reference = train_melanoma
    if healthy_reference is None:
        healthy_reference = train_healthy

    if feature not in ("all", "symmetry", "compactness"):
        raise ValueError(f'unknown feature {feature!r}; use "all", "symmetry" or "compactness"')

    #Defining the mean from the reference (training) set
    comp_mean_mel = np.mean(melanoma_reference["compactness"])
    sym_mean_mel = np.mean(melanoma_reference["symmetry"])

    comp_mean_hel = np.mean(healthy_reference["compactness"])
    sym_mean_hel = np.mean(healthy_reference["symmetry"])

    #Keep track of errors
    correct = 0
    fails = 0
    total = len(data)

    #Walk over the rows directly, so repeated image ids cannot break the lookup
    rows = zip(data["compactness"], data["symmetry"], data["diagnosis_sum"])
    for C, sym, diagnosis in rows:
        if feature == "all":
            #Distance to the mean of both melanoma and healthy
            dist_mel = math.sqrt((comp_mean_mel - C)**2+(sym_mean_mel - sym)**2)
            dist_hel = math.sqrt((comp_mean_hel - C)**2+(sym_mean_hel - sym)**2)
        elif feature == "symmetry":
            dist_mel = abs(sym_mean_mel - sym)
            dist_hel = abs(sym_mean_hel - sym)
        else:
            dist_mel = abs(comp_mean_mel - C)
            dist_hel = abs(comp_mean_hel - C)

        #The lesion gets the label of the closer mean (a tie counts as not cancerous)
        predicted = "cancerous" if dist_mel < dist_hel else "not cancerous"

        #Check if our labeling is correct
        if predicted == str(diagnosis).strip():
            correct += 1
        else:
            fails += 1

    print(feature, "correct ", correct/total)
    print(feature, "fails:", fails/total)

In [None]:
# Score the nearest mean classifier on the training set, once per feature choice
for i in ["symmetry", "compactness", "all"]:
    nearest_mean(train, i)

In [None]:
# Score the nearest mean classifier on the test set, once per feature choice
for i in ["symmetry", "compactness", "all"]:
    nearest_mean(test, i)

In [None]:
# NOTE(review): this repeats the previous cell on `test`; presumably it was
# meant for another set (e.g. `ver`) - confirm.
for i in ["symmetry", "compactness", "all"]:
    nearest_mean(test, i)

## Nearest neighbour classifier

In [None]:
def nearest_neighbour(data, feature, reference=None):
    """
    Nearest neighbour classifier: every lesion in data gets the diagnosis of
    the closest lesion in the reference set, and the hit rates are printed.

    Takes a data frame with the columns "compactness", "symmetry",
    "diagnosis_sum" and "melanoma", and the feature to measure the distance
    in: "symmetry", "compactness" or "all" (both features, Euclidean distance).
    reference is the labelled frame to search in; when left out, the notebook
    variable train is used.
    Returns nothing, the result is printed
    Raises ValueError for an unknown feature or an empty reference set
    """
    if reference is None:
        reference = train

    if feature not in ("all", "symmetry", "compactness"):
        raise ValueError(f'unknown feature {feature!r}; use "all", "symmetry" or "compactness"')

    known = list(zip(reference["compactness"], reference["symmetry"], reference["diagnosis_sum"]))
    if not known:
        raise ValueError("the reference set is empty")

    #Keep track of errors
    correct = 0
    fails = 0

    correct_cancer = 0
    correct_healthy = 0

    total = len(data)

    rows = zip(data["compactness"], data["symmetry"], data["diagnosis_sum"])
    for C, sym, diagnosis in rows:
        diagnosis = str(diagnosis).strip()

        #Calculate the distance to all the points in the reference set
        if feature == "all":
            dist = [(math.sqrt((comp - C)**2+(symmetry - sym)**2), diag) for comp, symmetry, diag in known]
        elif feature == "symmetry":
            dist = [(abs(symmetry - sym), diag) for _, symmetry, diag in known]
        else:
            dist = [(abs(comp - C), diag) for comp, _, diag in known]

        #The closest point decides the label (equal distances are ordered by the label itself)
        label = str(min(dist)[1]).strip()

        #Check if our labeling is correct
        if label == diagnosis:
            correct += 1
            if label == "cancerous":
                correct_cancer += 1
            else:
                correct_healthy += 1
        else:
            fails += 1

    #Hit rate per class; nan when data holds no lesion of that class
    n_cancer = len(data[data["melanoma"] == 1.0])
    n_healthy = len(data[data["melanoma"] == 0.0])
    rate_cancer = correct_cancer/n_cancer if n_cancer else float("nan")
    rate_healthy = correct_healthy/n_healthy if n_healthy else float("nan")

    print(feature, "correct ", correct/total)
    print(feature, "fails:", fails/total)
    print(correct_cancer, rate_cancer, correct_healthy, rate_healthy)

In [None]:
# Score the nearest neighbour classifier on the test set using both features
nearest_neighbour(test, "all")

# LOADING EVEN MORE DATA

In [None]:
#local path to the pictures of the bias data set; "{}" is filled in with a picture file name
path_bias = "../../../../../../documents/bias/images/{}"
#path_segment = "../../../../../../documents/2017/ISIC-2017_Training_Part1_GroundTruth/ISIC-2017_Training_Part1_GroundTruth/"

# Metadata table of the bias data set
bias_table = pd.read_csv("../../../../../../documents/bias/metadata.csv")

In [None]:
# Display the metadata table
bias_table

In [None]:
# Number of distinct patient ids
print(len(np.unique(bias_table["patient_id"])))
# Column names of the metadata table
print(bias_table.columns)
# Number of rows for each value 1..6 of the "fitspatrick" column
for i in range(1,7):
    print(i, len(bias_table["fitspatrick"][bias_table["fitspatrick"] == i]))

In [None]:
# Number of pictures whose "diagnostic" value is "NEV"
len(bias_table["img_id"][bias_table["diagnostic"] == "NEV"])

In [None]:

# Load one picture of the bias data set by file name and display it
imgg = plt.imread(path_bias.format("PAT_792_4351_858.png"))
plt.imshow(imgg)

In [None]:
def check_id(id_list, path_picture):
    """
    Counts how many of the given ids have no picture file on disk.

    Takes a list of ids and a path pattern containing "{}", which is filled in
    with each id
    Returns the tuple ("Missing_pictures:", number of missing pictures,
    "Missing segments", 0); segments are currently not checked, so the last
    value is always 0
    """
    # Local import, so the function does not rely on the notebook's import cell
    import os

    # Only test for existence; the files themselves are not read
    missing_picture = sum(
        1 for idd in id_list if not os.path.isfile(path_picture.format(idd))
    )

    # The segment check is disabled, the counter is kept for the return format
    missing_segment = 0

    return ("Missing_pictures:", missing_picture, "Missing segments", missing_segment)

In [None]:
# Check that every picture listed in the metadata table can be found on disk
check_id(bias_table["img_id"], path_bias)

# Open question: running the models with keratosis pictures

In [None]:
# Display the 2017 ground-truth table; the commented-out lines would keep
# only the keratosis rows and drop the "melanoma" column
table_2017
#table_keratosis = table_2017[table_2017["seborrheic_keratosis"] == 1.0]
#table_keratosis = table_keratosis.drop("melanoma", axis = 1)
#table_keratosis

In [None]:
# Scratch calculation: standard deviation of a small list of sample values
np.std([1,0.5,0.75,1,1,0.5,0.5,0.75])
#np.std([1,1,1,1])