In [6]:
###Progression bar
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

# Building a melanoma classifier

In [3]:
import os
import numpy as np
import pandas as pd
from skimage.measure import find_contours
from skimage.io import imread
from skimage.transform import resize
from skimage import draw
import matplotlib.pyplot as plt
#from mpl_toolkits.axes_grid1 import AxesGrid
import cv2
#from copy import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.decomposition import PCA
from sklearn import metrics
from IPython.display import HTML
#from mlxtend.feature_selection import SequentialFeatureSelector
import time
from imblearn.over_sampling import SMOTE



%matplotlib inline

In [4]:
df = pd.read_csv('data/train.csv')
X_df = df['ImageId']
y_df = df['Malignant']
X = X_df.values
y = y_df.values

Our training dataset :

In [170]:
df.head()

Unnamed: 0,ImageId,Malignant
0,IM_000498,0
1,IM_000617,0
2,IM_000394,0
3,IM_000244,0
4,IM_000599,0


In [171]:
labels_counts_df = df.groupby('Malignant').count()
labels_counts_df = labels_counts_df.rename(columns={'Malignant': 'count'})
labels_counts_df

Unnamed: 0_level_0,ImageId
Malignant,Unnamed: 1_level_1
0,485
1,115


In [172]:
name_im = X[1]
filename = 'data/im/{}.jpg'.format(name_im)
image = imread(filename)
filename_Segmentation = 'data/im/{}_Segmentation.jpg'.format(name_im)
image_Segmentation = imread(filename_Segmentation) # Value 0 or 255

# Cleaning the data

First, we need to clean the data. To do that, we will only keep the pictures whose segmentation respect all of those criterias :

-  segmentation results with the lesion mask growing into the image border are rejected
-  segmentation results without any detected region are rejected
-  segmentation results comprising fragments at the image borders are rejected

In [212]:
#cdf is the clean dataframe
cdf = df.copy()


for im in X_df:
    #Reading the files
    filename_Segmentation = 'data/im/{}_Segmentation.jpg'.format(im)
    image_Segmentation = imread(filename_Segmentation) # Value 0 or 255
    
    #Rejecting segmentations with "white" in the border or no segmentation at all
    if (np.mean(image_Segmentation[0,:]) != 0 
            or np.mean(image_Segmentation[-2,:]) != 0 
                    or np.mean(image_Segmentation[0,:]) != 0 
                            or np.mean(image_Segmentation[-2,:]) != 0 
                                    or np.mean(image_Segmentation) == 0
                                            or im == 'IM_000877'):
            cdf = cdf[cdf.ImageId != im]

index = cdf['ImageId'].values
y = cdf['Malignant'].values

print("Clean DataFrame has {} elements, while the old one has {} elements".format(len(cdf), len(df)))        

Clean DataFrame has 511 elements, while the old one has 600 elements


In [7]:
cdf.to_csv('data/train_clean.csv')

Otherwise, we can load the previously calculated clean Dataframe :

In [4]:
cdf = pd.read_csv('data/train_clean.csv', index_col = 0)
index = cdf['ImageId'].values
y = cdf['Malignant'].values

In [5]:
cdf.head()

Unnamed: 0,ImageId,Malignant
1,IM_000617,0
2,IM_000394,0
3,IM_000244,0
4,IM_000599,0
6,IM_000279,0


In [6]:
labels_counts_cdf = cdf.groupby('Malignant').count()
labels_counts_cdf = labels_counts_cdf.rename(columns={'Malignant': 'count'})
labels_counts_cdf

Unnamed: 0_level_0,ImageId
Malignant,Unnamed: 1_level_1
0,431
1,80


In [11]:
from PIL import Image

t1 = time.time()

for im in log_progress(X_test , every = 1, name = "Images"):
    image_mul_mask, skin_mul_mask = read_image(im)
    lesion = Image.fromarray(image_mul_mask)
    lesion.save("data/im/processed_ims/{}.jpg".format(im))

t2 = time.time()
print("Effectué en {} secondes".format(round(t2-t1)))
    
    

A Jupyter Widget

Effectué en 146 secondes


# Feature calculation

We will use 26 different features, including color, border, texture and other local features.

In [8]:
def read_image(im):
    filename = 'data/im/{}.jpg'.format(im)
    image = imread(filename)
    filename_Segmentation = 'data/im/{}_Segmentation.jpg'.format(im)
    image_Segmentation = imread(filename_Segmentation) # Value 0 or 255
    #Resizing images
    (h,w,c) = image.shape
    h_new = 300
    w_new = round(h_new*w/h)
    image = resize(image,(h_new,w_new), mode = 'reflect')
    image = (255 * image).astype(np.uint8)
    #Applying the segmentation mask
    image_Segmentation = resize(image_Segmentation,(h_new,w_new), mode='reflect')
    ret,image_Segmentation = cv2.threshold(image_Segmentation,0.5,1,cv2.THRESH_BINARY)
    image_Segmentation_boolean = image_Segmentation.astype(np.uint8) # To get uint8
    image_Segmentation_expand = np.expand_dims(image_Segmentation_boolean, axis=2)
    image_mul_mask = (image_Segmentation_expand*image)
    skin_mul_mask = (1-image_Segmentation_expand)*image
    return image_mul_mask, skin_mul_mask


def dist(coords1, coords2):
    return np.sqrt((coords1[0]-coords2[0])**2 + (coords1[1]-coords2[1])**2)

def get_longest_diagonal(contours):
    max_diag = 0
    c1, c2 = [],[]
    for coords1 in contours:
        for coords2 in contours:
            d = dist(coords1,coords2)
            if (d > max_diag):
                max_diag = d
                c1, c2 = coords1, coords2
    return [c1,c2]

def get_normal_vector(c1,c2):
    x = c1[0]-c2[0]
    y = c1[1]-c2[1]
    if (x==0):
        y=1
    elif (y==0):
        x=1
    else:
        if (abs(x)<abs(y)):
            y = y/x
            x = 1
        else:
            x = -x/y
            y = -1
    return [-y,x]

def is_in(el, arr):
    for e in arr:
        if (e[0]==el[0] and e[1]==el[1]):
            return True
    return False

def get_intersections(start, vec, contour):
    x,y = vec
    if (x==1):
        z=0
    else:
        z=1
    x,y = vec[z], vec[1-z]
    int1,int2 = start[:], start[:]
    a=0
    c=0
    increase = True
    while not(is_in(int1,contour)) and c<200:
        if (x==0):
            int1[1]+=1
        elif (y==0):
            int1[0]+=1
        elif (increase):
            int1[z]+=1
            increase = False
        else:
            int1[1-z]+=np.sign(y)
            a+=1
            if (a==abs(y)):
                a=0
                increase = True
        c+=1
    a=0
    c=0
    increase = True
    while not(is_in(int2,contour)) and c<200:
        if (x==0):
            int2[1]-=1
        elif (y==0):
            int2[0]-=1
        elif (increase):
            int2[z]-=1
            increase = False
        else:
            int2[1-z]-=np.sign(y)
            a+=1
            if (a==abs(y)):
                a=0
                increase = True
        c+=1
            
    return int1, int2           

def change_base(Np):
    Npa = np.array(Np[:])-Np[2]
    Npb = np.array(Np[:])-Np[2]
    if (Npb[3,0] == Npb[1,0]):
        Npb[:,0] = Npa[:,1]
        Npb[:,1] = - Npa[:,0]
    else:
        a = (Npa[3,1] - Npa[1,1])/((Npa[3,0] - Npa[1,0]))
        M = np.array([[1,-a],[a,1]])
        Npb = Npb.dot(M)
    return Npb

def threshold(a,s):
    if (a>s):
        return 1
    elif (a<s):
        return -1
    else:
        return 0

In [67]:
def compute_features(Xf, im):
    ####Reading the files containing the image and the segmentation mask, and combining them
    image_mul_mask, skin_mul_mask = read_image(im)


    ####Computing min, max, average and std of the normalized H and V channels
    im_hsv = cv2.cvtColor(image_mul_mask, cv2.COLOR_RGB2HSV)
    int_not0 = im_hsv[np.nonzero(im_hsv[:,:,2])]     #We remove the non-segmented part
    skin_hsv = cv2.cvtColor(skin_mul_mask, cv2.COLOR_RGB2HSV).astype(np.double)
    skin_int_not0 = skin_hsv[np.nonzero(skin_hsv[:,:,2])]     #We remove the non-segmented part
    #Normalization
    mean_int_skin = np.mean(skin_int_not0[:,2])
    unique, counts = np.unique(skin_int_not0[:,0], return_counts=True)
    n_hue_skin = unique[np.argmax(counts)]
    int_n_not0 = int_not0 - [mean_int_skin, 0, n_hue_skin]
    min_hue, max_hue, mean_hue, std_hue = np.min(int_n_not0[:,0]), np.max(int_n_not0[:,0]), np.mean(int_n_not0[:,0]), np.std(int_n_not0[:,0])
    min_int, max_int, mean_int, std_int = np.min(int_n_not0[:,2]), np.max(int_n_not0[:,2]), np.mean(int_n_not0[:,2]), np.std(int_n_not0[:,2])


    ####Computing perimeter and area
    area = int_not0.shape[0]
    contours = find_contours(image_mul_mask[:,:,2],0)
    contour = np.concatenate(contours).astype('int')
    #image_mul_mask[contour[:,0].astype('int'),contour[:,1].astype('int')].shape   (To get the values of the contour)
    per = len(contour)


    ####Asymmetry features
    c1, c2 = get_longest_diagonal(contour)
    #Getting the perpendicular lines
    starting_points = []
    ratios = []
    p=10
    normal_vec = get_normal_vector(c1,c2)
    for k in range(1,p):
        t=k/10
        starting_points.append([round(t*c1[0]+(1-t)*c2[0]),round(t*c1[1]+(1-t)*c2[1])])
        int1, int2 = get_intersections(starting_points[k-1], normal_vec, contour)
        dist1, dist2 = dist(int1,starting_points[k-1]), dist(int2,starting_points[k-1])
        if (dist2!=0):
            ratios.append(dist1/dist2)
        else:
            ratios.append(0)
    std_asymetry = np.std(ratios)


    ####Border features
    #Small irregularities
    inflexion_points=[0,0,0]
    for p in range(per):
        if (p<2):
            Np = np.concatenate((contour[p-2][np.newaxis],contour[p-1][np.newaxis],contour[p:p+3]))
        elif (p+2>=per):
            Np = np.concatenate((contour[p-2:p+1],contour[(p+1) % per][np.newaxis],contour[(p+2) % per][np.newaxis]))
        else:
            Np = contour[p-2:p+3]
        Npb = change_base(Np)
        Dl = threshold(Npb[0][1],0) + threshold(Npb[1][1],0)
        Dr = threshold(Npb[3][1],0) + threshold(Npb[4][1],0)
        if (min(abs(Dl),abs(Dr))>=1):
            inflexion_points[int(1-np.sign((1+Dl*Dr/abs(Dl*Dr))*(Dl+Dr)))] += 1
    inflexion_points = [x / np.sum(inflexion_points) for x in inflexion_points]
    #Large irregularities
    irregularities=[0,0,0]
    for p in range(per):
        p1, p2, p3 = contour[p-30], contour[p-15], contour[p]
        Vp = (p2[0]-p1[0])*(p3[1]-p1[1]) - (p2[1]-p1[1])*(p3[0]-p1[0])
        irregularities[1 - np.sign(Vp)] += 1
    irregularities = [x / per for x in irregularities]


    Xf.loc[im] = [min_hue, max_hue, mean_hue, std_hue, min_int, max_int, mean_int, std_int, area, per,
                  std_asymetry] + ratios + inflexion_points + irregularities      


In [74]:
cols = ['min_hue', 'max_hue', 'mean_hue', 'std_hue', 'min_int', 'max_int','mean_int', 'std_int', 'area', 'perimeter',
        'std_asymetry']+['Ratio{}'.format(i) for i in range(1,10)]+['s_valleys','s_str_lines','s_peaks', 'l_valleys', 
                                                                    'l_str_lines', 'l_peaks']

Xf = pd.DataFrame(columns = cols, index = index)

t1 = time.time()

for im in log_progress(index , every = 1, name = "Images"):
    compute_features(Xf, im)

t2 = time.time()
print("Effectué en {} secondes".format(round(t2-t1)))

A Jupyter Widget

Effectué en 1879 secondes


Centering and scaling the features :

In [75]:
m,sigma = [],[]

for j in range(Xf.shape[1]):
    m.append(np.mean(Xf.iloc[:,j]))
    sigma.append(np.std(Xf.iloc[:,j]))
    
    Xf.iloc[:,j] = (Xf.iloc[:,j] - m[j])/sigma[j]

In [77]:
Xf.to_csv('data/train_features.csv')

Otherwise, we can load the previously calculated features Dataframe :

In [42]:
Xf = pd.read_csv('data/train_features.csv', index_col = 0)

In [76]:
Xf.head()

Unnamed: 0,min_hue,max_hue,mean_hue,std_hue,min_int,max_int,mean_int,std_int,area,perimeter,...,Ratio6,Ratio7,Ratio8,Ratio9,s_valleys,s_str_lines,s_peaks,l_valleys,l_str_lines,l_peaks
IM_000617,-0.391442,-1.47536,-0.661559,-0.671694,0.921894,0.451743,0.608976,-0.415395,-1.25188,-0.6976,...,0.773113,0.417941,0.643495,0.134792,-0.862803,0.0883643,0.714648,-0.945552,-0.765781,1.45883
IM_000394,-1.25721,0.279076,-1.23143,-0.342971,0.867895,0.918748,0.912507,-0.392405,1.31448,0.572492,...,0.267291,0.314388,0.324018,0.621877,0.529725,0.04859,-0.617437,1.32662,-0.711353,-0.965393
IM_000244,0.361596,0.861025,0.753603,1.67943,-1.58906,-1.66535,-1.42854,-0.271365,0.973543,0.532801,...,-0.470301,-0.356884,-0.281196,-0.489852,1.04223,-0.968856,0.634532,1.60307,-0.466311,-1.40472
IM_000599,-0.167235,0.670914,-0.153786,-0.394943,-0.779076,0.000304635,-0.0823261,0.458542,2.2582,0.603161,...,0.264681,-0.0681243,-0.380471,-0.504349,-0.937228,0.962392,-0.728957,-0.448519,2.94374,-1.31057
IM_000279,-0.431226,-1.47751,-0.653207,-0.690617,0.638399,0.778647,0.816506,-0.879366,-0.707438,-0.39451,...,0.0350194,0.625729,1.80776,1.95528,-0.121596,0.0176165,0.0917462,-0.622206,0.256463,0.499517


# Over-sampling

We use the SMOTE algortihm to generate new samples of the malignant class

In [110]:
X = Xf.values
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_sample(X_train,y_train)

# Feature Selection

In [57]:
sfs = SequentialFeatureSelector(estimator=knn, k_features= 20, forward=True, floating=False)

In [None]:
sfs.fit(X_res, y_res)

X_sfs = sfs.transform(X_res)

# Feature tranformation

Let's compute the PCA of our data :

In [113]:
pca = PCA(n_components=20, whiten = True)

In [114]:
X_pca = pca.fit_transform(X_res)

# Splitting

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Choosing the right model

We'll be using .....

In [12]:
mcc = metrics.make_scorer(metrics.matthews_corrcoef, greater_is_better = True)

metrics.matthews_corrcoef([0,0,1,1], [0,0,1,0])

0.5773502691896258

In [13]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_res, test_size=0.4, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]



print("# Tuning hyper-parameters")
print()

clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                   scoring=mcc)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print()

# Tuning hyper-parameters



  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best parameters set found on development set:

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.302 (+/-0.028) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.000 (+/-0.000) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.421 (+/-0.225) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.324 (+/-0.109) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.510 (+/-0.220) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.422 (+/-0.181) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.607 (+/-0.189) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.432 (+/-0.184) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.413 (+/-0.186) for {'C': 1, 'kernel': 'linear'}
0.420 (+/-0.168) for {'C': 10, 'kernel': 'linear'}
0.420 (+/-0.168) for {'C': 100, 'kernel': 'linear'}
0.413 (+/-0.184) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.




In [15]:
#knn = KNeighborsClassifier()

svm = SVC(C= 1000, gamma = 0.001, kernel = 'rbf')

In [33]:

####kNN
p_grid = {"n_neighbors": np.arange(1, 31, 2),
        "metric": ["euclidean", "cityblock"]}


####SVM
#p_grid = [
#  {'C': [0.1,1, 10, 100, 1000], 'kernel': ['linear']},
#  {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
# ]

In [40]:
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=0)
clf = GridSearchCV(estimator=knn, param_grid=p_grid, cv=inner_cv)
nested_score = cross_val_score(clf, X=X_pca, y=y_res,scoring="accuracy", cv=outer_cv)
print("Average and std Cv score : {0} +- {1}".format(nested_score.mean(), nested_score.std() ))

Average and std Cv score : 0.8898642290630461 +- 0.027375830500070645


In [42]:
clf.fit(X_pca, y_res)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]), 'metric': ['euclidean', 'cityblock']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
clf.best_params_

{'metric': 'cityblock', 'n_neighbors': 1}

# Fitting / Cross Validation

In [115]:
svm = LinearSVC(C=5)

In [116]:
svm.fit(X_pca,y_res)

LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [35]:
mcc = metrics.make_scorer(metrics.matthews_corrcoef, greater_is_better = True)

score = cross_val_score(svm,X=X_pca, y=y_res, cv=10, scoring=mcc)

print(" Average and std CV score : {0} +- {1}".format(score.mean(), score.std() ))

 Average and std CV score : 0.4206183803087927 +- 0.06076768015456214


# Predictions

Let's extract the features from the test dataset:

In [10]:
test_df = pd.read_csv('data/test.csv')
X_test = test_df['ImageId'].values

In [68]:
cols = ['min_hue', 'max_hue', 'mean_hue', 'std_hue', 'min_int', 'max_int','mean_int', 'std_int', 'area', 'perimeter',
        'std_asymetry']+['Ratio{}'.format(i) for i in range(1,10)]+['s_valleys','s_str_lines','s_peaks', 'l_valleys', 
                                                                    'l_str_lines', 'l_peaks']

Xf_test = pd.DataFrame(columns = cols, index = X_test)

t1 = time.time()

for im in log_progress(X_test , every = 1, name = "Images"):
    compute_features(Xf_test, im)

t2 = time.time()
print("Effectué en {} secondes".format(round(t2-t1)))

A Jupyter Widget

Effectué en 1034 secondes


Centering and scaling the data :

In [69]:
m,sigma = [],[]

for j in range(Xf_test.shape[1]):
    m.append(np.mean(Xf_test.iloc[:,j]))
    sigma.append(np.std(Xf_test.iloc[:,j]))
    Xf_test.iloc[:,j] = (Xf_test.iloc[:,j] - m[j])/sigma[j]

In [88]:
Xf_test.to_csv('data/test_features.csv')

Otherwise we can load the previously calculated features dataframe :

In [72]:
Xf_test = pd.read_csv('data/test_features.csv', index_col = 0)

In [73]:
Xf_test.head()

Unnamed: 0,min_hue,max_hue,mean_hue,std_hue,min_int,max_int,mean_int,std_int,area,perimeter,...,Ratio6,Ratio7,Ratio8,Ratio9,s_valleys,s_str_lines,s_peaks,l_valleys,l_str_lines,l_peaks
IM_000773,0.38345,-1.201211,-0.106902,-0.712265,1.156952,0.14567,0.736934,-1.395463,-1.085773,-1.352526,...,-0.254079,-0.254971,-0.620876,-0.137872,-0.162941,0.36392,-0.47364,-1.74372,-0.795965,2.306557
IM_000538,0.192352,0.806828,0.027583,-0.109918,-0.612,0.161862,-0.146065,1.810615,0.967055,0.35013,...,-0.113662,-0.234438,-0.663812,-0.14367,-0.970579,0.955974,-0.655047,-0.292438,0.442331,-0.121053
IM_000274,-0.779312,0.457392,-0.693171,-0.120063,-0.455077,0.485694,0.189924,1.859368,-1.01006,-1.116483,...,0.097648,-0.106542,-0.260152,-0.10414,-0.964192,0.595188,-0.01701,-1.035672,-0.594282,1.477151
IM_000817,0.803248,1.026523,0.817053,1.744181,-2.352421,-2.283069,-2.532417,2.151813,-0.740712,-0.85841,...,0.013283,-0.025997,-0.152877,-0.11278,-0.401195,0.349607,-0.189336,-1.652968,-0.304896,1.789921
IM_000674,0.709198,0.9927,0.43188,0.698612,0.443665,-0.06482,0.253055,-0.474715,-0.681829,-0.433532,...,-0.482514,-0.356696,-0.608603,-0.148741,0.994693,-0.845185,0.430805,0.934355,-0.681988,-0.258235


In [89]:
X_test_pca = pca.transform(Xf_test.values)

And now we make predictions on the test dataset :

In [90]:
prediction = svm.predict(X_test_pca)
test_df['Malignant'] = prediction

test_df['Malignant'] = test_df['Malignant'].astype(int) # This line is mandatory to be sure to have integer
print(test_df.head(3))
test_df.to_csv('data/example_test.csv', index=None, sep=',', mode='w') # Save the data in the exemple_test.csv file 

     ImageId  Malignant
0  IM_000773          0
1  IM_000538          0
2  IM_000274          0


In [91]:
labels_counts_test_df = test_df.groupby('Malignant').count()
labels_counts_test_df = labels_counts_test_df.rename(columns={'Malignant': 'count'})
labels_counts_test_df

Unnamed: 0_level_0,ImageId
Malignant,Unnamed: 1_level_1
0,199
1,101
