In [1]:
##### Progress bar #####
def log_progress(sequence, every=None, size=None, name='Items'):
    """Iterate over ``sequence`` while displaying an ipywidgets progress bar.

    This is a generator: it yields every item of ``sequence`` unchanged and
    refreshes a label + progress-bar widget as it goes.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. When ``size`` is not given, ``len(sequence)``
        is tried; if that raises ``TypeError`` the total is treated as unknown.
    every : int, optional
        Refresh the widget every ``every`` items (the first item always
        triggers a refresh). When omitted and the size is known, it defaults
        to 1 for up to 200 items and ``int(size / 200)`` otherwise. Must be
        given when the size is unknown. Values below 1 are treated as 1.
    size : int, optional
        Total number of items, used as the maximum of the progress bar.
    name : str
        Text shown in front of the counter.

    Yields
    ------
    The items of ``sequence``, in order.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not provided.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    # A refresh interval below 1 would make `index % every` divide by zero
    # (or never match); fall back to refreshing on every item.
    if every is not None and every < 1:
        every = 1

    if is_iterator:
        # Unknown total: show a full bar in the 'info' style instead of a ratio.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        # Deliberately broad: whatever interrupts the iteration (including the
        # generator being closed early) turns the bar red, then propagates.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [19]:
from IPython.display import HTML

import os
import re
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Importing the dataset

In [3]:
# Locations of the serialized Inception graph and of the image folder.
model_path = 'model/inception_dec_2015/tensorflow_inception_graph.pb'
images_dir = 'data/im/'

# Training table: image identifiers and the 'Malignant' label column.
df = pd.read_csv('data/train.csv')
ims, y = df['ImageId'], df['Malignant']

# One JPEG path per training image, in the same order as the table rows.
list_images = list(map('data/im/processed_ims/{}.jpg'.format, ims))

# Creating the graph and extracting the features

In [4]:
# Set up the TensorFlow graph
def create_graph():
    """Import the serialized graph stored at ``model_path`` into the default graph.

    The file is read as bytes, parsed into a ``tf.GraphDef`` and imported with
    an empty name prefix (``name=''``).
    """
    graph_def = tf.GraphDef()
    with gfile.FastGFile(model_path, 'rb') as model_file:
        serialized_graph = model_file.read()
    graph_def.ParseFromString(serialized_graph)
    tf.import_graph_def(graph_def, name='')
# Extract all features from the last pooling layer of InceptionV3
def extract_features(list_images):
    """Run each image through the imported graph and collect the 'pool_3:0' output.

    Parameters
    ----------
    list_images : sequence of str
        Paths of the JPEG files to process. Must support ``len()``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(list_images), 2048)``; row ``i`` holds the
        squeezed 'pool_3:0' output obtained for ``list_images[i]``.
    """
    nb_features = 2048
    features = np.empty((len(list_images), nb_features))
    # NOTE(review): create_graph() imports into the default graph on every
    # call, so calling this function several times in one kernel re-imports
    # the model each time — consider a dedicated tf.Graph if that matters.
    create_graph()
    with tf.Session() as sess:
        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')
        progress = log_progress(list_images, every=1, name="Images")
        for ind, image in enumerate(progress):
            if not gfile.Exists(image):
                tf.logging.fatal('File does not exist %s', image)
            # Read inside a context manager so the file handle is closed
            # right away instead of being left open for every image.
            with gfile.FastGFile(image, 'rb') as image_file:
                image_data = image_file.read()
            predictions = sess.run(
                next_to_last_tensor,
                {'DecodeJpeg/contents:0': image_data}
            )
            features[ind, :] = np.squeeze(predictions)
        return features

In [91]:
# Extract the 'pool_3:0' features for every training image path in list_images.
features_segmented = extract_features(list_images)

A Jupyter Widget

In [92]:
# Persist the extracted training features to disk.
np.save('train_features_inception_segmented.npy', features_segmented)

In [4]:
# NOTE(review): this loads 'train_features_inception.npy', whereas the np.save
# cell writes 'train_features_inception_segmented.npy' — confirm which feature
# file the rest of the notebook is meant to use.
features = np.load('train_features_inception.npy')

# Oversampling

In [127]:
# Oversample the training set with SMOTE (fixed seed).
# NOTE(review): features_res / y_res are not referenced by the later cells
# visible here — the models are fitted on `features` and `y`. Confirm intent.
sm = SMOTE(random_state = 42)
features_res, y_res = sm.fit_sample(features,y)

# Feature transformation

In [12]:
# PCA keeping 600 components, with whitening enabled.
pca = PCA(n_components=600, whiten = True)

In [13]:
# NOTE(review): features_pca is not referenced by the later cells visible
# here — the classifiers are fitted on the untransformed `features`.
features_pca = pca.fit_transform(features)

In [19]:
# Sanity check: dimensions of the training feature matrix.
features.shape

(600, 2048)

In [20]:
# Sanity check: dimensions of the label column.
y.shape

(600,)

# Hyperparameter tuning

In [8]:
# NOTE(review): this rebinds `svm`, which the import cell bound to the
# sklearn.svm module (`from sklearn import svm`); the module is no longer
# reachable under that name once this cell has run.
svm = LinearSVC()

In [9]:
# Candidate values of the regularisation parameter C for the grid search.
p_grid = {'C': [0.01, 0.05,1,5,10,20,50,100]}
# Nested cross-validation: the inner folds are used by the grid search,
# the outer folds score the tuned estimator.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=0)


clf = GridSearchCV(estimator = svm, param_grid = p_grid, cv = inner_cv)
# NOTE(review): nested_score is computed but not displayed in the cells visible here.
nested_score = cross_val_score(clf, X=features, y=y, cv=outer_cv)


In [11]:
# Run the grid search on the full training set.
clf.fit(features,y)

GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.05, 1, 5, 10, 20, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
# Parameter setting selected by the grid search.
clf.best_params_

{'C': 0.01}

# Fitting the chosen classifier

In [16]:
# NOTE(review): C=0.05 is hard-coded here, while the recorded output of
# clf.best_params_ is {'C': 0.01} — confirm which value is intended.
svm = LinearSVC(C=0.05)

In [18]:
# Scorer wrapping the Matthews correlation coefficient.
mcc = metrics.make_scorer(metrics.matthews_corrcoef, greater_is_better = True)

# 10-fold cross-validated MCC of the chosen classifier on the training features.
score = cross_val_score(svm,X=features, y=y, cv=10, scoring=mcc)

print(" Average and std CV score : {0} +- {1}".format(score.mean(), score.std() ))

 Average and std CV score : 0.2818008164319141 +- 0.09622034661506325


In [20]:
# Fit the chosen classifier on the whole training set.
svm.fit(features,y)

LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

# Predictions

In [21]:
# Test table: image identifiers plus the 'Malignant' column as stored in the file.
df_test = pd.read_csv('data/test.csv')
ims_test, y_test = df_test['ImageId'], df_test['Malignant']

# One JPEG path per test image, in the same order as the table rows.
list_images_test = list(map('data/im/processed_ims/{}.jpg'.format, ims_test))

In [99]:
# Extract the 'pool_3:0' features for every test image path in list_images_test.
test_features_segmented = extract_features(list_images_test)

A Jupyter Widget

In [100]:
# Persist the extracted test features to disk.
np.save('test_features_inception_segmented.npy', test_features_segmented)

In [22]:
# NOTE(review): this loads 'test_features_inception.npy', whereas the np.save
# cell writes 'test_features_inception_segmented.npy' — confirm which feature
# file is meant to feed the predictions.
test_features = np.load('test_features_inception.npy')

In [23]:
# Predict a label for every row of the test feature matrix with the fitted classifier.
prediction = svm.predict(test_features)
# Overwrites the 'Malignant' column read from data/test.csv with the predictions.
df_test['Malignant'] = prediction

df_test['Malignant'] = df_test['Malignant'].astype(int) # Cast so the saved labels are integers
df_test.to_csv('data/inception.csv', index=None, sep=',', mode='w') # Write the predictions to data/inception.csv

In [24]:
# Number of test images predicted in each class.
# After groupby('Malignant').count() the 'Malignant' values form the index, so
# renaming a 'Malignant' column does nothing; the counted column is 'ImageId'
# and that is the one to relabel as 'count'.
labels_counts_test_df = df_test.groupby('Malignant').count()
labels_counts_test_df = labels_counts_test_df.rename(columns={'ImageId': 'count'})
labels_counts_test_df

Unnamed: 0_level_0,ImageId
Malignant,Unnamed: 1_level_1
0,262
1,38
