In [32]:
import os
import pandas as pd
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt

import sklearn
import sklearn.metrics
import sklearn.model_selection
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [33]:
# Explore ResNet feature matrices
image_folder = "train_input/resnet_features/"
#image = np.load('/tmp/123.npy', mmap_mode='r')

# Function to load folder into arrays and then it returns that same array
def loadImages(path):
    image_files = sorted([os.path.join(path, file)
         for file in os.listdir(path) if file.endswith('.npy')])
    return image_files

In [36]:
def get_average_features(filenames):
    """Load and aggregate the resnet features by the average.

    Args:
        filenames: list of filenames of length `num_patients` corresponding to resnet features

    Returns:
        features: np.array of mean resnet features, shape `(num_patients, 2048)`
    """
    # Load numpy arrays
    features = []
    for f in filenames:
        patient_features = np.load(f)

        # Remove location features (but we could use them?)
        patient_features = patient_features[:, 3:]

        aggregated_features = np.mean(patient_features, axis=0)
        features.append(aggregated_features)

    features = np.stack(features, axis=0)
    return features

In [37]:
# load feature npy folder into arrays and then it returns that same array of strings
def loadFiles(path):
    feature_files = sorted([os.path.join(path, file)
         for file in os.listdir(path) if file.endswith('.npy')])
    return feature_files

In [38]:
# precise training set and test set relative location
train_dir = Path("train_input/resnet_features")
test_dir = Path("test_input/resnet_features")

train_output_filename = Path("training_output.csv")

train_output = pd.read_csv(train_output_filename)

<h2>Use locally annotated information</h2>

In [39]:
# old id looks like: ID_387_annotated_tile_0_15_69_30.jpg
# reformat the annotation id for upcoming df joining
# function for label file

def get_new_id(old_id):
    # get rid of the .jpg extension
    old_id = Path(old_id).stem
    # store all characters in a list
    string_list = old_id.split('_')
    # new_id looks like: <patient_id>_<zoom_level>_<x_coord>_<y_coord>
    new_id = f"{string_list[0]}_{string_list[1]}_{string_list[-3]}_{string_list[-2]}_{string_list[-1]}"
    return new_id

In [40]:
# load local annotations (tile-level)
# id goes as follows: <patient_id>_annotated_tile_<tile_id>_<tile_coords>
local_annot = pd.read_csv('train_input/train_tile_annotations.csv')
local_annot.rename(columns={'Unnamed: 0': 'Tile_annotation_id'}, inplace=True)

# add new column new_id
local_annot['new_tile_id'] = local_annot['Tile_annotation_id'].map(get_new_id)

In [41]:
local_annot

Unnamed: 0,Tile_annotation_id,Target,new_tile_id
0,ID_387_annotated_tile_0_15_69_30.jpg,0.0,ID_387_15_69_30
1,ID_387_annotated_tile_1_15_23_53.jpg,0.0,ID_387_15_23_53
2,ID_387_annotated_tile_2_15_58_20.jpg,0.0,ID_387_15_58_20
3,ID_387_annotated_tile_3_15_67_12.jpg,0.0,ID_387_15_67_12
4,ID_387_annotated_tile_4_15_57_20.jpg,0.0,ID_387_15_57_20
...,...,...,...
10119,ID_035_annotated_tile_861_16_73_121.jpg,1.0,ID_035_16_73_121
10120,ID_035_annotated_tile_862_16_67_126.jpg,1.0,ID_035_16_67_126
10121,ID_035_annotated_tile_863_16_24_116.jpg,0.0,ID_035_16_24_116
10122,ID_035_annotated_tile_864_16_69_119.jpg,0.0,ID_035_16_69_119


In [100]:
# Function to load 11 annotated npy files in resnet features folder
def loadAnnotatedData(path):
    feature_files = sorted([os.path.join(path, file)
         for file in os.listdir(path) if file.endswith('_annotated.npy')])
    return feature_files


# Compile all data into a dataframe to form strong supervised dataset (local data)
def dataCompiler(filelist):
    df_data = pd.DataFrame()
    
    # Load numpy arrays
    for f in filelist:
        try:
            patient_features = np.load(f)
            patient_id = Path(f).stem.strip("_annotated")

            # add patient id to features
            df_patient = pd.DataFrame(data=patient_features)
            df_patient['patient_id'] = patient_id

            # add df to global dataframe
            df_data = df_data.append(df_patient, ignore_index=True)
        
        except FileNotFoundError:
            print(f"{f} does not exist.")
    
    # rename dataframe with proper column names
    colnames = ['zoom_level', 'x_coord', 'y_coord'] + [i for i in range(1,2049)] + ['patient_id']
    df_data.columns = colnames
    
    return df_data


# generate new_id for annotated data
# funciton for 2048-dimension feature file

def generate_new_id(patient_id, zoom, x, y):
    element_list = [patient_id, str(int(zoom)), str(int(x)), str(int(y))]
    separator = "_"
    new_id = separator.join(element_list)
    return new_id

In [43]:
# Gives the list of annotated patient npy files
annotatedFiles_train = loadAnnotatedData(train_dir)

# Complete annotated dataset
annotatedData = dataCompiler(annotatedFiles_train)

# add new column new_tile_id
annotatedData['new_tile_id']:str = annotatedData.apply(lambda x: generate_new_id(x.patient_id, x.zoom_level, x.x_coord, x.y_coord),  axis = 1)

In [44]:
annotatedData

Unnamed: 0,zoom_level,x_coord,y_coord,1,2,3,4,5,6,7,...,2041,2042,2043,2044,2045,2046,2047,2048,patient_id,new_tile_id
0,16.0,56.0,117.0,0.063937,0.000000,0.117584,0.005884,0.271771,0.007639,0.000000,...,0.000000,0.010490,0.000000,0.026191,0.000000,0.319746,0.000000,0.089043,ID_035,ID_035_16_56_117
1,16.0,47.0,136.0,0.946910,2.319555,0.036641,0.007913,0.148661,0.071836,0.000000,...,0.022700,0.497255,0.085011,0.000000,0.000000,0.395565,0.758223,0.000000,ID_035,ID_035_16_47_136
2,16.0,38.0,117.0,0.393728,0.000000,0.031280,0.000000,0.041272,0.005072,0.129262,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.015349,0.000000,0.000000,ID_035,ID_035_16_38_117
3,16.0,40.0,128.0,0.231148,0.000000,0.000126,0.000000,0.015671,0.326770,0.345363,...,0.000000,0.027319,0.000000,0.000000,0.000000,0.007136,0.000000,0.000000,ID_035,ID_035_16_40_128
4,16.0,47.0,117.0,0.290326,0.002089,0.032283,0.000000,0.115174,0.012818,0.107415,...,0.000000,0.042446,0.000000,0.000000,0.000000,0.034594,0.000000,0.046535,ID_035,ID_035_16_47_117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10119,15.0,34.0,42.0,0.874400,0.044763,0.413319,0.013475,0.549523,0.565158,0.000000,...,0.296946,0.037002,0.000000,0.125798,0.000000,0.000000,0.000000,0.076320,ID_387,ID_387_15_34_42
10120,15.0,54.0,20.0,0.092193,0.591326,0.217773,0.000000,0.068469,0.000000,0.000000,...,0.003810,0.025986,0.000000,0.000000,0.000000,1.629614,0.506654,0.000000,ID_387,ID_387_15_54_20
10121,15.0,78.0,20.0,0.788472,2.213521,0.000000,0.022404,0.032162,0.449662,0.063583,...,0.018248,0.052106,0.023716,0.324964,0.008178,1.230977,1.443332,0.000000,ID_387,ID_387_15_78_20
10122,15.0,67.0,33.0,0.728631,1.686132,0.000000,0.061719,0.130151,0.286733,0.000000,...,0.009419,0.095885,0.000000,0.000000,0.411714,1.435175,0.483119,0.010892,ID_387,ID_387_15_67_33


<p style="font-weight:bold">Join the features table and label table with newly created new_tile_id</p>

In [45]:
data = annotatedData.merge(local_annot, how='inner', on='new_tile_id')

<p style="font-weight:bold">Separate now local features and target for models</p>

In [46]:
# local features for train (tile-level)
#cols = [i for i in range(1, 2049)]
local_features = np.array(data.iloc[:, 3:2051])

# local targets (tile-level)
local_labels = data["Target"].values

<h2>Apply SVM to local-annotated data</h2>

In [21]:
# Get filenames for train
filenames_train = loadFiles(train_dir)

# Get global labels (patient-wise) for train
labels_train = train_output["Target"].values

# check if the number of observations and labels corresponds
assert len(filenames_train) == len(labels_train)

In [22]:
# Get the numpy filenames for test
filenames_test = loadFiles(test_dir)
# ID list without its suffix (ex: "ID_005")
ids_test = [Path(f).stem for f in filenames_test]

In [23]:
# Get the resnet features and aggregate them by the average
features_train = get_average_features(filenames_train)
features_test = get_average_features(filenames_test)

In [24]:
# given path of a filename, returns a numpy array
def get_tile_features(filename):
    # Load npy to numpy arrays 
    patient_features = np.load(filename)
    
    # Remove location features (but we could use them?)
    patient_features = patient_features[:, 3:]
    return patient_features

In [50]:
# Scale training set and test set together
# can normally have better performance

# Standardize training features and apply standardization to test features
scaler = StandardScaler()
train_X = scaler.fit_transform(local_features)

In [54]:
# Use the tile-level resnet features to predict the labels

# number of runs for cross-validation
num_runs = 3
# number of splits for cross-validation
num_splits = 5

# Multiple cross validations on the local feature training set
aucs = []
accuracies =[]
recalls = []

for seed in range(num_runs):
    #Create a svm Classifier
    clf = svm.SVC(kernel='linear') # Linear Kernel without C parameter

    cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # Cross validation on the training set
    auc = sklearn.model_selection.cross_val_score(clf, X=train_X, y=local_labels,
                                                  cv=cv, scoring="roc_auc", verbose=1)
    accuracy = sklearn.model_selection.cross_val_score(clf, X=train_X, y=local_labels,
                                                  cv=cv, scoring="accuracy", verbose=1)
    recall = sklearn.model_selection.cross_val_score(clf, X=train_X, y=local_labels,
                                                  cv=cv, scoring="recall", verbose=1)
    
    aucs.append(auc)
    accuracies.append(accuracy)
    recalls.append(recall)

aucs = np.array(aucs)
accuracies = np.array(accuracies)
recalls = np.array(recalls)

print("Predicting strong labels by tile-level resnet")
print("AUC: mean {}, std {}".format(aucs.mean(), aucs.std()))

print("Predicting strong labels by tile-level resnet")
print("Accuracy: mean {}, std {}".format(accuracies.mean(), accuracies.std()))

print("Predicting strong labels by tile-level resnet")
print("True Positive Rate: mean {}, std {}".format(recalls.mean(), recalls.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Predicting strong labels by tile-level resnet
AUC: mean 0.919714839559539, std 0.01773988538222485
Predicting strong labels by tile-level resnet
Accuracy: mean 0.9578557068267216, std 0.004225995867068012
Predicting strong labels by tile-level resnet
True Positive Rate: mean 0.714234342223554, std 0.044187437945193704


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished


In [58]:
# Test using the following values for coefficient 'c'
c_coeff = np.array([5**-3, 5**-2, 5**-1, 1 , 5 , 5**2, 5**3])

# number of runs for cross-validation
num_runs = 3
# number of splits for cross-validation
num_splits = 5

# Multiple cross validations on the local feature training set
aucs = []
accuracies =[]
recalls = []

for i in c_coeff:
    # use the C parameter of SVM
    svf = SVC(C=i, kernel='linear')

    cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # Cross validation on the training set
    auc = sklearn.model_selection.cross_val_score(svf, X=train_X, y=local_labels,
                                              cv=cv, scoring="roc_auc", verbose=1)
    accuracy = sklearn.model_selection.cross_val_score(svf, X=train_X, y=local_labels,
                                              cv=cv, scoring="accuracy", verbose=1)
    recall = sklearn.model_selection.cross_val_score(svf, X=train_X, y=local_labels,
                                              cv=cv, scoring="recall", verbose=1)

    print(f"When c_coeff is {i}")
    print("AUC: mean {}, std {}".format(auc.mean(), auc.std()))
    print("Accuracy: mean {}, std {}".format(accuracy.mean(), accuracy.std()))
    print("True Positive Rate: mean {}, std {}".format(recall.mean(), recall.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 0.008
AUC: mean 0.9409161675829241, std 0.009161927498755718
Accuracy: mean 0.9711573220123946, std 0.005224424078312489
True Positive Rate: mean 0.7156128258915194, std 0.04897567906589284


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 0.04
AUC: mean 0.9227796457698807, std 0.018378565926560293
Accuracy: mean 0.9599958522422289, std 0.006192905181682128
True Positive Rate: mean 0.7198281889921087, std 0.06031469059858214


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 0.2
AUC: mean 0.9185350830270054, std 0.02124387175568656
Accuracy: mean 0.9584154101400477, std 0.0062590471388418055
True Positive Rate: mean 0.7184197382878833, std 0.0665063473717086


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 1.0
AUC: mean 0.9185350830270054, std 0.02124387175568656
Accuracy: mean 0.9584154101400477, std 0.0062590471388418055
True Positive Rate: mean 0.7184197382878833, std 0.0665063473717086


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 5.0
AUC: mean 0.9185350830270054, std 0.02124387175568656
Accuracy: mean 0.9584154101400477, std 0.0062590471388418055
True Positive Rate: mean 0.7184197382878833, std 0.0665063473717086


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 25.0
AUC: mean 0.9185350830270054, std 0.02124387175568656
Accuracy: mean 0.9584154101400477, std 0.0062590471388418055
True Positive Rate: mean 0.7184197382878833, std 0.0665063473717086


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


When c_coeff is 125.0
AUC: mean 0.9185350830270054, std 0.02124387175568656
Accuracy: mean 0.9584154101400477, std 0.0062590471388418055
True Positive Rate: mean 0.7184197382878833, std 0.0665063473717086


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


<h2>Grid Search for SVM with RBF Kernel</h2>

In [None]:
# It is usually a good idea to scale the data for SVM training.
scaler = StandardScaler()
X = scaler.fit_transform(local_features)

# Train classifiers
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
grid = sklearn.model_selection.GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, local_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

<p>Finally the grid search of RBF takes too long to find the optimal hyperparameters. Since the data dimension is relatively high, theoractically there is no need for data projection and the linear kernel can reach already a comparable performance. </p>

<h2>Annotate unlabeled positive cases</h2>

In [91]:
# retrieve all positive patient case
positive_patients = train_output[train_output['Target']==1]

In [93]:
positive_patients['npy_ID'] = positive_patients.apply(lambda x: f"ID_{str(x['ID']).zfill(3)}.npy", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_patients['npy_ID'] = positive_patients.apply(lambda x: f"ID_{str(x['ID']).zfill(3)}.npy", axis=1)


In [95]:
# store all positive case npys to an array for compilation
added_pat_list = np.array(positive_patients['npy_ID'])
added_pat_list = [f"train_input/resnet_features/{i}" for i in added_pat_list]

In [101]:
# Compile unlabeled positive cases in training set
posData = dataCompiler(added_pat_list)

# add new column new_tile_id
posData['new_tile_id']:str = posData.apply(lambda x: generate_new_id(x.patient_id, x.zoom_level, x.x_coord, x.y_coord),  axis = 1)

train_input/resnet_features/ID_035.npy does not exist.
train_input/resnet_features/ID_036.npy does not exist.
train_input/resnet_features/ID_041.npy does not exist.
train_input/resnet_features/ID_046.npy does not exist.
train_input/resnet_features/ID_129.npy does not exist.
train_input/resnet_features/ID_166.npy does not exist.
train_input/resnet_features/ID_174.npy does not exist.
train_input/resnet_features/ID_218.npy does not exist.
train_input/resnet_features/ID_243.npy does not exist.
train_input/resnet_features/ID_262.npy does not exist.
train_input/resnet_features/ID_387.npy does not exist.


In [104]:
# create a training test set for prediction
added_features = np.array(posData.iloc[:, 3:2051])

<h2>Train SVM model with tuned hyperparameter and predict on the added unlabeled set</h2>

In [111]:
# Standardize training and test features and apply standardization to test features
scaler = StandardScaler()
train_X = scaler.fit_transform(local_features)
test_X = scaler.transform(added_features)

# Use the tile-level resnet features to predict the labels
# Create a svm Classifier
clf = svm.SVC(C=0.008, kernel='linear', probability=True)

# Train the model using the training sets
clf.fit(train_X, local_labels)

# Predict the response for test dataset
y_pred = clf.predict(test_X)
y_pred_proba = clf.predict_proba(test_X)

In [142]:
# recap prediction info to dataframe
recap = pd.DataFrame(data=y_pred_proba)
recap['Target'] = y_pred
recap['patient_id'] = posData['patient_id']
recap.columns = ['negative', 'positive', 'Target', 'ID']

In [157]:
recap_count = pd.pivot_table(recap, index=['ID'], columns=['Target'], aggfunc={'Target': 'count'})
recap_count.columns = ['NB_neg', 'NB_pos']
recap_count

Unnamed: 0_level_0,NB_neg,NB_pos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
ID_008,985.0,15.0
ID_010,437.0,5.0
ID_013,985.0,15.0
ID_018,984.0,16.0
ID_019,835.0,10.0
...,...,...
ID_375,720.0,280.0
ID_377,948.0,52.0
ID_378,712.0,288.0
ID_390,993.0,7.0


In [158]:
# check if patient-level data to be eliminated
eliminated = recap_count[recap_count['NB_pos'].isnull()]
print(len(eliminated)) # all global prediction points to positive, fitting our expectation

0


In [None]:
# create list to eliminated
eliminated.reset_index(inplace=True)
eli_list = list(eliminated['ID'])

In [203]:
# Do top scoring example selections 
# form a simulated dataset to complete the old locally annotated one 
posData['Target'] = y_pred
posData['positive_proba'] = recap['positive']
neg_examples = posData[posData['positive_proba']<0.003]
pos_examples = posData[posData['positive_proba']>=0.95]
selected_examples = pd.concat([neg_examples, pos_examples], ignore_index=True)

In [211]:
# eliminate those samples in the list from the data set
selected_examples = selected_examples[~selected_examples['patient_id'].isin(eli_list)]

# form simulated dataset from unlabeled data
simulated_features = np.array(selected_examples.iloc[:, 3:2051])
simulated_labels = selected_examples['Target'].values

# add the simulated features and labels to labelised training data
# to form a bigger training set
new_features = np.append(local_features, simulated_features, axis=0)
new_labels = np.append(local_labels, simulated_labels, axis=0)
assert new_features.shape[0] == new_labels.shape[0]

<h2>Take newly-formed dataset as input and predict on the test set</h2>

In [265]:
# Evaluate the model

# number of runs for cross-validation
num_runs = 3
# number of splits for cross-validation
num_splits = 5

# Multiple cross validations on the local feature training set
aucs = []
accuracies =[]
recalls = []

# Standardize training and test features and apply standardization to test features
scaler = StandardScaler()
train_X = scaler.fit_transform(new_features)

for seed in range(num_runs):
    #Create a svm Classifier
    clf = svm.SVC(C=0.008, kernel='linear', probability=True) # Linear Kernel without C parameter

    cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # Cross validation on the training set
    auc = sklearn.model_selection.cross_val_score(clf, X=train_X, y=new_labels,
                                                  cv=cv, scoring="roc_auc", verbose=1)
    accuracy = sklearn.model_selection.cross_val_score(clf, X=train_X, y=new_labels,
                                                  cv=cv, scoring="accuracy", verbose=1)
    recall = sklearn.model_selection.cross_val_score(clf, X=train_X, y=new_labels,
                                                  cv=cv, scoring="recall", verbose=1)
    
    aucs.append(auc)
    accuracies.append(accuracy)
    recalls.append(recall)

aucs = np.array(aucs)
accuracies = np.array(accuracies)
recalls = np.array(recalls)

print("Predicting strong labels by tile-level resnet")
print("AUC: mean {}, std {}".format(aucs.mean(), aucs.std()))
print("Accuracy: mean {}, std {}".format(accuracies.mean(), accuracies.std()))
print("True Positive Rate: mean {}, std {}".format(recalls.mean(), recalls.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 35.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 34.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 33.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 33.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 33.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 33.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Predicting strong labels by tile-level resnet
AUC: mean 0.9928278659609465, std 0.0017427640496536488
Accuracy: mean 0.9920213094740761, std 0.0008569568035782643
True Positive Rate: mean 0.9102156068271996, std 0.01645458678175664


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 33.2min finished


In [224]:
# empty df to store output of all patients
df_output = pd.DataFrame()

# Standardize training and test features and apply standardization to test features
scaler = StandardScaler()
train_X = scaler.fit_transform(new_features)

# Use the tile-level resnet features to predict the labels
# Create a svm Classifier
clf = svm.SVC(C=0.008, kernel='linear', probability=True)

# Train the model using the training sets
clf.fit(train_X, new_labels)

for f in filenames_test: 
    patient_id:str = Path(f).stem.split("ID_")[1]
    test_X = get_tile_features(f)
    
    # do the PCA transformation on test set
    test_X = scaler.transform(test_X)
    
    # Predict the response for test dataset
    y_pred = clf.predict(test_X)
    y_pred_proba = clf.predict_proba(test_X)[:, 1] #keep only positive probability
    
    # Check that predictions are in [0, 1]
    assert np.max(y_pred_proba) <= 1.0
    assert np.min(y_pred_proba) >= 0.0
    
    test_output = pd.DataFrame({"ID": patient_id, "Target": y_pred_proba, "Category": y_pred})
    df_output = df_output.append(test_output, ignore_index=True)

In [231]:
recap = pd.pivot_table(df_output, values='Target', index=['ID'], columns=['Category'], aggfunc={'Target': np.mean})
# rename recap table
colnames = ['negative_proba', 'positive_proba']
recap.columns = colnames

In [232]:
# if positive proba is absent, ie. no positive class detected for a patient
# choose negative proba. Otherwise, use positive proba
def select_final_proba(pos_proba, nega_proba):
    if np.isnan(pos_proba):
        final_proba = nega_proba
    else:
        final_proba = pos_proba
    return final_proba

# apply the function to new column of dataframe
recap['Target'] = recap.apply(lambda x: select_final_proba(x.positive_proba, x.negative_proba), axis=1)

In [235]:
# drop useless columns and save result to csv
output = recap.drop(columns=['negative_proba', 'positive_proba'])
output.to_csv("predictions/SVM_with_simulated_labels.csv")

<h2>Try without simulated data</h2>

In [237]:
# empty df to store output of all patients
df_output = pd.DataFrame()

# Standardize training and test features and apply standardization to test features
scaler = StandardScaler()
train_X = scaler.fit_transform(local_features)

# Use the tile-level resnet features to predict the labels
# Create a svm Classifier
clf = svm.SVC(C=0.008, kernel='linear', probability=True)

# Train the model using the training sets
clf.fit(train_X, local_labels)

for f in filenames_test: 
    patient_id:str = Path(f).stem.split("ID_")[1]
    test_X = get_tile_features(f)
    
    # do the PCA transformation on test set
    test_X = scaler.transform(test_X)
    
    # Predict the response for test dataset
    y_pred = clf.predict(test_X)
    y_pred_proba = clf.predict_proba(test_X)[:, 1] #keep only positive probability
    
    # Check that predictions are in [0, 1]
    assert np.max(y_pred_proba) <= 1.0
    assert np.min(y_pred_proba) >= 0.0
    
    test_output = pd.DataFrame({"ID": patient_id, "Target": y_pred_proba, "Category": y_pred})
    df_output = df_output.append(test_output, ignore_index=True)

In [255]:
recap = pd.pivot_table(df_output, values='Target', index=['ID'], columns=['Category'], aggfunc={'Target': np.mean})
# rename recap table
colnames = ['negative_proba', 'positive_proba']
recap.columns = colnames

# apply the function to new column of dataframe
recap['Target'] = recap.apply(lambda x: select_final_proba(x.positive_proba, x.negative_proba), axis=1)

# drop useless columns and save result to csv
output = recap.drop(columns=['negative_proba', 'positive_proba'])
output.to_csv("predictions/SVM.csv")

<p>Try to add accuracy control on output</p>

In [258]:
def redo_category(category, accur_rate):
    if category == 1:
        draw = np.random.uniform(0.0, 1.0, 1)[0]
        # if randomly generated draw is greater than the recall_ration
        # then its category is changed to 0 (negative)
        if draw > accur_rate:
            new_category = 0
        else: 
            new_category = 1
    else:
        new_category = 0
    return new_category

In [261]:
# add new column to df that takes into consideration the TPR
accur = 0.9712
df_output['New_Category'] = df_output.apply(lambda x: redo_category(x.Category, accur), axis = 1)
recap = pd.pivot_table(df_output, values='Target', index=['ID'], columns=['Category'], aggfunc={'Target': np.mean})

# rename recap table
colnames = ['negative_proba', 'positive_proba']
recap.columns = colnames

In [262]:
# if positive proba is absent, ie. no positive class detected for a patient
# choose negative proba. Otherwise, use positive proba
def select_final_proba(pos_proba, nega_proba):
    if np.isnan(pos_proba):
        final_proba = nega_proba
    else:
        final_proba = pos_proba
    return final_proba

# apply the function to new column of dataframe
recap['Target'] = recap.apply(lambda x: select_final_proba(x.positive_proba, x.negative_proba), axis=1)

In [264]:
# drop useless columns and save result to csv
output = recap.drop(columns=['negative_proba', 'positive_proba'])
output.to_csv("predictions/SVM_with_accuracy_control.csv")