In [11]:
import os
import pandas as pd
import numpy as np
import pandas as pd
from pathlib import Path
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection


In [12]:
# Folder holding the training-set ResNet feature files (relative path).
# It is passed to loadImages() to list the `.npy` files it contains.
image_folder = "train_input/resnet_features/"
# Leftover experiment, kept commented out: memory-mapped load of a single array.
#image = np.load('/tmp/123.npy', mmap_mode='r')

def loadImages(path):
    """Return the sorted paths of the `.npy` files located directly in `path`.

    Nothing is loaded from disk: the function only lists the folder, keeps the
    entries whose name ends with '.npy', and joins each one onto `path`.

    Args:
        path: folder to list.

    Returns:
        list of joined paths, sorted in ascending order.
    """
    npy_paths = []
    for entry in os.listdir(path):
        # Keep only the numpy array files; everything else is ignored
        if entry.endswith('.npy'):
            npy_paths.append(os.path.join(path, entry))
    npy_paths.sort()
    return npy_paths

In [13]:
# Count the `.npy` feature files found in the training folder.
# The bare expression is the last line of the cell, so the notebook displays the count.
len(loadImages(image_folder))

279

In [14]:
def get_average_features(filenames, num_location_features=3):
    """Load per-patient feature matrices and aggregate each one by its row mean.

    Every file is loaded with `np.load` and indexed as a 2-D array. Its first
    `num_location_features` columns are dropped and the remaining columns are
    averaged over the rows, giving one feature vector per file.

    Args:
        filenames: non-empty iterable of paths to `.npy` files, one per patient.
        num_location_features: number of leading columns to discard before
            averaging. Defaults to 3, the value that used to be hard-coded;
            pass 0 to keep every column.

    Returns:
        features: np.array of mean features, shape
            `(num_patients, num_columns - num_location_features)`. The original
            docstring states `(num_patients, 2048)` for the ResNet files this
            notebook was written for; that width is not checked here.

    Raises:
        ValueError: if `filenames` is empty.
    """
    # Materialize once so that generators are accepted and emptiness can be tested
    filenames = list(filenames)
    if not filenames:
        # np.stack would raise a far less explicit ValueError on an empty list
        raise ValueError("filenames must contain at least one feature file")

    features = []
    for f in filenames:
        patient_features = np.load(f)

        # Remove location features (but we could use them?)
        patient_features = patient_features[:, num_location_features:]

        # Collapse the rows: one averaged vector per patient
        features.append(np.mean(patient_features, axis=0))

    # Row i of the result corresponds to filenames[i]
    return np.stack(features, axis=0)

In [15]:
def loadFiles(path):
    """List the `.npy` files found directly inside `path`, as sorted paths.

    The arrays themselves are not loaded; only their locations are returned.

    Args:
        path: folder to list.

    Returns:
        list with one entry per name ending in '.npy', each joined onto
        `path`, sorted in ascending order.
    """
    npy_names = filter(lambda name: name.endswith('.npy'), os.listdir(path))
    feature_files = [os.path.join(path, name) for name in npy_names]
    feature_files.sort()
    return feature_files

In [16]:
# Locations of the training and test feature folders. Both paths are relative,
# so they are resolved against the notebook's working directory.
train_dir = Path("train_input/resnet_features")
test_dir = Path("test_input/resnet_features")

# CSV file with the training targets (its "Target" column is read later on).
train_output_filename = Path("training_output.csv")

# Load the whole table into a DataFrame.
train_output = pd.read_csv(train_output_filename)

In [17]:
# Sorted paths of the training `.npy` feature files.
filenames_train = loadFiles(train_dir)

# Patient-level labels, taken from the "Target" column in CSV row order.
labels_train = train_output["Target"].values

# Sanity check: there must be exactly one label per feature file.
# NOTE(review): only the counts are compared. The filenames are sorted
# alphabetically while the labels keep the CSV row order, so this does not
# prove that label i belongs to file i -- confirm that both orderings agree.
assert len(filenames_train) == len(labels_train)

In [18]:
# Sorted paths of the test-set `.npy` feature files.
filenames_test = loadFiles(test_dir)
# Identifier of each test file: its base name with the final extension removed
# (the original author's example: "ID_005").
ids_test = [Path(f).stem for f in filenames_test]

In [19]:
# Build one averaged feature vector per file for both splits. Row i of each
# matrix corresponds to entry i of the matching filename list.
features_train = get_average_features(filenames_train)
features_test = get_average_features(filenames_test)

In [21]:
# Use the average resnet features to predict the labels

# number of runs for cross-validation
num_runs = 10
# number of splits for cross-validation
num_splits = 20

# Logistic regression with L2 penalty. One unfitted instance is enough for all
# runs: scikit-learn clones the estimator before fitting it on each fold.
estimator = sklearn.linear_model.LogisticRegression(penalty="l2", C=1.0, solver="liblinear")

# Per-fold scores of every run; each appended entry has length `num_splits`
aucs = []
accuracies = []

# Repeat the stratified k-fold cross-validation, reshuffling the folds with a
# different seed on every run
for seed in range(num_runs):
    cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # Score both metrics in a single pass over the folds, so each fold is
    # fitted once instead of once per metric. The splitter is seeded, so the
    # folds are the same ones two separate cross_val_score calls would use.
    scores = sklearn.model_selection.cross_validate(estimator, X=features_train, y=labels_train,
                                                    cv=cv, scoring=["roc_auc", "accuracy"], verbose=1)

    aucs.append(scores["test_roc_auc"])
    accuracies.append(scores["test_accuracy"])

# Shape (num_runs, num_splits): the mean/std below pool every fold of every run
aucs = np.array(aucs)
accuracies = np.array(accuracies)

print("Predicting weak labels by mean resnet")
print("AUC: mean {}, std {}".format(aucs.mean(), aucs.std()))

print("Predicting weak labels by mean resnet")
print("Accuracy: mean {}, std {}".format(accuracies.mean(), accuracies.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Predicting weak labels by mean resnet
AUC: mean 0.7345138888888889, std 0.13709738684576842
Predicting weak labels by mean resnet
Accuracy: mean 0.6838736263736263, std 0.11321731891801153


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.3s finished


In [None]:
# Prediction on the test set

# Train a final model on the full training set
estimator = sklearn.linear_model.LogisticRegression(penalty="l2", C=1.0, solver="liblinear")
estimator.fit(features_train, labels_train)

# Keep the second column of predict_proba, i.e. the probability of the class
# listed second in estimator.classes_ (presumably label 1 -- confirm).
preds_test = estimator.predict_proba(features_test)[:, 1]

# Check that predictions are in [0, 1]
assert np.max(preds_test) <= 1.0
assert np.min(preds_test) >= 0.0

# -------------------------------------------------------------------------
# Write the predictions in a csv file, to export them in the suitable format
# to the data challenge platform
# Keep only what follows the "ID_" prefix of each identifier
ids_number_test = [i.split("ID_")[1] for i in ids_test]
# Build the indexed frame in one expression instead of mutating it in place
test_output = pd.DataFrame({"ID": ids_number_test, "Target": preds_test}).set_index("ID")

# Create the output folder first: to_csv does not create missing directories
preds_path = Path("predictions/preds_test_baseline.csv")
preds_path.parent.mkdir(parents=True, exist_ok=True)
test_output.to_csv(preds_path)
