In [4]:
import os
import pandas as pd
import numpy as np
import pandas as pd
from pathlib import Path
import sklearn
import sklearn.metrics
import sklearn.model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [5]:
# Explore ResNet feature matrices
image_folder = "train_input/resnet_features/"
#image = np.load('/tmp/123.npy', mmap_mode='r')

# Function to load folder into arrays and then it returns that same array
def loadImages(path):
    image_files = sorted([os.path.join(path, file)
         for file in os.listdir(path) if file.endswith('.npy')])
    return image_files

In [6]:
# Check number of images in training set
len(loadImages(image_folder))

279

In [7]:
# load first npy and explore data
npy_train = loadImages(image_folder)
matrix_npy = np.load(npy_train[0], mmap_mode='r')

In [8]:
def get_average_features(filenames):
    """Load and aggregate the resnet features by the average.

    Args:
        filenames: list of filenames of length `num_patients` corresponding to resnet features

    Returns:
        features: np.array of mean resnet features, shape `(num_patients, 2048)`
    """
    # Load numpy arrays
    features = []
    for f in filenames:
        patient_features = np.load(f)

        # Remove location features (but we could use them?)
        patient_features = patient_features[:, 3:]

        aggregated_features = np.mean(patient_features, axis=0)
        features.append(aggregated_features)

    features = np.stack(features, axis=0)
    return features

In [9]:
# load feature npy folder into arrays and then it returns that same array of strings
def loadFiles(path):
    feature_files = sorted([os.path.join(path, file)
         for file in os.listdir(path) if file.endswith('.npy')])
    return feature_files

In [10]:
# precise training set and test set relative location
train_dir = Path("train_input/resnet_features")
test_dir = Path("test_input/resnet_features")

train_output_filename = Path("training_output.csv")

train_output = pd.read_csv(train_output_filename)

In [11]:
# Get filenames for train
filenames_train = loadFiles(train_dir)

# Get global labels (patient-wise) for train
labels_train = train_output["Target"].values

# check if the number of observations and labels corresponds
assert len(filenames_train) == len(labels_train)

In [12]:
# Get the numpy filenames for test
filenames_test = loadFiles(test_dir)
# ID list without its suffix (ex: "ID_005")
ids_test = [Path(f).stem for f in filenames_test]

In [13]:
# Get the resnet features and aggregate them by the average
features_train = get_average_features(filenames_train)
features_test = get_average_features(filenames_test)

In [19]:
# Use the average resnet features to predict the labels

# number of runs for cross-validation
num_runs = 5
# number of splits for cross-validation
num_splits = 20

# Multiple cross validations on the training set
aucs = []
for seed in range(num_runs):
    # Use linear Discriminant Analysis
    model = LinearDiscriminantAnalysis()

    cv = sklearn.model_selection.RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_runs, random_state=seed)

    # Cross validation on the training set
    auc = sklearn.model_selection.cross_val_score(model, X=features_train, y=labels_train,
                                                  cv=cv, scoring="roc_auc", verbose=0)
    
    aucs.append(auc)

aucs = np.array(aucs)

print("Predicting weak labels by mean resnet")
print("AUC: mean {}, std {}".format(aucs.mean(), aucs.std()))

Predicting weak labels by mean resnet
AUC: mean 0.7195638888888889, std 0.1486114135770546


In [20]:
# Prediction on the test set

# Train a final model on the full training set
estimator = LinearDiscriminantAnalysis()
estimator.fit(features_train, labels_train)

preds_test = estimator.predict_proba(features_test)[:, 1]

# Check that predictions are in [0, 1]
assert np.max(preds_test) <= 1.0
assert np.min(preds_test) >= 0.0

# -------------------------------------------------------------------------
# Write the predictions in a csv file, to export them in the suitable format
# to the data challenge platform
ids_number_test = [i.split("ID_")[1] for i in ids_test]
test_output = pd.DataFrame({"ID": ids_number_test, "Target": preds_test})
test_output.set_index("ID", inplace=True)
test_output.to_csv("predictions/preds_lda.csv")


<h2>Tune LDA hyperparameters with grid search</h2>

In [23]:
# Use the average resnet features to predict the labels

# number of runs for cross-validation
num_runs = 5
# number of splits for cross-validation
num_splits = 20

# Multiple cross validations on the training set

# Use linear Discriminant Analysis
model = LinearDiscriminantAnalysis()

cv = sklearn.model_selection.RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_runs, random_state=1)

# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
# define search
search = sklearn.model_selection.GridSearchCV(model, grid, scoring='roc_auc', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X=features_train, y=labels_train)
    
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)


Mean Accuracy: 0.722
Config: {'solver': 'svd'}


<p>Finally the default solver gives best result over all solvers so no need to do another prediction.</p>