# Mechanisms of Action (MoA) Prediction - 0 Label Classifier
## Test

In this notebook, we will create a test data prediction pipeline for 0-label records in order to produce a submission file. This prediction pipeline will not end up producing finalised submission files, but will be used to assess the efficacy of our 0-label classifiers, and ultimately tell us where this algorithm belongs in our entire prediction pipeline.

## 1.00 Import Packages

In [None]:
# General packages
import pandas as pd
import numpy as np
import os
import gc
import random
from tqdm import tqdm, tqdm_notebook

import time
import warnings
warnings.filterwarnings('ignore')

# Data vis packages
import matplotlib.pyplot as plt
%matplotlib inline

# Data prep
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA

# Modelling packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as k
# Key layers
from tensorflow.keras.models import load_model
# Cross validation
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
# Report how many GPUs TensorFlow can see
physical_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_gpus))

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Memory growth can only be set before the GPUs are initialised; if they already
# are (e.g. this cell is re-run), TensorFlow raises RuntimeError and we carry on
# with whatever setting is already active.
for physical_gpu in physical_gpus:
    try:
        tf.config.experimental.set_memory_growth(physical_gpu, True)
    except RuntimeError as err:
        print(f'Could not enable memory growth for {physical_gpu.name}: {err}')

# Default distribution strategy and the number of replicas it synchronises
strategy = tf.distribute.get_strategy()
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

# Data access
# NOTE(review): this options object is not passed to any session or config in the
# visible notebook, so on its own it does not change GPU behaviour. It is kept so
# the name stays defined for any code that may still reference it.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)

## 2.00 Read in Data

In [None]:
# Folder that holds the competition input files
input_dir = '../input/lish-moa/'

# Load every table straight from the input folder (no intermediate path variables)
train_features       = pd.read_csv(os.path.join(input_dir, 'train_features.csv'))
test_features        = pd.read_csv(os.path.join(input_dir, 'test_features.csv'))
train_targets_scored = pd.read_csv(os.path.join(input_dir, 'train_targets_scored.csv'))
sample_submission    = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))

# Show the dimensions of each loaded table as a quick sanity check
print(f'train_features shape: \t\t{train_features.shape}')
print(f'test_features shape: \t\t{test_features.shape}')
print(f'train_targets_scored shape: \t{train_targets_scored.shape}')
print(f'sample_submission shape: \t{sample_submission.shape}')

In [None]:
# Scaler instance used as the default `scaler` when transforming the feature set
SCALER_METHOD = RobustScaler()

# Number of stratified cross-validation folds
KFOLDS = 10

# Model family identifier, and the full model name derived from it
MODEL_TO_USE = 'nn'
MODEL_NAME = f'{MODEL_TO_USE}_0_label_classifier'

print(f'Model name: {MODEL_NAME}')

## 3.00 Data Preparation

In [None]:
def transform_target_data(data):
    """
    Collapse a multi-label target table into a single binary target.

    Parameters
    ----------
    data : DataFrame with a 'sig_id' column plus one column per label.

    Returns
    -------
    DataFrame with the same index as `data` and one integer column,
    'has_zero_label', which is 1 where the label columns of a row sum
    to zero and 0 otherwise.
    """
    # Count the labels on each row, ignoring the identifier column
    label_counts = data.drop(columns='sig_id').sum(axis=1)
    # Flag the rows that carry no label at all
    zero_flag = (label_counts == 0).astype('int64')

    return zero_flag.to_frame(name='has_zero_label')

In [None]:
def transform_feature_set(X_train, X_test, y_train, 
                          seed,
                          num_components,
                          verbose=0, 
                          scaler=SCALER_METHOD
                         ):
    """
    Fit the preprocessing steps on X_train and apply them to X_train and X_test.

    Numerical columns are scaled (optional), reduced with a random-forest based
    SelectFromModel, then projected with PCA. Object columns are integer-encoded.
    Every step is fitted on the train data only.

    Parameters
    ----------
    X_train, X_test : DataFrames containing a 'sig_id' column plus the features.
    y_train         : target passed to the feature selector when fitting.
    seed            : random_state for the random forest and for PCA.
    num_components  : n_components handed to PCA.
    verbose         : set to 1 to print progress messages.
    scaler          : object with fit/transform, or None to skip scaling. It is
                      fitted in place; the default is the shared module-level
                      SCALER_METHOD instance, which is refitted on every call.

    Returns
    -------
    X_train, X_test : DataFrames holding the encoded categorical columns
                      followed by the 'pca_'-prefixed numerical columns.
    num_cols        : column names of the 'pca_' features.
    cat_cols        : column names of the categorical features.
    """
    
    feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=seed))
    pca = PCA(n_components=num_components, random_state=seed)
    
    ## DATA PREPARATION ##
    
    # Drop unique ID feature
    X_train = X_train.drop('sig_id', axis=1)
    X_test  = X_test.drop('sig_id', axis=1)
    # Keep the original row labels so the transformed arrays can be re-indexed
    train_idx = list(X_train.index)
    test_idx  = list(X_test.index)
    # Separate data types
    X_train_numerical   = X_train.select_dtypes('number')
    X_test_numerical    = X_test.select_dtypes('number')
    X_train_categorical = X_train.select_dtypes('object').astype('category')
    
    
    ## SCALING ##
    
    if scaler is not None:
        if verbose == 1:
            print('APPLYING SCALER...')
        # Fit on train only, then apply to both sets
        scaler.fit(X_train_numerical)
        X_train_numerical = scaler.transform(X_train_numerical)
        X_test_numerical  = scaler.transform(X_test_numerical)
    
    
    ## FEATURE SELECTION ##
    
    # Feature selection is only run on numerical data
    if verbose == 1:
        print('APPLYING FEATURE SELECTOR...')
    n_features_before = X_train_numerical.shape[1]
    # Fit tree based classifier to select features
    feature_selector.fit(X_train_numerical, y_train)
    X_train_numerical = feature_selector.transform(X_train_numerical)
    X_test_numerical  = feature_selector.transform(X_test_numerical)
    if verbose == 1:
        print(f'{n_features_before - X_train_numerical.shape[1]} features removed in feature selection.')

    
    ## PCA ##
    
    if verbose == 1:
        print('APPLYING PCA...')
    # Fit on train only, then apply to both sets
    pca.fit(X_train_numerical)
    X_train_numerical = pca.transform(X_train_numerical)
    X_test_numerical  = pca.transform(X_test_numerical)
    if verbose == 1:
        print(f'NUMBER OF PRINCIPAL COMPONENTS: {pca.n_components_}')
    # Convert numerical features into pandas dataframe
    X_train_numerical = pd.DataFrame(X_train_numerical, index=train_idx).add_prefix('pca_')
    X_test_numerical  = pd.DataFrame(X_test_numerical, index=test_idx).add_prefix('pca_')
    
    
    ## CATEGORICAL FEATURES ##
    
    # Get categorical and numerical column names
    num_cols = X_train_numerical.columns
    cat_cols = X_train_categorical.columns
    # Encode the test set with the categories learned from the train set, so a
    # given integer code always refers to the same level in both sets. Encoding
    # each set independently only agrees when both contain exactly the same
    # levels; in that case the codes produced here are unchanged. Levels that
    # were not seen in train are encoded as -1.
    X_test_categorical = pd.DataFrame(
        {
            col: pd.Categorical(
                X_test[col],
                categories=X_train_categorical[col].cat.categories,
            ).codes
            for col in cat_cols
        },
        index=X_test.index,
    )
    X_train_categorical = X_train_categorical.apply(lambda x: x.cat.codes)

    
    # Concatenate transformed categorical features with transformed numerical features  
    X_train = pd.concat([X_train_categorical, X_train_numerical], axis=1)
    X_test = pd.concat([X_test_categorical, X_test_numerical], axis=1)
    
    if verbose == 1:
        print(f'TRAIN SHAPE: \t{X_train.shape}')
        print(f'TEST SHAPE: \t{X_test.shape}')
    
    return X_train, X_test, num_cols, cat_cols

In [None]:
# Training features are used as loaded (this is another name for the same frame, not a copy)
X_train = train_features
# Collapse the scored multi-label targets into the single binary 'has_zero_label' target
y_train = transform_target_data(train_targets_scored)

## 4.00 Test Predictions

Because the model training pipeline performed an in-fold Bayesian hyperparameter search for each model, the model architecture is expected to differ slightly across the 10 folds. Consequently, we'll need to do a little manual analysis to prepare the test prediction pipeline before we start making predictions (we won't be able to feed the same dataset into each model - different transformations will be required per model).

In future, I'd like to automate this step. In order to do this, more work will need to be carried out on the train notebook, but due to time constraints and resource limits, we will have to move on for now without making those amendments. 

### 4.01 Prepare Prediction Pipeline

In [None]:
# After manually inspecting, these are the parameters that will affect the model inputs.
# One row per saved model: fold number, PCA components, whether the model has
# embedding inputs, and the seed the model was trained with.
model_parameters = pd.DataFrame([[0, 200, True, 14],
                                 [1, 200, False,14],
                                 [2, 200, True, 14],
                                 [3, 200, True, 14],
                                 [4, 200, True, 14],
                                 [5, 200, False,14],
                                 [6, 200, False,14],
                                 [7, 200, True, 14],
                                 [8, 200, False,14],
                                 [9, 200, False,14],
                                 [0, 200, False,140],
                                 [1, 200, False,140],
                                 [2, 200, False,140],
                                 [3, 200, False,140],
                                 [4, 200, False,140],
                                 [5, 200, False,140],
                                 [6, 200, False,140],
                                 [7, 200, False,140],
                                 [8, 200, False,140],
                                 [9, 200, False,140],
                                ], 
                                columns=['kfold','num_components','use_embedding','seed'])

# Frame that collects the zero label predictions during the test pipeline.
# Prediction columns are added to it in place, so take an explicit copy rather
# than a column selection of sample_submission.
preds_zero_label = sample_submission[['sig_id']].copy()

In [None]:
def make_test_predictions(X_test, 
                          num_components, 
                          use_embedding, 
                          seed, 
                          kfold, 
                          X_train=X_train, 
                          y_train=y_train, 
                          model_name=MODEL_NAME,
                          submission=preds_zero_label):
    """
    Predict on X_test with the model saved for one (kfold, seed) combination.

    The preprocessing is refitted on the training rows of fold `kfold`, applied
    to X_test, and the matching saved model is loaded to make the predictions.

    Parameters
    ----------
    X_test         : test features, including the 'sig_id' column.
    num_components : number of PCA components used for this model.
    use_embedding  : truthy if the model takes one input per categorical
                     column followed by one input with the numerical columns.
    seed           : seed of the model; used for the preprocessing and to
                     locate the weights file.
    kfold          : zero-based fold number, from 0 to KFOLDS - 1.
    X_train        : full training features.
    y_train        : full training target, as a DataFrame.
    model_name     : base name of the saved models.
    submission     : frame that receives the prediction column. It is modified
                     IN PLACE; with the default, successive calls accumulate
                     their columns in the shared `preds_zero_label` frame.

    Returns
    -------
    `submission`, with a new column 'zero_label_fold_<kfold>_seed_<seed>'
    holding the model's predictions.

    Raises
    ------
    ValueError if `kfold` is not a valid fold number.
    """
    
    # Retrieve the training rows of fold `kfold`.
    # Shuffling is off, so the split is deterministic. random_state is left unset
    # because it has no effect without shuffling, and recent scikit-learn
    # versions raise a ValueError when it is given without shuffle=True.
    # NOTE(review): if the training notebook shuffled its folds with the seed,
    # this must become StratifiedKFold(KFOLDS, shuffle=True, random_state=seed)
    # to select the same rows - confirm against the training notebook.
    skf = StratifiedKFold(n_splits=KFOLDS)
    for fold, (tdx, _) in enumerate(skf.split(X_train, y_train)):
        if fold == kfold:
            # End the loop when it gets to kfold so we can retain tdx for kfold
            break
    else:
        # No fold matched: do not silently fall back to the last fold's rows
        raise ValueError(f'kfold must be between 0 and {KFOLDS - 1}, got {kfold}')
    
    # Subset X_train and y_train to the training rows of this fold
    X_train, y_train = X_train.iloc[tdx, :], y_train.iloc[tdx, :]
    
    # Refit the transformations on this fold's training rows and apply them to the test set
    _, X_test, num_cols, cat_cols = transform_feature_set(X_train        = X_train, 
                                                          X_test         = X_test, 
                                                          y_train        = y_train, 
                                                          seed           = seed,
                                                          num_components = num_components)
    
    # Models with embeddings take one input per categorical column plus one
    # input holding all numerical columns
    if use_embedding:
        # np.absolute keeps the categorical codes non-negative
        X_test = [np.absolute(X_test[col]) for col in cat_cols] + [X_test[num_cols]]
        
    # Locate and load the model saved for this fold and seed
    seeded_model_name = f'{model_name}_seed{seed}'
    model_path = os.path.join('weights', seeded_model_name, f'{seeded_model_name}_{kfold}.h5')
    model = load_model(model_path)
    
    # Flatten the prediction array so it is stored as one plain column
    preds = np.ravel(model.predict(X_test))
    
    # Add new column for this fold and seed
    submission[f'zero_label_fold_{kfold}_seed_{seed}'] = preds
        
    return submission

### 4.02 Make Test Predictions

In [None]:
# Make 0_label test predictions for all models created during CV for all seeds
for idx in tqdm(model_parameters.index):
    y_preds = make_test_predictions(
        X_test         = test_features, 
        num_components = model_parameters.iloc[idx]['num_components'], 
        use_embedding  = model_parameters.iloc[idx]['use_embedding'], 
        seed           = model_parameters.iloc[idx]['seed'], 
        kfold          = model_parameters.iloc[idx]['kfold']
    )

y_preds.head()

In [None]:
# Get the mean for all predictions across folds and seeds.
# The per-model columns are picked by name ('zero_label_fold_<k>_seed_<s>')
# rather than by position, and the result is built as a new two-column frame so
# the frame that accumulates the per-model predictions is left untouched.
y_preds = y_preds[['sig_id']].assign(
    zero_label=y_preds.filter(regex=r'^zero_label_fold_').mean(axis=1)
)
y_preds.head()

### 4.03 Create submission for zero label classifier
In order to test the efficacy of our zero label classifier, we'll make a submission and compare it to the results of the sample submission.

We'll have to invert the probabilities for the zero labels, and then multiply the sample submission values (0.5) by the inverted prediction. We'll figure out where this zero label classifier belongs in the overall pipeline based on the difference in leaderboard scores between the transformed submission and the sample submission.

In [None]:
# Invert probabilities: zero_label becomes 1 - (averaged zero label prediction).
# NOTE: this overwrites the column, so running the cell a second time flips it back.
y_preds['zero_label'] = 1 - y_preds['zero_label']
# Show the sample submission for reference
sample_submission.head()

In [None]:
# Attach the inverted zero_label value to every sample submission row
sample_sub = sample_submission.merge(y_preds, on='sig_id')

# Scale every column between the first one (sig_id) and the last one (zero_label)
# by that row's inverted zero_label value
sample_sub.iloc[:, 1:-1] = sample_sub.iloc[:, 1:-1].mul(sample_sub['zero_label'], axis='index')

# An alternative that was tried and left disabled: set all values of a row to 0
# when its zero label probability (1 - zero_label) is at least 0.75.

# The helper column is not part of the submission, so drop it before displaying
sample_sub = sample_sub.drop(columns='zero_label')
sample_sub.head()

In [None]:
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('submissions', exist_ok=True)
sample_sub.to_csv('submissions/submission_zero_label.csv', index=False)