# Final ELN models for different preprocessing methods

In [7]:
import warnings
warnings.filterwarnings('ignore')

from src.data_processing import *
from src.grid_search import *
import os
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import pickle

data_type = 'float32'

In [2]:
pr_auc_scorer = make_scorer(pr_auc_score, greater_is_better=True,
                            needs_proba=True)

In [3]:
input_train = '../data/train_hvg2k_std_integrated.csv'
input_test = '../data/test_hvg2k_std_integrated.csv'

In [11]:
def make_binary(df):
    '''
    Takes a Pandas DataFrame with Count-Per-Million normalized read counts and returns a binarized version of it
    :param df: Pandas DataFrame with a row for each Sample.
    Columns contain Count-Per-Million(CPM) read counts, and possibly meta-data,i.e.
    the Strain, Treatment, RNAi, Biological Age, GEO accession number
    :param filter_genes: Filter columns names by filter_genes to compute the binarization only on the genes.
    :return: A binarized copy of the original data without meta-information
    '''
    df_bin = df.copy()
    df_bin['Median'] = df_bin.median(axis=1)  # calculate the median for each row and append a column
    genes = list(df_bin.columns)
    # divide each value, i.e. CPM, by the sample-median
    df_bin = df_bin[genes].div(df_bin.Median, axis=0)
    # set values smaller than the median to 0 and 1 otherwise
    df_bin[df_bin.isna()] = 0
    df_bin[df_bin <= 1] = 0
    df_bin[df_bin > 1] = 1

    return df_bin

In [14]:
df_test = assign_target(input_test)
test_X = df_test.iloc[:,:-2]
test_y = df_test.target
test_X, test_y = shuffle(test_X, test_y, random_state=42)

df_train = assign_target(input_train)
train = df_train.reset_index()
custom_cv = customized_cv_index(train)
    
train_X = train.iloc[:,1:-2]
train_y = train.target

In [19]:
test_X = make_binary(test_X)

In [20]:
train_X = make_binary(train_X)

In [23]:
# using our parameters
runs_10(train_X, train_y, test_X, test_y, 0.027825594022071243, 0.015848931924611134, 'hvg_lognorm_std_int_median_bin')

100%|██████████| 10/10 [02:55<00:00, 17.56s/it]


auprc: 0.852115491715257 ± 1.6201067799292863e-06


In [24]:
# using the parameters they provided
runs_10(train_X, train_y, test_X, test_y, 0.075, 0.3, 'hvg_lognorm_std_int_median_reuse_bin')

100%|██████████| 10/10 [05:45<00:00, 34.52s/it]


auprc: 0.8361856672864296 ± 3.512808159570376e-06
