### Work File

In [1]:
# Folder and name of the input table; `filename` is also reused further down
# to derive the names of the grid-search result files.
fold = '../rna-ion-step2/'
filename = "set_minresol_N_5.csv"

### Libraries

In [2]:
from sklearn.metrics.scorer import make_scorer
from class_magnesium_not_drop_na import *
from plot_gridsearch_results import *
import pickle

### Custom metric

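$\mathrm{myMetric} = \frac{TP}{P} - \frac{FP}{N}$, i.e. the true-positive rate minus the false-positive rate ($TP$, $FP$: true / false positives; $P$, $N$: number of actual positives / negatives).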

In [3]:
class metric(object):
    """Builds the custom ``TP/P - FP/N`` scorer and records the rates it sees.

    Every call of the scorer appends the true-positive rate to ``self.tp`` and
    the false-positive rate to ``self.fp`` (in call order), so the individual
    rates can be recovered after a grid search with :meth:`get_tp_fp`.

    Parameters
    ----------
    type_of : str
        ``'custom'`` makes :meth:`my_metric_` return the custom scorer; any
        other value makes it return ``None``.
    """

    def __init__(self, type_of):
        self.tp = []  # TP / P of every scorer call, in call order
        self.fp = []  # FP / N of every scorer call, in call order
        self.type_of = type_of

    @staticmethod
    def _rates(y_true, y_pred):
        """Return ``(TP / P, FP / N)`` for labels coded as 0 / 1."""
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        predicted_pos = (y_pred == 1)
        tp = np.sum((y_true == 1) & predicted_pos)
        fp = np.sum((y_true == 0) & predicted_pos)
        p = np.sum(y_true == 1)
        n = np.sum(y_true == 0)
        # np.true_divide is always a floating-point division, whatever the
        # interpreter's rules for `/` on integer operands are.  An empty
        # class gives nan/inf (with a numpy warning) rather than an exception.
        return np.true_divide(tp, p), np.true_divide(fp, n)

    def my_metric_(self):
        """Return the scorer to hand to ``scoring=``.

        Returns
        -------
        scorer or None
            ``make_scorer(..., greater_is_better=True)`` around the custom
            metric when ``type_of == 'custom'``, otherwise ``None``.
        """
        if self.type_of != 'custom':
            return None

        def my_custom_metric(y_true, y_pred):
            tpr, fpr = self._rates(y_true, y_pred)
            # Side effect: remember both rates for get_tp_fp().
            self.tp.append(tpr)
            self.fp.append(fpr)
            return tpr - fpr

        return make_scorer(my_custom_metric, greater_is_better=True)

    def get_tp_fp_(self, elements, n_splits):
        """De-interleave ``elements`` into two ``(rows, n_splits)`` arrays.

        Even-indexed entries form the first array, odd-indexed entries the
        second one.  NOTE(review): callers treat these as test / train values,
        which assumes the scorer is called alternately on the test and the
        train fold -- confirm for the scikit-learn version in use.

        Raises
        ------
        ValueError
            If ``len(elements)`` is not a multiple of ``2 * n_splits``.
        """
        if len(elements) % (2 * n_splits):
            raise ValueError(
                "expected a multiple of 2 * n_splits = {} recorded values, got {}"
                .format(2 * n_splits, len(elements)))
        n_rows = len(elements) // (2 * n_splits)
        return [np.array(elements[0::2]).reshape(n_rows, n_splits),
                np.array(elements[1::2]).reshape(n_rows, n_splits)]

    def get_tp_fp(self, n_splits):
        """Return ``[tp_even, tp_odd, fp_even, fp_odd]`` (see :meth:`get_tp_fp_`)."""
        return self.get_tp_fp_(self.tp, n_splits) + self.get_tp_fp_(self.fp, n_splits)

### Process function

In [4]:
# gridsearch
def Gridsearch_function(filename, fold, n_splits, type_of_metric, parametres, output_filename):
    """Grid-search a random forest on under-sampled data and save the results.

    Parameters
    ----------
    filename, fold
        Forwarded to ``Magnesium`` (``fold`` as keyword argument).
    n_splits : int
        Number of ``StratifiedShuffleSplit`` repetitions (``test_size=0.3``).
    type_of_metric : str
        ``'custom'`` scores with the scorer from ``metric`` and additionally
        stores the recorded TP / FP rates; any other value passes
        ``scoring=None`` to ``GridSearchCV``.
    parametres : dict
        ``param_grid`` for ``GridSearchCV``.
    output_filename : str
        Name of the tab-separated file written to ``outputs/Gridsearch/``.

    Returns
    -------
    None
        The result table is only written to disk.
    """
    import os  # local import so the function does not depend on another cell

    # Make sure the target folder exists *before* the (long) search runs.
    out_dir = 'outputs/Gridsearch/'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    m = Magnesium(filename, fold = fold,colours = ['#f6c35b', '#929292'], with_groups = False)
    my_metric = metric(type_of_metric)

    # NOTE(review): the forest has no random_state, so repeated runs differ.
    # return_train_score is spelled out because the bookkeeping below splits
    # the recorded rates into alternating test / train values.  The search
    # itself must stay single-process (the default), otherwise the rates
    # recorded inside the scorer would not end up in `my_metric`.
    grid = GridSearchCV(RandomForestClassifier(n_estimators=200, n_jobs=-1, criterion='gini'),
                        scoring=my_metric.my_metric_(), verbose=7,
                        cv=StratifiedShuffleSplit(n_splits=n_splits, test_size=0.3, random_state=0),
                        param_grid=parametres,
                        return_train_score=True)

    rus = RandomUnderSampler(ratio = 'auto', random_state=42)
    x, y = rus.fit_sample(m.x, m.y)
    grid.fit(x, y)

    # Columns are picked by *position*; .iloc makes that explicit and .copy()
    # gives an independent frame that can safely receive new columns.
    # NOTE(review): which cv_results_ keys sit at these positions depends on
    # the column order pandas produces -- confirm, or select by name.
    dataframe = pd.DataFrame(grid.cv_results_).iloc[:, [2, 3, 4, 5, -2, -1]].copy()

    if type_of_metric == 'custom':
        rate_names = ['tp_tests', 'tp_trains', 'fp_tests', 'fp_trains']
        rates = dict(zip(rate_names, my_metric.get_tp_fp(n_splits)))

        # Mean over the splits first, then the individual split columns,
        # keeping the column order of the result file unchanged.
        for name in rate_names:
            dataframe[name + '_mean'] = np.mean(rates[name], axis=1)

        for name in rate_names:
            per_split = rates[name]
            for i in range(per_split.shape[1]):
                dataframe[name + '_split_' + str(i)] = per_split[:, i]

    dataframe.to_csv(out_dir + output_filename, index = False, sep = '\t')

## Auto metric

#### Input data

In [5]:
# Settings for the coarse search with type_of_metric = 'auto'.
n_splits = 3
type_of_metric = 'auto'  # or 'custom'
parametres = {
    "max_depth": [1, 5, 10, 15, 20, 25, 30, 35, 40],
    'min_samples_leaf': [1, 5, 10, 15, 20, 25, 30, 35, 40],
}
# Small grid for a quick trial run: {"max_depth": [1, 5], 'min_samples_leaf': [1]}

gridsearch_number = 1
# Result file name: <input name without .csv>_<metric type>_<search number>.csv
output_filename = '_'.join(map(str, (filename.split('.csv')[0],
                                     type_of_metric,
                                     gridsearch_number))) + '.csv'

#### Gridsearch

#### Open data & Plot

In [6]:
# Read back the tab-separated result table stored under outputs/Gridsearch/
# for the current `output_filename`, then plot it.
grid_data_auto1 = pd.read_table(
    'outputs/Gridsearch/' + output_filename,
    sep='\t',
)
plot_gridsearch_results(grid_data_auto1)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Gridsearch step 2

#### Input data

In [7]:
# Settings for the second, finer search with type_of_metric = 'auto'.
n_splits = 3
type_of_metric = 'auto'  # or 'custom'
parametres = {
    "max_depth": range(20, 41, 2),
    'min_samples_leaf': range(1, 6),
}
# Small grid for a quick trial run: {"max_depth": [1, 5], 'min_samples_leaf': [1]}

gridsearch_number = 2
# Result file name: <input name without .csv>_<metric type>_<search number>.csv
output_filename = '_'.join(map(str, (filename.split('.csv')[0],
                                     type_of_metric,
                                     gridsearch_number))) + '.csv'

#### Gridsearch

#### Open data & Plot

In [8]:
# Read back the tab-separated result table stored under outputs/Gridsearch/
# for the current `output_filename`, then plot it.
grid_data_auto2 = pd.read_table(
    'outputs/Gridsearch/' + output_filename,
    sep='\t',
)
plot_gridsearch_results(grid_data_auto2)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Custom metric

#### Input data

In [9]:
# Settings for the coarse search with type_of_metric = 'custom'.
n_splits = 3
type_of_metric = 'custom'  # or 'auto'
parametres = {
    "max_depth": [1, 5, 10, 15, 20, 25, 30, 35, 40],
    'min_samples_leaf': [1, 5, 10, 15, 20, 25, 30, 35, 40],
}
# Small grid for a quick trial run: {"max_depth": [1, 5], 'min_samples_leaf': [1]}

gridsearch_number = 1
# Result file name: <input name without .csv>_<metric type>_<search number>.csv
output_filename = '_'.join(map(str, (filename.split('.csv')[0],
                                     type_of_metric,
                                     gridsearch_number))) + '.csv'

#### Gridsearch

#### Open data & Plot

In [10]:
# Read back the tab-separated result table stored under outputs/Gridsearch/
# for the current `output_filename`, then plot it for the chosen metric type.
grid_data_custom1 = pd.read_table(
    'outputs/Gridsearch/' + output_filename,
    sep='\t',
)
plot_gridsearch_results(grid_data_custom1, type_of_metric)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Gridsearch step 2

#### Input data

In [11]:
# Settings for the second, finer search with type_of_metric = 'custom'.
n_splits = 3
type_of_metric = 'custom'  # or 'auto'
parametres = {
    "max_depth": range(20, 41, 2),
    'min_samples_leaf': range(1, 6),
}
# Small grid for a quick trial run: {"max_depth": [1, 5], 'min_samples_leaf': [1]}

gridsearch_number = 2
# Result file name: <input name without .csv>_<metric type>_<search number>.csv
output_filename = '_'.join(map(str, (filename.split('.csv')[0],
                                     type_of_metric,
                                     gridsearch_number))) + '.csv'

#### Gridsearch

#### Open data & Plot

In [12]:
# Read back the tab-separated result table stored under outputs/Gridsearch/
# for the current `output_filename`, then plot it for the chosen metric type.
grid_data_custom2 = pd.read_table(
    'outputs/Gridsearch/' + output_filename,
    sep='\t',
)
plot_gridsearch_results(grid_data_custom2, type_of_metric)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>