In [6]:

import sys
import os
sys.path.append('../')
import numpy as np
import xgboost as xgb
import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, Normalizer
import pandas as pd



SAVE_DATA = False

def spearman_scoring(y, y_true):
    return scipy.stats.spearmanr(y, y_true)[0]

def to_pandas(x):
    return pd.DataFrame(x.numpy())


x_train = pd.read_csv('x_train.csv', index_col=0 )
x_test = pd.read_csv('x_test.csv', index_col=0)

y_train = pd.read_csv('y_train.csv', index_col=0)
y_test = pd.read_csv('y_test.csv', index_col=0)



print(x_train)

# Construct params dict
params = {'max_depth': [100],
          'eta': [0.1],
          'alpha': [0.1],
          'lambda': [0.01],
          'subsample': [0.9],
          'colsample_bynode': [0.2]}

# Construct model + classifiers
model = xgb.XGBRegressor()

# Spearmans score
custom_scorer = make_scorer(
    spearman_scoring, greater_is_better=True)

# Construct clf
clf = GridSearchCV(
    model, params, n_jobs=-1, cv=5, 
    scoring=custom_scorer
    )


# Fit train data
print('Fitting...')
clf.fit(x_train, y_train)
print('best params:', clf.best_params_)
print(pd.DataFrame(clf.cv_results_))
print(clf.cv_results_)

# Predict test
print('Predicting...')
preds = clf.predict(x_test)
test_score = scipy.stats.spearmanr(preds, y_test)
preds = clf.predict(x_train)
train_score = scipy.stats.spearmanr(preds, y_train)

# Report
print('Spearman Correlation Score train: {}'.format(train_score))
print('Spearman Correlation Score test: {}'.format(test_score))


# 0.5969965823754 on inter cell-line splits














              0         1         2         3         4         5         6  \
0      0.024102  0.024102  0.024102  0.024102  0.024102  0.029096  0.055319   
1      0.429289  0.388090  0.413063  0.492978  0.461761  0.938754  1.588689   
2      0.078337  0.059689  0.024102  0.034403  0.059377  0.100895  0.165514   
3      0.024102  0.024102  0.024102  0.029721  0.055319  0.036588  0.104017   
4      0.090004  0.159895  0.095900  0.056879  0.060001  0.197355  0.248862   
...         ...       ...       ...       ...       ...       ...       ...   
12549  0.284242  0.294439  0.170821  0.268217  0.446777  0.443031  0.371233   
12550  0.050589  0.024102  0.059689  0.086535  0.104329  0.202974  0.443967   
12551  0.024102  0.048451  0.080604  0.036588  0.079980  0.080604  0.049387   
12552  0.024102  0.046578  0.086535  0.056567  0.060313  0.205159  0.637199   
12553  0.080544  0.270714  0.292566  0.339391  0.763940  1.519388  1.922081   

              7         8         9  ...       130 

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


best params: {'alpha': 0.01, 'colsample_bynode': 0.2, 'eta': 0.1, 'lambda': 0.01, 'max_depth': 100, 'subsample': 0.9}
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_alpha  \
0     273.138302      9.546046         0.173691        0.145687        0.01   

  param_colsample_bynode param_eta param_lambda param_max_depth  \
0                    0.2       0.1         0.01             100   

  param_subsample                                             params  \
0             0.9  {'alpha': 0.01, 'colsample_bynode': 0.2, 'eta'...   

   split0_test_score  split1_test_score  split2_test_score  split3_test_score  \
0            0.73819           0.718414           0.739602           0.725549   

   split4_test_score  mean_test_score  std_test_score  rank_test_score  
0           0.745337         0.733418        0.009901                1  
{'mean_fit_time': array([273.13830209]), 'std_fit_time': array([9.54604572]), 'mean_score_time': array([0.1736907]), 'std_score_time':

In [7]:
clf.cv_results_

{'mean_fit_time': array([273.13830209]),
 'std_fit_time': array([9.54604572]),
 'mean_score_time': array([0.1736907]),
 'std_score_time': array([0.1456871]),
 'param_alpha': masked_array(data=[0.01],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_colsample_bynode': masked_array(data=[0.2],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_eta': masked_array(data=[0.1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_lambda': masked_array(data=[0.01],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[100],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_subsample': masked_array(data=[0.9],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.01,
   'colsample_bynode': 0.2,
   'eta': 0.1,
   'lambda': 0.01,
   'ma