In [15]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import r2_score

import warnings
# Installs a blanket "ignore" filter: with no category/module arguments this
# matches every warning, so warnings raised by later cells are not displayed.
# NOTE(review): consider narrowing this to specific warning categories so that
# genuine problems (e.g. data-conversion or convergence warnings) stay visible.
warnings.filterwarnings('ignore')

In [3]:
# Folder that holds the processed solubility CSV files (relative to this notebook).
data_folder_path = '../data/'

# Build the three dataset paths in one pass; only the standardized one is read below.
data_path, standardized_data_path, standardized_poutliers_removed_data_path = (
    data_folder_path + file_name
    for file_name in (
        'complex_processed_data.csv',
        'complex_processed_standardized_data.csv',
        'complex_processed_standardized_outliers_removed_data.csv',
    )
)

# Load the standardized dataset into a DataFrame.
df_solubility = pd.read_csv(standardized_data_path)

In [4]:
# Separate the dataset into features (X) and target (y) for machine learning.

# Features: a new frame holding every column except the target; the source
# frame `df_solubility` is left untouched.
df_sol_X = df_solubility.drop(columns=['solubility'])

# Target: selected with a one-element list so it stays a single-column DataFrame.
df_sol_y = df_solubility.loc[:, ['solubility']]

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    test_size=0.2,
    train_size=0.8,
    random_state=10,
)

In [6]:
# Folder containing the persisted regression models (relative to this notebook).
models_folder = '../models/'

# NOTE(review): joblib.load unpickles the file, so only load model files from a
# trusted source.
# Load the two persisted models, in the same order as their target names.
lr, svr = (
    joblib.load(models_folder + file_name)
    for file_name in ('lr_model.joblib', 'svr_model.joblib')
)

In [7]:
def _adjusted_r2(r2, n_observations, n_independent_variables):
  """Adjust an R^2 value for the number of predictors.

  Computes 1 - (1 - r2) * (n - 1) / (n - p - 1), where n is the number of
  observations the R^2 was scored on and p is the number of predictors.

  Raises:
    ValueError: if n - p - 1 is not positive (the adjustment is undefined).
  """
  residual_dof = n_observations - n_independent_variables - 1
  if residual_dof <= 0:
    raise ValueError(
      "adjusted R^2 requires n_observations > n_independent_variables + 1 "
      "(got n_observations=%d, n_independent_variables=%d)"
      % (n_observations, n_independent_variables)
    )
  return 1.0 - (1.0 - r2) * (n_observations - 1) / residual_dof


def five_two(reg1, reg2, X, y, metric='default'):
  """Compare two regressors with a 5x2-fold cross-validated paired t statistic.

  Runs five shuffled 2-fold cross-validations (one per fixed seed). In every
  fold a fresh copy of each regressor is fitted on the training half and
  scored on the validation half; the per-fold score difference
  (reg1 - reg2) feeds the statistic

      t = p_1_1 / sqrt(sum_i(s_i^2) / 5)

  where p_1_1 is the difference on the first fold of the first repetition and
  s_i^2 is the variance estimate of the two differences in repetition i.
  (In Dietterich's 5x2cv test this value is compared against a Student t
  distribution with 5 degrees of freedom.)

  Args:
    reg1, reg2: regressors exposing fit(X, y) and predict(X). They are copied
      before fitting, so the objects passed in are never refitted.
    X: feature table; rows are selected with `.iloc`.
    y: target table or series; rows are selected with `.iloc`.
    metric: "adj_r2" scores with adjusted R^2; any other value (including the
      default) scores with plain R^2.

  Returns:
    The t statistic. Per-fold differences and summary statistics are printed
    as a side effect.

  Raises:
    ValueError: with metric="adj_r2", if a validation fold has too few rows
      for the number of feature columns.
  """
  # Local import so the function stays usable when pasted into a fresh cell.
  from sklearn.base import clone

  # Choose seeds for each 2-fold iteration
  seeds = [13, 51, 137, 24659, 347]

  # Score difference for the 1st fold of the 1st iteration
  p_1_1 = 0.0

  # Running sum of the per-iteration variance estimates
  s_sqr = 0.0

  # Score history for both regressors and their differences
  scores_1 = []
  scores_2 = []
  diff_scores = []

  # Iterate through 5 2-fold CV
  for i_s, seed in enumerate(seeds):

    # Split the dataset in 2 parts with the current seed
    folds = KFold(n_splits=2, shuffle=True, random_state=seed)

    # Score differences of the two folds. Kept as a numpy array so the final
    # division follows numpy float semantics (inf/nan instead of an exception
    # when the variance estimate is zero).
    p_i = np.zeros(2)

    # Go through the current 2 folds
    for i_f, (trn_idx, val_idx) in enumerate(folds.split(X)):
      # Split the data
      trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
      val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

      # Fit copies so the caller's (possibly pre-trained) models are not
      # overwritten; safe=False falls back to a deep copy for objects that
      # are not scikit-learn estimators.
      fold_reg1 = clone(reg1, safe=False)
      fold_reg2 = clone(reg2, safe=False)
      fold_reg1.fit(trn_x, trn_y)
      fold_reg2.fit(trn_x, trn_y)

      # Compute scores on the validation half
      score_1 = r2_score(val_y, fold_reg1.predict(val_x))
      score_2 = r2_score(val_y, fold_reg2.predict(val_x))

      if metric == "adj_r2":
        # The R^2 values were scored on the validation rows, so that is the
        # observation count the adjustment must use.
        n_val = len(val_y)
        n_features = val_x.shape[1]
        score_1 = _adjusted_r2(score_1, n_val, n_features)
        score_2 = _adjusted_r2(score_2, n_val, n_features)

      # Keep score history for mean and stdev calculation
      fold_diff = score_1 - score_2
      scores_1.append(score_1)
      scores_2.append(score_2)
      diff_scores.append(fold_diff)
      print("Fold %2d score difference = %.6f" % (i_f + 1, fold_diff))

      # Record score difference for current fold
      p_i[i_f] = fold_diff

      # Keep the score difference of the 1st iteration and 1st fold
      if i_s == 0 and i_f == 0:
        p_1_1 = p_i[i_f]

    # Compute mean of score differences for the current 2-fold CV
    p_i_bar = (p_i[0] + p_i[1]) / 2

    # Compute the variance estimate for the current 2-fold CV
    s_i_sqr = (p_i[0] - p_i_bar) ** 2 + (p_i[1] - p_i_bar) ** 2

    # Add up to the overall variance
    s_sqr += s_i_sqr

  # t value: first difference divided by the square root of the mean variance estimate
  t_bar = p_1_1 / ((s_sqr / len(seeds)) ** .5)

  print("Regression 1 mean score and stdev : %.6f + %.6f" % (np.mean(scores_1), np.std(scores_1)))
  print("Regression 2 mean score and stdev : %.6f + %.6f" % (np.mean(scores_2), np.std(scores_2)))
  print("Score difference mean + stdev : %.6f + %.6f"
        % (np.mean(diff_scores), np.std(diff_scores)))
  print("t_value for the current test is %.6f" % t_bar)

  return t_bar

In [16]:
# Run the 5x2-fold comparison: linear regression (reg1) vs. SVR (reg2) on the full dataset.
five_two(lr, svr, X=df_sol_X, y=df_sol_y)

Fold  1 score difference = -0.091774
Fold  2 score difference = -0.148147
Fold  1 score difference = -0.094295
Fold  2 score difference = -0.315811
Fold  1 score difference = -0.223874
Fold  2 score difference = -0.110085
Fold  1 score difference = -0.080927
Fold  2 score difference = -0.219854
Fold  1 score difference = -0.180492
Fold  2 score difference = -0.171668
Regression 1 mean score and stdev : 0.035916 + 0.077861
Regression 2 mean score and stdev : 0.199609 + 0.039461
Score difference mean + stdev : -0.163693 + 0.070863
t_value for the current test is -0.997938
