# This is an example file showing how to train the model using the Rg_data.npy dataset.


### Import all required packages

In [None]:
# Import required packages
import numpy as np
from sklearn.decomposition import PCA
import random
from numpy import sqrt
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

# Import regression models
from sklearn.svm import SVR
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.linear_model import Ridge

import utils

# Load the data and create lists for the sequences, Rg and nu

In [None]:
# Load the single object stored in the .npy file and unwrap it with .item().
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load files
# that come from a trusted source.
seq = np.load("Rg_data.npy", allow_pickle=True).item()

# Entries are looked up by the consecutive integer keys 0 .. len(seq) - 1.
records = [seq[idx] for idx in range(len(seq))]

# Each record holds the sequence at [0]; Rg and nu are read from the nested
# entry at [1][1], positions 1 and 3 respectively.
data_seq = [rec[0] for rec in records]
Rg = [rec[1][1][1] for rec in records]
nu = [rec[1][1][3] for rec in records]

## Featurization

In [None]:
# Build every available featurization of the sequences in one pass. The
# encoders are called in the listed order: count encoding, ordinal encoding,
# one-hot encoding and bag of amino acids.
CE, OE, OHE, BAA = (
    encode(data_seq)
    for encode in (
        utils.seqs_to_count_encoding,
        utils.seqs_to_ordinal_encoding,
        utils.seqs_to_onehot,
        utils.seqs_to_bag_of_AAs,
    )
)

In [None]:
# Turn the Rg and nu lists into column vectors (one row per sequence).
Rg, nu = (np.array(values).reshape(-1, 1) for values in (Rg, nu))

# Choose the features
X = CE

# Targets side by side: Rg in column 0, nu in column 1.
Y = np.hstack([Rg, nu])

# Define the dataset split
Set the random seed, the cross-validation fold, and the learning-curve split.

In [None]:
# Cross-validation fold, random seed and learning-curve split used below.
fold, seed, split = 6, 10, 8

# Derive c and CL from the ordinal encoding; c is what the split helpers take.
c, CL = utils.get_CL_from_OE(OE)

# Cross-validation train/test indices, consumed by the hyperparameter tuning.
Train_indices, Test_indices = utils.CV_split_CL(fold, seed, c)


# Get the best parameters
Here, we do the hyperparameter tuning. The function needs the following inputs: the name of the regression model, X, Y, Train_indices, Test_indices, the cross-validation fold, the parameter list, and whether to train on Rg or not (the default is to train on Rg: Train_Rg = True).

The parameter list is set up as follows:
LRR: A list of values to test for the parameter alpha.
KRR: A list consisting of two sublists. The first one is for alpha, and the second one is for gamma.
SVR: A list consisting of three sublists. The first one is for C, the second one is for gamma, and the third one is for epsilon.
GPR: A list of values to test for the parameter alpha.

In [None]:
# Candidate values for the SVR grid search.
C_range = [1, 10, 100, 1000]
G_range = [0.001, 0.01, 0.1, 1]
E_range = [0.001, 0.01, 0.1, 1]

# The tuner receives the sublists in the order C, gamma, epsilon.
Parameter_list = [C_range, G_range, E_range]

best_C, best_gamma, best_epsilon = utils.Hyperparameters_Tuning(
    "SVR", X, Y, Train_indices, Test_indices, fold, Parameter_list
)

# Learning curve train test split

In [None]:
# Build the learning-curve train/test split from the fold, split, seed and c
# defined earlier. This rebinds Train_indices and Test_indices, replacing the
# values previously returned by utils.CV_split_CL.
Train_indices,Test_indices = utils.LC_split_CL(fold,split,seed,c)

## Define a model using the best parameters and obtain the learning curve
LC_results is a list containing the train loss, validation loss, train size, validation size, train score and validation score.

In [None]:
# RBF-kernel SVR configured with the tuned hyperparameters.
svr_settings = {
    "kernel": "rbf",
    "C": best_C,
    "gamma": best_gamma,
    "epsilon": best_epsilon,
}
model = SVR(**svr_settings)

# Run the learning curve over the current train/test indices.
LC_results = utils.Learning_curve(
    model, X, Y, Train_indices, Test_indices, fold, split
)

# Final Training

In [None]:
# Split train and test based on indices
# NOTE(review): Train_indices / Test_indices were last rebound by
# utils.LC_split_CL -- confirm they are flat sequences of row indices here.
train_idx = list(Train_indices)
test_idx = list(Test_indices)

# Gather the selected rows and stack them into 2-D arrays.
X_train_unscaled = np.vstack([X[idx] for idx in train_idx])
X_test_unscaled = np.vstack([X[idx] for idx in test_idx])
Y_train_unscaled = np.vstack([Y[idx] for idx in train_idx])
Y_test_unscaled = np.vstack([Y[idx] for idx in test_idx])

# Column 0 of Y is Rg (Y was built by stacking Rg, then nu).
Y_train_Rg = Y_train_unscaled[:, 0]
Y_test_Rg = Y_test_unscaled[:, 0]

# Normalize input: fit the scaler on the training rows only, then apply the
# same transformation to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)



In [None]:
# Train the model

# fit() returns the fitted estimator, so construction and training are chained.
svr_rbf = SVR(
    kernel="rbf",
    C=best_C,
    gamma=best_gamma,
    epsilon=best_epsilon,
).fit(X_train, Y_train_Rg)

# Last expression of the cell: the score of the fitted model on the test set.
svr_rbf.score(X_test, Y_test_Rg)

In [None]:
# Obtain the prediction and the test performance

Y_test_pred = svr_rbf.predict(X_test)

# Print each metric on its own line, in this order: coefficient of
# determination, percent error, MSE, RMSE, MAE.
test_metrics = (
    utils.coeff_determination,
    utils.percent_error,
    utils.MSE,
    utils.RMSE,
    utils.MAE,
)
for metric in test_metrics:
    print(metric(Y_test_Rg, Y_test_pred))

# Extrapolation Test

The extrapolation test is implemented as follows:
1) Define a variable that contains the model with the best parameters
<br>
2) Pass X, Y, Train_indices, Test_indices, the model and the boolean flag forward (default is True) to the function.

The results will give a list containing training size, test loss and test score

In [None]:
# Fresh RBF-kernel SVR with the tuned hyperparameters for the extrapolation run.
model = SVR(
    kernel="rbf",
    C=best_C,
    gamma=best_gamma,
    epsilon=best_epsilon,
)

# forward=True is passed explicitly.
results = utils.extrapolation_test_classical_regression(
    X,
    Y,
    Train_indices,
    Test_indices,
    model,
    forward=True,
)