In [1]:
# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from scipy import stats


# Load Dataset

In [2]:
# Locations of the available dataset variants.
data_path = 'data/complex_processed_data.csv'
standardized_data_path = 'data/complex_processed_standardized_data.csv'
standardized_poutliers_removed_data_path = (
    'data/complex_processed_standardized_outliers_removed_data.csv'
)

# Only the file at standardized_data_path is read in this cell.
df_solubility = pd.read_csv(filepath_or_buffer=standardized_data_path)

# Process Dataset

Prepare the dataset before creating the model.
The following steps are performed:
* Split the independent variables (features) from the dependent variable (`solubility`);
* Split the dataset into training and testing sets.

In [3]:
# Separate the feature columns (X) from the target column (y)

# Features: every column except 'solubility'. DataFrame.drop returns a new
# frame, so df_solubility itself is left untouched.
df_sol_X = df_solubility.drop(columns=['solubility'])

# Target: kept as a single-column DataFrame (note the double brackets).
df_sol_y = df_solubility[['solubility']]

In [4]:
# Preview the first five rows of the feature frame
df_sol_X.head(n=5)

Unnamed: 0,total_score,score,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,fa_rep,fa_sol,...,hbond_lr_bb,hbond_sc,hbond_sr_bb,lk_ball_wtd,omega,p_aa_pp,pro_close,rama_prepro,ref,time
0,0.743094,0.743094,0.524355,0.810367,0.194955,0.805052,0.204689,0.194424,0.150016,0.181959,...,0.899018,0.865276,0.745269,0.814943,0.064154,0.842755,0.029479,0.322475,0.208127,1.0
1,0.676975,0.676975,0.524355,0.686317,0.305593,0.720613,0.293997,0.271051,0.288532,0.295748,...,0.869617,0.799924,0.590403,0.680818,0.105766,0.764504,0.181751,0.415949,0.510299,1.0
2,0.834347,0.834347,0.524355,0.876691,0.142758,0.869296,0.133184,0.142547,0.125866,0.120619,...,0.855511,0.941145,0.857638,0.858072,0.030711,0.905852,0.017323,0.258,0.190017,0.0
3,0.577278,0.577278,0.442461,0.650845,0.340551,0.656721,0.348769,0.316203,0.326141,0.338899,...,0.709356,0.720251,0.652735,0.635345,0.063897,0.660731,0.099558,0.240302,0.307287,0.0
4,0.58374,0.58374,0.524355,0.693861,0.255558,0.675543,0.336511,0.257846,0.234749,0.303265,...,0.686046,0.681776,0.650759,0.6871,0.041349,0.791688,0.12916,0.355501,0.316987,0.0


In [5]:
# Preview the first five rows of the target frame
df_sol_y.head(n=5)

Unnamed: 0,solubility
0,0.572519
1,0.274809
2,0.618321
3,0.458015
4,0.648855


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

# Support Vector Regression (SVR)

In [7]:
# Cross-validation scheme reused by the cells below:
# `cv` shuffled folds with a fixed seed so every run sees the same partitions.
cv = 10
folds = KFold(cv, shuffle=True, random_state=100)

In [8]:
# Baseline: an RBF-kernel SVR with fixed (non-tuned) hyperparameters,
# scored with the shared cross-validation folds.

svr = SVR(
    kernel='rbf',
    C=1.0,
    gamma='auto',
    epsilon=0.1,
    degree=3,  # only used by the 'poly' kernel; has no effect with 'rbf'
)

scores = cross_validate(
    svr,
    x_train,
    np.ravel(y_train),  # flatten the single-column target to 1-D
    scoring=('r2', 'neg_mean_squared_error'),
    cv=folds,
    return_train_score=True,
)

print("Train R2 score: {}".format(scores['train_r2']))
print("Test R2 score: {}".format(scores['test_r2']))

# Keep the per-fold test R2 so it can be compared with the tuned model later
first_try_results = scores['test_r2']

Train R2 score: [0.21188984 0.22036139 0.19591976 0.2107361  0.21919154 0.26842116
 0.21990612 0.22405558 0.21906073 0.20890184]
Test R2 score: [ 0.24952506  0.10166905  0.40115562  0.23081565  0.21221029 -0.28303423
  0.1801292   0.10694309  0.14179503  0.28495393]


In [9]:
# Fit the baseline on the whole training split, then score it on the
# held-out test split.

y_pred = svr.fit(x_train, np.ravel(y_train)).predict(x_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.21237884506754456


In [10]:
# Adjusted R2 for every cross-validation test score:
#   1 - (1 - R2) * (n - 1) / (n - p - 1)
# NOTE(review): n below is the size of the whole training split, whereas each
# score in scores['test_r2'] is computed on a single held-out fold -- confirm
# that this n is the intended one.

n_observations = y_train.shape[0]
n_independent_variables = len(x_train.columns)

for cross_val_r2 in scores['test_r2']:
    unexplained = 1 - cross_val_r2
    Adj_r2 = 1 - unexplained * (n_observations - 1) / (n_observations - n_independent_variables - 1)
    print(Adj_r2)

0.19733971258727123
0.039202326913430174
0.35951412449359
0.17732932059702777
0.1574302101019851
-0.37225183833883313
0.12311831692236685
0.044843109028393946
0.08211852051196156
0.23523217831738397


## Grid Search

In [11]:
# Hyperparameter grid explored by the search.
# Every combination is tried, even where a parameter has no effect on the
# selected kernel (e.g. 'degree' is only used by the 'poly' kernel).
hyper_params = {
    'kernel': ('linear', 'rbf','poly', 'sigmoid'),
    'C':[1, 1.5, 10, 100],
    'gamma': [1e-7, 1e-4, 'auto', 'scale'],
    'epsilon':[0.1,0.2,0.5,0.3],
    'degree': [1,2,3,4]
    }


# Base estimator for the search. It is intentionally left unfitted:
# GridSearchCV clones it for every candidate/fold, so fitting it here
# beforehand would be wasted work and would not influence the search.
svr = SVR()

# Exhaustive search over hyper_params, scored by R2 on the shared folds.
# refit=True retrains the best candidate on all of x_train once the search ends.
model_cv = GridSearchCV(
    estimator = svr,
    param_grid = hyper_params,
    scoring= 'r2',
    cv = folds,
    verbose = 1,
    return_train_score=True,
    n_jobs = -1,
    refit = True
    )

# Run the search.
# Note: fit() returns the GridSearchCV object itself, so `best_model` is the
# same object as `model_cv`; the refitted winner is model_cv.best_estimator_.
best_model = model_cv.fit(x_train, np.ravel(y_train))

print(model_cv.best_params_)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 1060 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed:   40.6s


{'C': 1.5, 'degree': 1, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Done 7680 out of 7680 | elapsed:  1.1min finished


In [13]:
# Rebuild an SVR from the winning grid-search parameters and cross-validate
# it on the same folds as the baseline.

# best_params_ maps each searched SVR keyword (kernel, C, gamma, epsilon,
# degree) to its selected value, so it can be unpacked directly.
svr_best = SVR(**model_cv.best_params_)

scores = cross_validate(
    svr_best,
    x_train,
    np.ravel(y_train),  # flatten the single-column target to 1-D
    scoring=('r2', 'neg_mean_squared_error'),
    cv=folds,
    return_train_score=True,
)

print("Train R2 score: {}".format(scores['train_r2']))
print("Test R2 score: {}".format(scores['test_r2']))

# Keep the per-fold test R2 of the tuned model for the comparison below
best_parameters_result = scores['test_r2']

Train R2 score: [0.32093988 0.33328547 0.31823966 0.32026569 0.33359129 0.35968942
 0.34059933 0.33424372 0.32346938 0.3282754 ]
Test R2 score: [ 0.27134574  0.13602266  0.3282046   0.31202866  0.19940002 -0.09615614
  0.18304571  0.17160202  0.22694649  0.2720827 ]


In [14]:
# Show the per-fold test R2 of the baseline and of the tuned model, one per line
print(
    'First try: {}'.format(first_try_results),
    'Best parameters: {}'.format(best_parameters_result),
    sep='\n',
)

First try: [ 0.24952506  0.10166905  0.40115562  0.23081565  0.21221029 -0.28303423
  0.1801292   0.10694309  0.14179503  0.28495393]
Best parameters: [ 0.27134574  0.13602266  0.3282046   0.31202866  0.19940002 -0.09615614
  0.18304571  0.17160202  0.22694649  0.2720827 ]


In [16]:
# Fit the tuned model on the whole training split, then score it on the
# held-out test split.
y_pred = svr_best.fit(x_train, np.ravel(y_train)).predict(x_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.15514305038842702


In [17]:
# Adjusted R2 for the tuned model's cross-validation test scores:
#   1 - (1 - R2) * (n - 1) / (n - p - 1)
# NOTE(review): n below is the size of the whole training split, whereas each
# score in best_parameters_result is computed on a single held-out fold --
# confirm that this n is the intended one.

n_observations = y_train.shape[0]
n_independent_variables = len(x_train.columns)

for cross_val_r2 in best_parameters_result:
    unexplained = 1 - cross_val_r2
    Adj_r2 = 1 - unexplained * (n_observations - 1) / (n_observations - n_independent_variables - 1)
    print(Adj_r2)

0.2206777286889462
0.07594476732026856
0.2814903542821776
0.2641895978968528
0.14372915607705827
-0.17237891703444919
0.12623763445367364
0.1139981899299003
0.17319110986660147
0.2214659365425392


# Conclusions

What was done:
* Split dataset in test 20% and train 80%.
* Create a baseline SVR model evaluated with cross-validation, then perform a grid search to find the best hyperparameters.
* Compare the test scores from before the grid search and after
* On grid search: C=1000 did not show better results so it was removed