In [1]:
# Standard library
import warnings
from pathlib import Path

# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn import metrics

# Scipy libraries
from scipy import stats

# Single import from another libraries
import seaborn as sns
import statsmodels.api as sm
import joblib

warnings.filterwarnings('ignore')


# Utils functions
from utils.utils import kfold, get_adj_r2, read_datasets

# Load Dataset

In [2]:
# Load the pre-split train/test features and targets through the project helper.
# The file names are passed positionally, in the same order as the returned frames.
dataset_files = ('x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv')
x_train, x_test, y_train, y_test = read_datasets(*dataset_files)

In [3]:
# Preview the first rows of the training features
x_train.head()

Unnamed: 0,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,fa_rep,fa_sol,hbond_bb_sc,hbond_lr_bb,hbond_sc,hbond_sr_bb,lk_ball_wtd,omega,p_aa_pp,pro_close,rama_prepro,ref,time,yhh_planarity
0,0.22973,0.763596,0.308557,0.775237,0.338457,0.243961,0.297071,0.233128,0.801774,0.686648,0.810329,0.780078,0.77176,0.201707,0.734554,0.040848,0.712342,0.523591,0.0,0.013655
1,0.22973,0.701397,0.316951,0.706863,0.311703,0.261941,0.278929,0.294983,0.693164,0.630547,0.735858,0.733178,0.694929,0.074615,0.636059,0.06549,0.423793,0.334937,0.0,0.097689
2,0.22973,0.934183,0.104464,0.930745,0.074034,0.117862,0.039392,0.093871,0.99147,0.971738,0.936427,0.821288,0.917112,0.044517,0.954476,0.003597,0.268529,0.001079,0.0,0.002101
3,0.22973,0.503359,0.489959,0.519385,0.506761,0.425321,0.454789,0.470166,0.523125,0.461226,0.521223,0.576508,0.483125,0.79876,0.529375,0.816637,0.601882,0.801711,0.0,0.056723
4,0.22973,0.946765,0.04685,0.936256,0.048643,0.062377,0.041516,0.069711,0.970284,0.89925,0.963758,0.914094,0.91118,0.01158,0.90731,0.019308,0.267537,0.034116,0.0,0.003151


In [4]:
# Preview the first rows of the training target (solubility)
y_train.head()

Unnamed: 0,solubility
0,94
1,66
2,98
3,10
4,23


In [5]:
# Graphic libraries
#import matplotlib.pyplot as plt
#import seaborn as sns

# Yeo-Johnson Transformation

# Normalize the exponential data with the Yeo-Johnson transformation
#normalized_data = stats.yeojohnson(df_sol_y['solubility'])

# plot both together to compare
#fig, ax=plt.subplots(1,2)
#sns.histplot(df_sol_y['solubility'], ax=ax[0])
#ax[0].set_title("Original Data")
#sns.histplot(normalized_data[0], ax=ax[1])
#ax[1].set_title("Normalized data")

#df_sol_y = pd.DataFrame({'solubility' : normalized_data[0]})

# Linear Regression (LR)

In [6]:
# The cross-validation scheme to be used when evaluating the model.
# NOTE(review): kfold() is a project helper from utils.utils; the 10 scores per
# metric printed by cross_validate suggest it yields 10 folds — confirm there.
folds = kfold()

In [7]:
# Evaluate an ordinary least-squares model with the shared cross-validation
# scheme, collecting R2 and negative MSE on both the fitting and the held-out
# part of every fold.

lm = LinearRegression()

scores = cross_validate(
    lm,
    x_train,
    y_train,
    scoring=('r2', 'neg_mean_squared_error'),
    cv=folds,
    return_train_score=True,
)

# One R2 value per fold: "train" is the in-fold fit, "test" is the held-out fold
train_r2, test_r2 = scores['train_r2'], scores['test_r2']
print("Train R2 score: {}".format(train_r2))
print("Test R2 score: {}".format(test_r2))

Train R2 score: [0.25286675 0.23355529 0.23417866 0.23198461 0.24178976 0.2974929
 0.24005319 0.24272515 0.22979474 0.23844723]
Test R2 score: [ 0.04053155  0.18325907  0.17604815  0.21213734  0.12600244 -0.43771814
  0.16370202  0.13143949  0.24589643  0.17161078]


In [8]:
# Fit on the full training set, then report R2 on the held-out test set

y_pred = lm.fit(x_train, y_train).predict(x_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.020430262219981454


In [9]:
# Adjusted R2 for every cross-validation fold.
# NOTE(review): n_observations is the size of the full training set, while each
# R2 in scores['test_r2'] was measured on a single validation fold — confirm
# this is the sample size get_adj_r2 is meant to receive.

n_observations = y_train.shape[0]
n_independent_variables = len(x_train.columns)

for fold_r2 in scores['test_r2']:
    print(get_adj_r2(
        n_observations=n_observations,
        n_independent_variables=n_independent_variables,
        r2_score=fold_r2,
    ))

-0.022799699824254338
0.1293487812459032
0.1216618881353313
0.16013320558446642
0.06831283212384998
-0.5326170268046035
0.10850083798198296
0.0741087614654734
0.19612061387038215
0.11693162680523905


In [10]:
# Refit the same regression with statsmodels to get the full inference table.
# add_constant supplies the intercept column (reported as `const` in the summary).
X2 = sm.add_constant(x_train)
est = sm.OLS(endog=y_train, exog=X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             solubility   R-squared:                       0.238
Model:                            OLS   Adj. R-squared:                  0.188
Method:                 Least Squares   F-statistic:                     4.738
Date:                Sun, 06 Nov 2022   Prob (F-statistic):           5.85e-10
Time:                        12:47:20   Log-Likelihood:                -1539.2
No. Observations:                 324   AIC:                             3120.
Df Residuals:                     303   BIC:                             3200.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                -266.6742    

In [11]:
# p-value of every OLS coefficient (intercept `const` included), as a pandas Series
est2.pvalues

const                  0.196312
dslf_fa13              0.396983
fa_atr                 0.029297
fa_dun                 0.800045
fa_elec                0.032087
fa_intra_rep           0.803053
fa_intra_sol_xover4    0.307993
fa_rep                 0.434938
fa_sol                 0.120615
hbond_bb_sc            0.009231
hbond_lr_bb            0.977642
hbond_sc               0.064916
hbond_sr_bb            0.294278
lk_ball_wtd            0.974143
omega                  0.408372
p_aa_pp                0.501780
pro_close              0.422800
rama_prepro            0.556877
ref                    0.959843
time                   0.894914
yhh_planarity          0.066869
dtype: float64

In [12]:
# List every coefficient whose p-value is at or below the 0.05 significance level
p_values = est2.pvalues
significant = [name for name in p_values.keys() if p_values[name] <= 0.05]
for name in significant:
    print((name, p_values[name]))

('fa_atr', 0.02929731892792759)
('fa_elec', 0.032087008100062965)
('hbond_bb_sc', 0.009230714391811367)


# Saving trained model

In [13]:
# Persist the LinearRegression fitted on the full training set so it can be
# reloaded later without retraining.
filename = '../models/lr_model.joblib'
# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the notebook also runs on a fresh checkout.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lm, filename)

['../models/lr_model.joblib']

# Conclusions

What was done:
* Used `cross_validate` with 10 folds and linear regression. Checked the R2, adjusted R2 and p-values (looked for features with a p-value at or below 0.05).