In [1]:
# Standard library
import warnings
from pathlib import Path

# General libraries
import pandas as pd
import numpy as np

# Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit Learn libraries
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Scipy libraries
from scipy import stats

# Other third-party libraries
import statsmodels.api as sm
import joblib
# NOTE(review): with no category or module argument this silences every
# warning category from this point on — consider narrowing the filter so
# that library deprecation or numerical warnings stay visible.
warnings.filterwarnings('ignore')

# Load Dataset

In [2]:
# Paths to the three CSV variants of the dataset
data_path = 'data/complex_processed_data.csv'
standardized_data_path = 'data/complex_processed_standardized_data.csv'
standardized_poutliers_removed_data_path = (
    'data/complex_processed_standardized_outliers_removed_data.csv'
)

# Only the standardized variant is read in this cell
df_solubility = pd.read_csv(filepath_or_buffer=standardized_data_path)

# Process Dataset

Prepare the dataset before creating the model.
The following actions were performed:
* Split the independent variables (features) from the dependent one (`solubility`);
* Split the dataset into training and test sets.

In [3]:
# Separate the feature matrix (X) from the target column (y)

# Features: every column except the target. drop() returns a new frame,
# so df_solubility itself is left untouched.
df_sol_X = df_solubility.drop(columns=['solubility'])

# Target: the double brackets keep it as a one-column DataFrame
df_sol_y = df_solubility[['solubility']]

In [4]:
# Preview the first rows of the feature matrix
df_sol_X.head()

Unnamed: 0,total_score,score,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,fa_rep,fa_sol,...,hbond_lr_bb,hbond_sc,hbond_sr_bb,lk_ball_wtd,omega,p_aa_pp,pro_close,rama_prepro,ref,time
0,0.743094,0.743094,0.524355,0.810367,0.194955,0.805052,0.204689,0.194424,0.150016,0.181959,...,0.899018,0.865276,0.745269,0.814943,0.064154,0.842755,0.029479,0.322475,0.208127,1.0
1,0.676975,0.676975,0.524355,0.686317,0.305593,0.720613,0.293997,0.271051,0.288532,0.295748,...,0.869617,0.799924,0.590403,0.680818,0.105766,0.764504,0.181751,0.415949,0.510299,1.0
2,0.834347,0.834347,0.524355,0.876691,0.142758,0.869296,0.133184,0.142547,0.125866,0.120619,...,0.855511,0.941145,0.857638,0.858072,0.030711,0.905852,0.017323,0.258,0.190017,0.0
3,0.577278,0.577278,0.442461,0.650845,0.340551,0.656721,0.348769,0.316203,0.326141,0.338899,...,0.709356,0.720251,0.652735,0.635345,0.063897,0.660731,0.099558,0.240302,0.307287,0.0
4,0.58374,0.58374,0.524355,0.693861,0.255558,0.675543,0.336511,0.257846,0.234749,0.303265,...,0.686046,0.681776,0.650759,0.6871,0.041349,0.791688,0.12916,0.355501,0.316987,0.0


In [5]:
# Yeo-Johnson transformation

# normalize the exponential data with Yeo-Johnson
#normalized_data = stats.yeojohnson(df_sol_y['solubility'])

# plot both together to compare
#fig, ax=plt.subplots(1,2)
#sns.histplot(df_sol_y['solubility'], ax=ax[0])
#ax[0].set_title("Original Data")
#sns.histplot(normalized_data[0], ax=ax[1])
#ax[1].set_title("Normalized data")

#df_sol_y = pd.DataFrame({'solubility' : normalized_data[0]})

In [6]:
# Preview the first rows of the target column
df_sol_y.head()

Unnamed: 0,solubility
0,0.572519
1,0.274809
2,0.618321
3,0.458015
4,0.648855


In [7]:
# Hold out 20% of the rows for the final test; the fixed seed keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

# Linear Regression (LR)

In [8]:
# Cross-validation scheme: 10 folds taken in row order.
# No shuffling is applied, so no random_state is needed.
cv = 10
folds = KFold(n_splits=cv, shuffle=False)

In [9]:
# Cross-validate a plain linear regression on the training split and
# report the per-fold R2 on the fitting folds ("train") and on the
# held-out fold ("test").
# The splitting scheme comes from `folds`; `cv` is not needed here.
# cross_validate works on clones of `lm`, so `lm` itself is not fitted
# by this cell.

lm = LinearRegression()

scores = cross_validate(
    estimator=lm,
    X=x_train,
    y=y_train,
    cv=folds,
    scoring=('r2', 'neg_mean_squared_error'),
    return_train_score=True,
)

print("Train R2 score: {}".format(scores['train_r2']))
print("Test R2 score: {}".format(scores['test_r2']))

Train R2 score: [0.24070762 0.2779434  0.24402082 0.22620188 0.21954593 0.25365369
 0.24037885 0.24810633 0.24074922 0.26551295]
Test R2 score: [ 0.15333512 -0.91699637  0.07873072  0.32811153  0.26146821  0.03283949
  0.13819632  0.05133009  0.03650396 -0.08689509]


In [10]:
# Evaluate on the held-out test split

# fit() returns the estimator itself, so fitting on the whole training
# split and predicting on the test split can be chained; `lm` stays fitted.
y_pred = lm.fit(x_train, y_train).predict(x_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.019538122610365583


In [11]:
# Adjusted R2 for every cross-validation fold:
#   Adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
# NOTE(review): n is the size of the whole training split, while each r2
# in scores['test_r2'] was scored on a single validation fold — confirm
# that this is the intended sample size.

n_observations = len(y_train)
n_independent_variables = x_train.shape[1]

# Degrees of freedom used by the correction factor
total_dof = n_observations - 1
residual_dof = n_observations - n_independent_variables - 1

for cross_val_r2 in scores['test_r2']:
    Adj_r2 = 1 - (1 - cross_val_r2) * total_dof / residual_dof
    print(Adj_r2)

0.09446107369655665
-1.0502974406930559
0.014668944367864922
0.2813908045105036
0.21011335123476638
-0.03441338986162035
0.0782695787670794
-0.014637019550696184
-0.030494111835906335
-0.16247388360902448


In [12]:
# Fit the same regression with statsmodels to obtain per-coefficient
# statistics; add_constant supplies the intercept column ('const').
X2 = sm.add_constant(x_train)
est = sm.OLS(endog=y_train, exog=X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             solubility   R-squared:                       0.238
Model:                            OLS   Adj. R-squared:                  0.185
Method:                 Least Squares   F-statistic:                     4.498
Date:                Mon, 22 Aug 2022   Prob (F-statistic):           1.30e-09
Time:                        12:05:30   Log-Likelihood:                 40.328
No. Observations:                 324   AIC:                            -36.66
Df Residuals:                     302   BIC:                             46.52
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                 806.5979    

In [13]:
# Show the p-value of every coefficient of the fitted OLS model
est2.pvalues

const                  0.071453
total_score            0.857136
score                  0.856246
dslf_fa13              0.043858
fa_atr                 0.072166
fa_dun                 0.071127
fa_elec                0.067428
fa_intra_rep           0.194573
fa_intra_sol_xover4    0.061857
fa_rep                 0.071408
fa_sol                 0.072424
hbond_bb_sc            0.081097
hbond_lr_bb            0.071547
hbond_sc               0.079167
hbond_sr_bb            0.075385
lk_ball_wtd            0.072390
omega                  0.069755
p_aa_pp                0.068131
pro_close              0.084804
rama_prepro            0.069902
ref                    0.070234
time                   0.895635
dtype: float64

In [14]:
# List the coefficients whose p-value is above 0.05.
# items() walks the Series once and hands back each label with its value
# as a plain Python float, so the printed tuples do not depend on how the
# installed NumPy formats its scalar repr.
for feature, p_value in est2.pvalues.items():
    if p_value > 0.05:
        print((feature, p_value))

('const', 0.07145334893504028)
('total_score', 0.8571355728006999)
('score', 0.8562459134467624)
('fa_atr', 0.07216558016321616)
('fa_dun', 0.07112732549291281)
('fa_elec', 0.06742812350342402)
('fa_intra_rep', 0.19457327333883412)
('fa_intra_sol_xover4', 0.061856682769149775)
('fa_rep', 0.07140767021901681)
('fa_sol', 0.07242412767518122)
('hbond_bb_sc', 0.08109705238323378)
('hbond_lr_bb', 0.071547421380971)
('hbond_sc', 0.07916686850908292)
('hbond_sr_bb', 0.07538514894740277)
('lk_ball_wtd', 0.0723899582662793)
('omega', 0.06975481133678908)
('p_aa_pp', 0.06813107910115832)
('pro_close', 0.08480437045463639)
('rama_prepro', 0.06990170590294244)
('ref', 0.0702336184793425)
('time', 0.8956346252120697)


# Saving trained model

In [15]:
# Persist the fitted estimator. The target folder is created first so the
# dump does not fail when 'models/' is missing.
filename = 'models/lr_model.joblib'
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lm, filename)

['models/lr_model.joblib']

# Conclusions

What was done:
* Split the dataset into train (80%) and test (20%) sets
* Used `cross_validate` with 10 folds and linear regression. Checked the R2, adjusted R2 and p-values (checked for columns with a p-value above 0.05)