In [675]:
# General libraries
import pandas as pd
import numpy as np

# Graphic libraries
#import matplotlib.pyplot as plt

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from scipy.stats import pearsonr
from sklearn import metrics

# Load Dataset

In [676]:
data_path = "data/complex_processed_data.csv"
standardized_data_path = 'data/complex_processed_standardized_data.csv'

df_solubility = pd.read_csv(standardized_data_path)

# Process Dataset

Process Dataset before the model creation.
The following actions were done:
* Split the independent variable from the dependent ones;
* Split Dataset for training and testing.

In [677]:
# Split dataset into X and Y for machine learning

df_sol_X = df_solubility.copy()
df_sol_X.drop(columns=['solubility'], axis=1, inplace=True)

df_sol_y = df_solubility[['solubility']]

In [678]:
df_sol_X.head()

Unnamed: 0,total_score,score,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,fa_rep,fa_sol,...,hbond_lr_bb,hbond_sc,hbond_sr_bb,lk_ball_wtd,omega,p_aa_pp,pro_close,rama_prepro,ref,time
0,0.743094,0.743094,0.524355,0.810367,0.194955,0.805052,0.204689,0.194424,0.150016,0.181959,...,0.899018,0.865276,0.745269,0.814943,0.064154,0.842755,0.029479,0.322475,0.208127,1.0
1,0.676975,0.676975,0.524355,0.686317,0.305593,0.720613,0.293997,0.271051,0.288532,0.295748,...,0.869617,0.799924,0.590403,0.680818,0.105766,0.764504,0.181751,0.415949,0.510299,1.0
2,0.834347,0.834347,0.524355,0.876691,0.142758,0.869296,0.133184,0.142547,0.125866,0.120619,...,0.855511,0.941145,0.857638,0.858072,0.030711,0.905852,0.017323,0.258,0.190017,0.0
3,0.577278,0.577278,0.442461,0.650845,0.340551,0.656721,0.348769,0.316203,0.326141,0.338899,...,0.709356,0.720251,0.652735,0.635345,0.063897,0.660731,0.099558,0.240302,0.307287,0.0
4,0.58374,0.58374,0.524355,0.693861,0.255558,0.675543,0.336511,0.257846,0.234749,0.303265,...,0.686046,0.681776,0.650759,0.6871,0.041349,0.791688,0.12916,0.355501,0.316987,0.0


In [679]:
df_sol_y.head()

Unnamed: 0,solubility
0,0.572519
1,0.274809
2,0.618321
3,0.458015
4,0.648855


In [680]:
x_train, x_test, y_train, y_test = train_test_split(
                        df_sol_X, df_sol_y, 
                        train_size = 0.8,
                        test_size = 0.2,
                        random_state = 10
                        )

# Machine Learning Modeling

## Linear Regression (LR)

In [699]:
# Create the Linear Regression model
# Use cross validation
# Check the R2 score for train and test


lm = LinearRegression()
cv = 5

scores = cross_validate(estimator=lm, X=x_train, y=y_train, cv=cv,
    scoring=('r2', 'neg_mean_squared_error'),
    return_train_score=True)

#print(scores.keys())
print("Train R2 score: {}".format(scores['train_r2']))
print("Test R2 score: {}".format(scores['test_r2']))

Train R2 score: [0.28046008 0.2243075  0.23393399 0.2614297  0.27306358]
Test R2 score: [-0.15607619  0.21526166  0.1788878  -0.00400742 -0.04637872]


In [700]:
# Test the model

lm.fit(x_train, y_train)
y_pred = lm.predict(x_test)
r2 = metrics.r2_score(y_test, y_pred)
print(r2)

0.019538122610365583


In [683]:
# Check the Adjusted R2

n_observations = len(y_train)
n_independent_variables = x_train.shape[1]

# Adj_r2 = 1 - (1 - r2) * (n_observations - 1) / (n_observations - n_independent_variables - 1)

for cross_val_r2 in scores['test_r2']:
    Adj_r2 = 1 - (1 - cross_val_r2) * (n_observations - 1) / (n_observations - n_independent_variables - 1)
    print(Adj_r2)

-0.23646559686856494
0.1606937582045065
0.121790598501067
-0.07382250485921205
-0.11914015871283445


### Pearson Correlation and P Value


* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html

In [684]:
# Get the P-Value and Pearson Correlation

p_value_list = []
pearson_cor_list = []

for column in x_train.columns:
    p_value_list.append((
        column,
        pearsonr(x_train[column], y_train['solubility'])[1]
    ))

    pearson_cor_list.append((
        column,
        pearsonr(x_train[column], y_train['solubility'])[0]
    ))

# P_value_list

# é pra ser acima de 0.5 ou 0.05?
for p_value in p_value_list:
    if p_value[1] > 0.05:
        print((p_value[0], True))
    else:
        print((p_value[0], False))

('total_score', False)
('score', False)
('dslf_fa13', True)
('fa_atr', False)
('fa_dun', False)
('fa_elec', False)
('fa_intra_rep', False)
('fa_intra_sol_xover4', False)
('fa_rep', False)
('fa_sol', False)
('hbond_bb_sc', False)
('hbond_lr_bb', False)
('hbond_sc', False)
('hbond_sr_bb', False)
('lk_ball_wtd', False)
('omega', False)
('p_aa_pp', False)
('pro_close', False)
('rama_prepro', True)
('ref', False)
('time', True)


In [685]:
print(pearson_cor_list)

[('total_score', 0.3728542151106772), ('score', 0.37285421755565656), ('dslf_fa13', -0.021612082615918415), ('fa_atr', 0.3747633387978939), ('fa_dun', -0.3316716816947804), ('fa_elec', 0.35939734818916225), ('fa_intra_rep', -0.33879880876089635), ('fa_intra_sol_xover4', -0.3723242091406435), ('fa_rep', -0.3008201264822252), ('fa_sol', -0.3692202027268628), ('hbond_bb_sc', 0.3843437641453255), ('hbond_lr_bb', 0.18118716381572553), ('hbond_sc', 0.3660864985706804), ('hbond_sr_bb', 0.3686373398087358), ('lk_ball_wtd', 0.37945815340623745), ('omega', -0.2218947840483731), ('p_aa_pp', 0.3295659505177393), ('pro_close', -0.11133538376392353), ('rama_prepro', -0.10075373697664987), ('ref', -0.31363544858811976), ('time', -0.024636387977706065)]
