In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('delaney-processed.csv')
data

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.770,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.300,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.060,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.00,-7.870,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.00,-1.330,c1ccsc1
...,...,...,...,...,...,...,...,...,...,...
1123,halothane,-2.608,1,197.381,0,0,0,0.00,-1.710,FC(F)(F)C(Cl)Br
1124,Oxamyl,-0.908,1,219.266,1,0,1,71.00,0.106,CNC(=O)ON=C(SC)C(=O)N(C)C
1125,Thiometon,-3.323,1,246.359,0,0,7,18.46,-3.091,CCSCCSP(=S)(OC)OC
1126,2-Methylbutane,-2.245,1,72.151,0,0,1,0.00,-3.180,CCC(C)C


In [102]:
# 清洗数据
del data['Compound ID']
del data['smiles']
data

Unnamed: 0,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre
0,-0.974,1,457.432,7,3,7,202.32,-0.770
1,-2.885,1,201.225,1,2,2,42.24,-3.300
2,-2.579,1,152.237,0,0,4,17.07,-2.060
3,-6.618,2,278.354,0,5,0,0.00,-7.870
4,-2.232,2,84.143,0,1,0,0.00,-1.330
...,...,...,...,...,...,...,...,...
1123,-2.608,1,197.381,0,0,0,0.00,-1.710
1124,-0.908,1,219.266,1,0,1,71.00,0.106
1125,-3.323,1,246.359,0,0,7,18.46,-3.091
1126,-2.245,1,72.151,0,0,1,0.00,-3.180


In [103]:
# 数据归一化：数据减最小值除以最大值减最小值
cols = data.columns
for col in cols:
    data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
data

Unnamed: 0,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre
0,0.808672,0.5,0.577050,0.636364,0.375,0.304348,0.753015,0.821700
1,0.631613,0.5,0.242098,0.090909,0.250,0.086957,0.157213,0.629742
2,0.659965,0.5,0.178053,0.000000,0.000,0.173913,0.063533,0.723824
3,0.285741,1.0,0.342932,0.000000,0.625,0.000000,0.000000,0.283005
4,0.692115,1.0,0.089031,0.000000,0.125,0.000000,0.000000,0.779211
...,...,...,...,...,...,...,...,...
1123,0.657278,0.5,0.237072,0.000000,0.000,0.000000,0.000000,0.750379
1124,0.814787,0.5,0.265684,0.090909,0.000,0.043478,0.264255,0.888164
1125,0.591031,0.5,0.301104,0.000000,0.000,0.304348,0.068706,0.645599
1126,0.690911,0.5,0.073353,0.000000,0.000,0.043478,0.000000,0.638847


In [105]:
from sklearn.model_selection import train_test_split

# linear regression
cols_num = len(cols)
w, b = np.random.randn(cols_num-1), np.random.randn(1)

# 数据分组
x_train, x_test, y_train, y_test = train_test_split(data[cols[:cols_num-1]], data[cols[cols_num-1]], test_size=0.2, random_state=1)

# 训练函数
def my_linear_regression(x, y, w, b, times=10000, learning_rate=0.001):
    for t in range(times):
        y_pre = x.dot(w) + b
        err = y - y_pre
        L2 = 0.5 * np.mean(err**2)
        
        g_b = -np.mean(err)
        b -= learning_rate * g_b
        
        g_w = -(x.T).dot(err)
        w -= learning_rate * g_w
        
        if t % (times / 10) == 0:
            print(L2)

# 训练
my_linear_regression(x_train, y_train, w, b, times=10000, learning_rate=0.0005)
# 检测
y_pre = x_test.dot(w) + b
y_matrix = np.zeros([2, len(y_pre)], dtype=np.float64)
y_matrix[0], y_matrix[1] = y_test, y_pre

abs_err = np.mean(y_test - y_pre)
corr_err = np.corrcoef(y_matrix)
print('abs error:  ' + str(abs_err))         # 越接近0越好
print('corr error: ' + str(corr_err[0][1])) # 越接近1越好

0.025230599328707227
0.003314282802832069
0.0031529137431239107
0.003140469673106869
0.003134906223968836
0.0031297575210381978
0.0031246581721522475
0.0031195864496015918
0.0031145408675746127
0.0031095211990939597
abs error:  -0.009620616567528373
corr error: 0.8779781098466226


In [106]:
# sklearn.linear_model.LinearRegression
from sklearn import linear_model

LR = linear_model.LinearRegression()
LR.fit(x_train, y_train)
y_pre_LR = LR.predict(x_test)

#检测
y_matrix = np.zeros([2, len(y_pre_LR)], dtype=np.float64)
y_matrix[0], y_matrix[1] = y_test, y_pre_LR

abs_err = np.mean(y_test - y_pre_LR)
corr_err = np.corrcoef(y_matrix)
print('abs error:  ' + str(abs_err))         # 越接近0越好
print('corr error: ' + str(corr_err[0][1])) # 越接近1越好

abs error:  -0.0009157362138782252
corr error: 0.9160711757096791


In [107]:
# sklearn.ensemble.RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor()
RFR.fit(x_train, y_train)
y_pre_RFR = RFR.predict(x_test)

#检测
y_matrix = np.zeros([2, len(y_pre_RFR)], dtype=np.float64)
y_matrix[0], y_matrix[1] = y_test, y_pre_RFR

abs_err = np.mean(y_test - y_pre_RFR)
corr_err = np.corrcoef(y_matrix)
print('abs error:  ' + str(abs_err))         # 越接近0越好
print('corr error: ' + str(corr_err[0][1])) # 越接近1越好

abs error:  -0.0028708381346504814
corr error: 0.9602840683625763
