In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
dataset = './dataset/qsar_fish_toxicity.csv'
data = pd.read_csv(dataset)

In [3]:
data.head()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,LC50 [-LOG(mol/L)]
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


In [4]:
column_names = list(data.columns)
column_names

['CIC0',
 'SM1_Dz(Z)',
 'GATS1i',
 'NdsCH',
 'NdssC',
 'MLOGP',
 'LC50 [-LOG(mol/L)]']

In [5]:
data.tail()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,LC50 [-LOG(mol/L)]
903,2.801,0.728,2.226,0,2,0.736,3.109
904,3.652,0.872,0.867,2,3,3.983,4.04
905,3.763,0.916,0.878,0,6,2.918,4.818
906,2.831,1.393,1.077,0,1,0.906,5.317
907,4.057,1.032,1.183,1,3,4.754,8.201


In [6]:
data.isnull().sum()

CIC0                  0
SM1_Dz(Z)             0
GATS1i                0
NdsCH                 0
NdssC                 0
MLOGP                 0
LC50 [-LOG(mol/L)]    0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,LC50 [-LOG(mol/L)]
count,908.0,908.0,908.0,908.0,908.0,908.0,908.0
mean,2.898129,0.628468,1.293591,0.229075,0.485683,2.109285,4.064431
std,0.756088,0.428459,0.394303,0.605335,0.861279,1.433181,1.455698
min,0.667,0.0,0.396,0.0,0.0,-2.884,0.053
25%,2.347,0.223,0.95075,0.0,0.0,1.209,3.15175
50%,2.934,0.57,1.2405,0.0,0.0,2.127,3.9875
75%,3.407,0.89275,1.56225,0.0,1.0,3.105,4.9075
max,5.926,2.171,2.92,4.0,6.0,6.515,9.612


In [8]:
data.std()

CIC0                  0.756088
SM1_Dz(Z)             0.428459
GATS1i                0.394303
NdsCH                 0.605335
NdssC                 0.861279
MLOGP                 1.433181
LC50 [-LOG(mol/L)]    1.455698
dtype: float64

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data=scaler.fit_transform(data)
print(scaled_data)

[[ 0.47887329  0.46828848  0.97036948 ...  0.59748428 -0.45817453
  -0.20237219]
 [-0.93840825 -0.11318414 -1.09263435 ... -0.56421963 -0.53147852
  -0.652576  ]
 [-1.02310108  0.02225928 -1.173835   ... -0.56421963 -0.53147852
  -0.36664503]
 ...
 [ 1.14450599  0.67145361 -1.05457155 ...  6.40600388  0.56459064
   0.5179539 ]
 [-0.08883332  1.78535897 -0.54960505 ...  0.59748428 -0.8400534
   0.86093359]
 [ 1.53356366  0.94234045 -0.28062792 ...  2.92089212  1.84636323
   2.84320501]]


In [10]:
df = pd.DataFrame(scaled_data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.478873,0.468288,0.970369,-0.378635,0.597484,-0.458175,-0.202372
1,-0.938408,-0.113184,-1.092634,-0.378635,-0.56422,-0.531479,-0.652576
2,-1.023101,0.022259,-1.173835,-0.378635,-0.56422,-0.531479,-0.366645
3,0.170538,-0.694657,0.452715,1.274253,-0.56422,-0.211035,-0.381079
4,-1.064124,0.463618,-1.100247,-0.378635,-0.56422,-0.155883,0.911109


In [11]:
df.columns = column_names
df

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,LC50 [-LOG(mol/L)]
0,0.478873,0.468288,0.970369,-0.378635,0.597484,-0.458175,-0.202372
1,-0.938408,-0.113184,-1.092634,-0.378635,-0.564220,-0.531479,-0.652576
2,-1.023101,0.022259,-1.173835,-0.378635,-0.564220,-0.531479,-0.366645
3,0.170538,-0.694657,0.452715,1.274253,-0.564220,-0.211035,-0.381079
4,-1.064124,0.463618,-1.100247,-0.378635,-0.564220,-0.155883,0.911109
...,...,...,...,...,...,...,...
903,-0.128533,0.232430,2.366006,-0.378635,1.759188,-0.958736,-0.656700
904,0.997617,0.568703,-1.082484,2.927141,2.920892,1.308103,-0.016792
905,1.144506,0.671454,-1.054572,-0.378635,6.406004,0.564591,0.517954
906,-0.088833,1.785359,-0.549605,-0.378635,0.597484,-0.840053,0.860934


In [12]:
X = df.drop(['LC50 [-LOG(mol/L)]'], axis=1)

In [13]:
X.head()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP
0,0.478873,0.468288,0.970369,-0.378635,0.597484,-0.458175
1,-0.938408,-0.113184,-1.092634,-0.378635,-0.56422,-0.531479
2,-1.023101,0.022259,-1.173835,-0.378635,-0.56422,-0.531479
3,0.170538,-0.694657,0.452715,1.274253,-0.56422,-0.211035
4,-1.064124,0.463618,-1.100247,-0.378635,-0.56422,-0.155883


In [14]:
y = df['LC50 [-LOG(mol/L)]']
y.head()

0   -0.202372
1   -0.652576
2   -0.366645
3   -0.381079
4    0.911109
Name: LC50 [-LOG(mol/L)], dtype: float64

In [15]:
model = LinearRegression(n_jobs=-1) 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [16]:
params = {
    'fit_intercept' : [True, False],
    'positive' : [True, False]
}

In [17]:
model.fit(X_train, y_train)

LinearRegression(n_jobs=-1)

In [18]:
y_pred = model.predict(X_test)

In [19]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.4457532114727547

In [20]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model,param_grid=params,scoring='r2')

In [21]:
model=grid.fit(X_train,y_train)
model.best_params_

{'fit_intercept': False, 'positive': False}

In [22]:
model.best_score_

0.5870883168345037