### 数据解析

In [14]:
# Import required libraries
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np

In [15]:
# Read the training table from disk and preview its first five rows.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
train_csv = 'train_fenlie1.csv'
medical = pd.read_csv(train_csv, low_memory=False)
medical.head()

Unnamed: 0,ID,Molecule_max_phase,Molecular weight,RO5_violations,AlogP,Label,features_5,features_7,features_8,features_9,...,features_3159,features_3160,features_3161,features_3162,features_3163,features_3164,features_3165,features_3166,features_3167,features_3168
0,1003,0,0.206754,0.0,0.60804,3.190476,0,2,0,10,...,0.337172495,-0.388373413,0,176,448.539,2.3342,8,2,88.73,5
1,1819,0,0.130056,0.0,0.591848,9.740969,0,0,0,5,...,0.184109041,-0.381651516,0,118,310.361,2.0396,7,2,91.74,3
2,6090,0,0.162482,0.0,0.592965,10.545341,0,0,0,3,...,0.262929532,-0.337353934,0,130,368.784,2.0564,8,0,99.59,5
3,3916,0,0.112266,0.0,0.65215,3.206803,0,1,0,0,...,0.345461189,-0.507686517,0,104,278.311,3.11792,4,2,65.98,2
4,8480,0,0.161722,0.0,0.741485,10.950807,0,1,0,0,...,0.302594082,-0.467687072,0,136,367.412,4.7209,6,1,64.86,4


### 数据整理

In [16]:
# Inspect the dtype pandas inferred for every column.
# The recorded output lists features_3157..features_3160 as `object`, i.e. not
# parsed as plain numbers; they are cast to float later together with the rest
# of the feature matrix.
medical.dtypes

ID                      int64
Molecule_max_phase      int64
Molecular weight      float64
RO5_violations        float64
AlogP                 float64
Label                 float64
features_5              int64
features_7              int64
features_8              int64
features_9              int64
features_10             int64
features_11             int64
features_12             int64
features_3152         float64
features_3153         float64
features_3154         float64
features_3155         float64
features_3156         float64
features_3157          object
features_3158          object
features_3159          object
features_3160          object
features_3161           int64
features_3162           int64
features_3163         float64
features_3164         float64
features_3165           int64
features_3166           int64
features_3167         float64
features_3168           int64
dtype: object

In [17]:
# Report how many distinct values each low-cardinality column takes.
# The count is an integer, so it is printed with the `d` format code instead of
# a two-decimal float, and the stray backspace character is removed from the
# message.
for cat in ['Molecule_max_phase', 'RO5_violations']:
    # unique() keeps NaN as its own value, so missing entries count as a level.
    n_levels = medical[cat].unique().size
    print("Number of levels in category '{0}': {1:d}".format(cat, n_levels))


Number of levels in category 'Molecule_max_phase':  5.00 
Number of levels in category 'RO5_violations':  5.00 


In [18]:
# Replace every missing value in the table with the sentinel -999, then check
# column by column that no nulls remain.
# NOTE: this fills *all* columns, including the target column `Label`.
medical = medical.fillna(value=-999)
medical.isnull().any()

ID                    False
Molecule_max_phase    False
Molecular weight      False
RO5_violations        False
AlogP                 False
Label                 False
features_5            False
features_7            False
features_8            False
features_9            False
features_10           False
features_11           False
features_12           False
features_3152         False
features_3153         False
features_3154         False
features_3155         False
features_3156         False
features_3157         False
features_3158         False
features_3159         False
features_3160         False
features_3161         False
features_3162         False
features_3163         False
features_3164         False
features_3165         False
features_3166         False
features_3167         False
features_3168         False
dtype: bool

#### 去掉非特征列（ID 与目标列 Label）

In [19]:
# Build the feature table: remove the row identifier and the regression target,
# fill any remaining gaps with the -999 sentinel, and verify that no column
# still contains nulls.
non_feature_cols = ['ID', 'Label']
medical_new = medical.drop(columns=non_feature_cols).fillna(-999)
medical_new.isnull().any()

Molecule_max_phase    False
Molecular weight      False
RO5_violations        False
AlogP                 False
features_5            False
features_7            False
features_8            False
features_9            False
features_10           False
features_11           False
features_12           False
features_3152         False
features_3153         False
features_3154         False
features_3155         False
features_3156         False
features_3157         False
features_3158         False
features_3159         False
features_3160         False
features_3161         False
features_3162         False
features_3163         False
features_3164         False
features_3165         False
features_3166         False
features_3167         False
features_3168         False
dtype: bool

In [20]:
# Convert the feature table into a float NumPy array.
# NOTE: from here on `medical_new` is an ndarray, not a DataFrame, so this cell
# cannot be re-run without first re-running the cell that builds the DataFrame.
feature_values = medical_new.values
medical_new = np.array(feature_values, dtype=float)
medical_new.shape

(6924, 28)

In [21]:
# Confirm the element type of the converted feature array.
medical_new.dtype

dtype('float64')

In [22]:
# Overwrite, in place, any NaN left in the float array with the -999 sentinel.
nan_mask = np.isnan(medical_new)
medical_new[nan_mask] = -999

In [23]:
# Display the element-wise NaN mask of the feature array after the replacement.
# NOTE(review): the printed matrix is truncated, so it cannot prove that no NaN
# is left; `np.isnan(medical_new).any()` would give a single yes/no answer.
np.isnan(medical_new)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [24]:
# Re-check the (rows, columns) of the feature array; the in-place NaN
# replacement assigns into existing positions and does not resize it.
medical_new.shape

(6924, 28)

In [25]:
# Pull the regression target out of the table as a 1-D NumPy array.
# NOTE: `medical` was filled with -999 earlier, so a missing label would show
# up here as -999 rather than NaN.
label_series = medical['Label']
medical_label = label_series.values
medical_label

array([ 3.19047635,  9.74096862, 10.54534144, ..., 18.6113011 ,
        2.50143595,  0.04879016])

## 使用TPOT进行数据分析

In [27]:
# Split the first 1000 rows into 75% train / 25% test with a fixed seed.
# Only a slice of the full feature array is used — presumably to keep the TPOT
# search fast; TODO confirm.
subset = slice(None, 1000)
X_train, X_test, y_train, y_test = train_test_split(
    medical_new[subset],
    medical_label[subset],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.size, y_test.size

((750, 28), (250, 28), 750, 250)

In [28]:
# Run the TPOT pipeline search (5 generations, population of 50, fixed seed),
# print the score of the best pipeline on the held-out split, and export that
# pipeline as a standalone Python script.
# NOTE(review): the recorded scores are negative, which is consistent with a
# negated-error metric — confirm the default scoring of the installed TPOT.
tpot = TPOTRegressor(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)
tpot.export('tpot_medical_pipeline_fenlie1.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: -16.571675001758944
Generation 2 - Current best internal CV score: -16.164681434920954
Generation 3 - Current best internal CV score: -16.09442069515546
Generation 4 - Current best internal CV score: -16.09167945001971
Generation 5 - Current best internal CV score: -15.981117264229795

Best pipeline: ExtraTreesRegressor(input_matrix, bootstrap=True, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=100)
-13.32962215559516


# NOTE(review): this repeats the ID/Label column drop done earlier in the
# notebook and appears to have no recorded execution. Running it rebinds
# `medical_new` to a DataFrame, replacing the float array used for training.
# Looks like a leftover cell — confirm before keeping.
medical_new = medical.drop(['ID','Label'], axis=1)