In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/data.csv")
data.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [3]:
data.columns

Index(['id', 'author', 'geometry', 'pressure [MPa]', 'mass_flux [kg/m2-s]',
       'x_e_out [-]', 'D_e [mm]', 'D_h [mm]', 'length [mm]',
       'chf_exp [MW/m2]'],
      dtype='object')

In [4]:
data['x_e_out [-]'].isnull()

0        False
1        False
2        False
3        False
4         True
         ...  
31639    False
31640     True
31641    False
31642     True
31643    False
Name: x_e_out [-], Length: 31644, dtype: bool

### 根据x_e_out [-]列是否为空值进行数据train/test划分

In [5]:
test = data.loc[data['x_e_out [-]'].isnull(),:]
train = data.loc[data['x_e_out [-]'].isnull() == False,:]

In [30]:
test.to_csv('test.csv')
train.to_csv('train.csv')

In [6]:
train.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
5,5,,,17.24,3648.0,-0.0711,,1.9,696.0,3.6


In [7]:
test.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8
7,7,Peskov,tube,18.0,750.0,,10.0,10.0,1650.0,2.2
10,10,Thompson,tube,,,,1.9,1.9,152.0,3.2
12,12,Thompson,,6.89,7500.0,,,12.8,1930.0,4.8
23,23,Beus,annulus,15.51,1355.0,,5.6,15.2,2134.0,2.1


In [8]:
#处理train_data
train.shape

(21229, 10)

In [9]:
#定义X,y
y = train['x_e_out [-]']
X = train.drop(columns=['id','x_e_out [-]'])

In [11]:
X.head()

Unnamed: 0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,Thompson,tube,7.0,3770.0,,10.8,432.0,3.6
1,Thompson,tube,,6049.0,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,7.7,7.7,457.0,2.5
3,Beus,annulus,13.79,3679.0,5.6,15.2,2134.0,3.0
5,,,17.24,3648.0,,1.9,696.0,3.6


In [12]:
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

In [13]:
# set up preprocessing for numeric columns
imp_knn = KNNImputer(n_neighbors=3)
scaler = StandardScaler()

In [14]:
# set up preprocessing for categorical columns
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(handle_unknown='ignore')

In [15]:
from sklearn.compose import make_column_selector, make_column_transformer

In [16]:
# select columns by data type
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

In [17]:
# do all preprocessing
preprocessor = make_column_transformer(
    (make_pipeline(imp_knn, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols))

- 定义模型

In [18]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor() #0.372568

- 定义pipe

In [19]:
pipe = make_pipeline(preprocessor, rfr)

In [20]:
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('pipeline-1',
                                   Pipeline(steps=[('knnimputer',
                                                    KNNImputer(n_neighbors=3)),
                                                   ('standardscaler',
                                                    StandardScaler())]),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000001F56AB514F0>),
                                  ('pipeline-2',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer(strategy='constant')),
                                                   ('onehotencoder',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000001F567F17FD0>)])),
 ('randomforest

In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X, y).mean()

0.37256796827674665

In [22]:
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('knnimputer',
                                                                   KNNImputer(n_neighbors=3)),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001F56AB514F0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHot

In [23]:
#处理test值
X_test = test.drop(columns=['id','x_e_out [-]'])

In [25]:
result = pipe.predict(X_test)

In [26]:
result

array([ 0.01832564, -0.07108461, -0.014575  , ...,  0.044827  ,
        0.00868467,  0.004043  ])

In [27]:
pd.DataFrame({'id':test.id, 'x_e_out [-]':result}).set_index('id').to_csv('sub_baseline_rf.csv')

- 0.083932