In [38]:
import numpy as np
import pandas as pd
#
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
#
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#
from sklearn.linear_model import LogisticRegression
#
from sklearn.model_selection import train_test_split
#
import joblib
#
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

# 应用场景1. 整洁代码

In [8]:
# 1. 数据导入
url="https://gitee.com/rysben/public/raw/master/datasets/HR_Analytics/aug_train.csv"
df = pd.read_csv(url)

In [10]:
# 2. 数据编码

## 序数特征字典
relevent_experience_map = {'Has relevent experience':1, 'No relevent experience':0}

experience_map = {'<1':0, '>20':21}
for i in range(20):
    v = i + 1
    experience_map[str(v)] = v

last_new_job_map = {'never':0, '>4':5}
for i in range(4):
    v = i + 1
    last_new_job_map[str(v)] = v


## 类别特征转换为数字特征
def encode(df_pre):
    df_pre.loc[:,'relevent_experience'] = df_pre['relevent_experience'].map(relevent_experience_map)
    df_pre.loc[:,'last_new_job'] = df_pre['last_new_job'].map(last_new_job_map)
    df_pre.loc[:,'experience'] = df_pre['experience'].map(experience_map)
    return df_pre

df = encode(df)

In [11]:
# 3. 定义使用不同转换方式的列集合
num_cols = ['city_development_index','relevent_experience', 'experience','last_new_job', 'training_hours']
cat_cols = ['gender', 'enrolled_university', 'education_level', 'major_discipline', 'company_size', 'company_type']

# 4. 分别创建特征转换pipeline
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())
])
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# 5. 创建ColumnTransformer，将pipeline分别应用到列集合
col_trans = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
    ],
    remainder='drop',    #忽略数据框中的其它列
    n_jobs=-1)

# 6. 向pipeline添加模型
clf = LogisticRegression(random_state=0)
clf_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('model', clf)
])

In [12]:
# 7. 展示pipeline
from sklearn import set_config

set_config(display='diagram')
display(clf_pipeline)

In [15]:
# 8. 划分数据
X = df[num_cols+cat_cols]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [16]:
# 9. 向pipeline传数据
clf_pipeline.fit(X_train, y_train)
score = clf_pipeline.score(X_test, y_test)
print(f"Model score: {score}") # model accuracy

Model score: 0.761482254697286


In [18]:
# 10. 保存模型
#joblib.dump(clf_pipeline,"pipe.joblib")
#same_pipe = joblib.load("pipe.joblib")

# 应用场景2. 超参优化

In [None]:
How to find the best hyperparameter sets: Add a pipeline to Grid Search
How to find the best data preparation method: Skip a step in a pipeline
How to Find the best hyperparameter sets and the best data preparation method

In [19]:
# 1. 获取流程参数
clf_pipeline.get_params()
    # step1__step2__…__parameter.

#clf_pipeline.set_params(model_C = 10)    #直接改变参数

{'memory': None,
 'steps': [('col_trans', ColumnTransformer(n_jobs=-1,
                     transformers=[('num_pipeline',
                                    Pipeline(steps=[('impute', SimpleImputer()),
                                                    ('scale', MinMaxScaler())]),
                                    ['city_development_index',
                                     'relevent_experience', 'experience',
                                     'last_new_job', 'training_hours']),
                                   ('cat_pipeline',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('one-hot',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    ['gender', 'enroll

In [24]:
# 2. 优化参数集
## 参数空间示例
grid_params = {'model__penalty' : ['none', 'l2'],
               'model__C' : np.logspace(-4, 4, 10)}

## 将pipeline添加到网格搜索，pipeline最后一步是model，可以直接输入GridSearchCV()
gs = GridSearchCV(clf_pipeline, grid_params, cv=5, scoring='accuracy')
gs.fit(X_train, y_train)

print("Best Score of train set: "+str(gs.best_score_))
print("Best parameter set: "+str(gs.best_params_))
print("Test Score: "+str(gs.score(X_test,y_test)))





Best Score of train set: 0.7657576386383733
Best parameter set: {'model__C': 0.046415888336127774, 'model__penalty': 'l2'}
Test Score: 0.7609603340292276


# 应用场景3. 选择合适的预处理方法

In [26]:
#  调整pipeline：向num_pipeline中添加StandardScaler()
num_pipeline2 = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('minmax_scale', MinMaxScaler()),
    ('std_scale', StandardScaler()),
])

col_trans2 = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline2,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
    ],
    remainder='drop',
    n_jobs=-1)
    
clf_pipeline2 = Pipeline(steps=[
    ('col_trans', col_trans2),
    ('model', clf)
])

In [27]:
# 网格搜索
##如果使用字典列表，网格搜索将执行case1中每个参数的组合；然后执行case中每个参数的组合。
grid_step_params = [{'col_trans__num_pipeline__minmax_scale': ['passthrough']},
                    {'col_trans__num_pipeline__std_scale': ['passthrough']}]

##
gs2 = GridSearchCV(clf_pipeline2, grid_step_params, scoring='accuracy')
gs2.fit(X_train, y_train)

print("Best Score of train set: "+str(gs2.best_score_))
print("Best parameter set: "+str(gs2.best_params_))
print("Test Score: "+str(gs2.score(X_test,y_test)))

Best Score of train set: 0.7653009325028811
Best parameter set: {'col_trans__num_pipeline__std_scale': 'passthrough'}
Test Score: 0.761482254697286


# 应用场景4. 2+3

In [28]:
##添加parameters到数据预处理case字典，合并字典语法：merge_dict = {**dict_1,**dict_2}
grid_params = {'model__penalty' : ['none', 'l2'],
               'model__C' : np.logspace(-4, 4, 5)}
               
grid_step_params2 = [{**{'col_trans__num_pipeline__minmax_scale': ['passthrough']}, **grid_params},
                    {**{'col_trans__num_pipeline__std_scale': ['passthrough']}, **grid_params}]

##
gs3 = GridSearchCV(clf_pipeline2, grid_step_params2, scoring='accuracy')
gs3.fit(X_train, y_train)

print("Best Score of train set: "+str(gs3.best_score_))
print("Best parameter set: "+str(gs3.best_params_))
print("Test Score: "+str(gs3.score(X_test,y_test)))



Best Score of train set: 0.7681068052598142
Best parameter set: {'col_trans__num_pipeline__minmax_scale': 'passthrough', 'model__C': 0.01, 'model__penalty': 'l2'}
Test Score: 0.7622651356993737


In [30]:
pd.DataFrame(gs3.cv_results_).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_col_trans__num_pipeline__minmax_scale,param_model__C,param_model__penalty,param_col_trans__num_pipeline__std_scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.617204,0.562099,0.408072,0.481478,passthrough,0.0001,none,,{'col_trans__num_pipeline__minmax_scale': 'pas...,0.763536,0.768352,0.766069,0.755954,0.772268,0.765236,0.005457,4
1,0.074948,0.006196,0.016479,0.001814,passthrough,0.0001,l2,,{'col_trans__num_pipeline__minmax_scale': 'pas...,0.750489,0.750734,0.750734,0.750734,0.750408,0.75062,0.000142,19
2,0.138971,0.016333,0.015243,0.001666,passthrough,0.01,none,,{'col_trans__num_pipeline__minmax_scale': 'pas...,0.763536,0.768352,0.766069,0.755954,0.772268,0.765236,0.005457,4
3,0.10578,0.006412,0.016448,0.001223,passthrough,0.01,l2,,{'col_trans__num_pipeline__minmax_scale': 'pas...,0.763209,0.771941,0.772594,0.759869,0.77292,0.768107,0.005474,1
4,0.150457,0.011433,0.015254,0.001849,passthrough,1.0,none,,{'col_trans__num_pipeline__minmax_scale': 'pas...,0.763536,0.768352,0.766069,0.755954,0.772268,0.765236,0.005457,4


# 应用场景5. 自定义数据转换

In [41]:
# 自定义数据转换方法类
from sklearn.base import TransformerMixin

class Encode(TransformerMixin):
    
    def __init__(self):
        # Making Dictionaries of ordinal features
        self.rel_exp_map = {
            'Has relevent experience': 1,
            'No relevent experience': 0}
            
    def fit(self, df, y = None):
        return self
        
    def transform(self, df, y = None):
        df_pre = df.copy()
        df_pre.loc[:,'relevent_experience'] = df_pre['relevent_experience']\
                               .map(self.rel_exp_map)
        return df_pre

In [42]:
# 将类添加到pipeline
pipeline = Pipeline(steps=[
    ('Encode', Encode()),
    ('col_trans', col_trans),
    ('model', LogisticRegression())
])

# fit, transform, or grid search ...

# 应用场景6. 模型选择

In [34]:
# 1. 创建一个以模型作为输入的类
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = LogisticRegression()):
        self.estimator = estimator
        
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self
        
    def predict(self, X, y=None):
        return self.estimator.predict(X)
        
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
        
    def score(self, X, y):
        return self.estimator.score(X, y)

In [43]:
# 2. 将类加入pipeline
clf_pipeline = Pipeline(steps=[
    ('Encode', Encode()),
    ('col_trans', col_trans),
    ('model', ClfSwitcher())
])

In [44]:
# 3. 网格搜索
from sklearn.model_selection import GridSearchCV

grid_params = [
    {'model__estimator': [LogisticRegression()]},
    {'model__estimator': [SVC(gamma='auto')]}
]

gs = GridSearchCV(clf_pipeline, grid_params, scoring='accuracy')
gs.fit(X_train, y_train)

print("Best Score of train set: "+str(gs.best_score_))
print("Best parameter set: "+str(gs.best_params_))
print("Test Score: "+str(gs.score(X_test,y_test)))

Best Score of train set: 0.765105386765759
Best parameter set: {'model__estimator': LogisticRegression()}
Test Score: 0.761482254697286
