In [19]:
# Import module
# Basic modules
import traceback
import numpy as np
import pandas as pd

# Data operation modules
from sklearn.grid_search import GridSearchCV    #Performing grid search

# Regressors modules
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars


In [20]:
# Initializations

# Column-name lists for the CSV files, one list per group of related columns.
# Each list holds the exact (Chinese) column headers; the trailing comments
# give an English gloss only.
lst_id = [u'id']
lst_age = [u'年龄']  # age
lst_gender = [u'性别']  # gender
lst_date = [u'体检日期']  # physical examination date
lst_enzyme = [u'*天门冬氨酸氨基转换酶', u'*丙氨酸氨基转换酶', u'*碱性磷酸酶', u'*r-谷氨酰基转换酶']
lst_protein = [u'*总蛋白', u'白蛋白', u'*球蛋白', u'白球比例']
lst_fat = [u'甘油三酯', u'总胆固醇', u'高密度脂蛋白胆固醇', u'低密度脂蛋白胆固醇']
lst_kidney = [u'尿素', u'肌酐', u'尿酸']
lst_liver = [u'乙肝表面抗原', u'乙肝表面抗体', u'乙肝e抗原', u'乙肝e抗体', u'乙肝核心抗体']
lst_redcell = [u'红细胞计数', u'血红蛋白', u'红细胞压积', u'红细胞平均体积', u'红细胞平均血红蛋白量', u'红细胞平均血红蛋白浓度', u'红细胞体积分布宽度']  
lst_whitecell = [u'白细胞计数', u'中性粒细胞%', u'淋巴细胞%', u'单核细胞%', u'嗜酸细胞%', u'嗜碱细胞%']
lst_bloodcell = [u'血小板计数', u'血小板平均体积', u'血小板体积分布宽度', u'血小板比积']
lst_target = [u'血糖']  # blood glucose (the prediction target)

# Hyper-parameter grids keyed by estimator class name.
# get_pars() returns the grid stored here for a given name, and
# train_base_learner() hands that grid to GridSearchCV.
# NOTE(review): LogisticRegression is a classifier, unlike the other entries —
# confirm it is really meant to be fitted against the continuous target.
dict_models = {
        'LinearSVR': {'C':[0.1,1,10,100]},
        'NuSVR': {'C':[0.1,1,10,100], 'kernel': ['rbf']},
        'LinearRegression': {'fit_intercept': [True, False]},
        'LogisticRegression': {'penalty': ['l1', 'l2']},
        'ElasticNet': {'alpha':[0.25, 0.5, 0.75], 'l1_ratio':[0.25, 0.5, 0.75]},
        'LassoLars': {'alpha':[0.25, 0.5, 0.75]}
    }

# Look up the hyper-parameter grid registered for an estimator name
def get_pars(clf_name):
    """Return the parameter grid stored in ``dict_models`` for ``clf_name``.

    Parameters
    ----------
    clf_name : str
        Key into ``dict_models`` (an estimator class name).

    Returns
    -------
    dict
        The grid (parameter name -> list of candidate values) stored
        under ``clf_name``.

    Raises
    ------
    KeyError
        If ``clf_name`` has no entry in ``dict_models``.
    """
    par_grid = dict_models[clf_name]
    return par_grid

# Number of cross-validation folds; passed to GridSearchCV as `cv`
k_fcv = 5

# Define paths (relative to the notebook's working directory)
load_train_path = '../datasets/d_train_20180102.csv'
load_test_path = '../datasets/d_test_A_20180102.csv'
# NOTE(review): save_path is not referenced in the cells visible here —
# presumably used when the predictions are written out; confirm.
save_path = '../datasets/d_predict_20180110.csv'


In [21]:
# Define useful functions

def train_base_learner(clf_name, k_fcv, train_fea, train_target):
    """Grid-search one estimator with cross-validation and fit it.

    Parameters
    ----------
    clf_name : str
        Name of an estimator class available in the notebook namespace;
        it must also be a key of ``dict_models`` (looked up via ``get_pars``).
    k_fcv : int
        Value passed to ``GridSearchCV`` as ``cv``.
    train_fea : array-like / DataFrame
        Training features handed to ``fit``.
    train_target : array-like / DataFrame
        Training target handed to ``fit``.

    Returns
    -------
    tuple
        ``(clf, clf.best_params_)`` where ``clf`` is the fitted
        ``GridSearchCV`` object.

    Raises
    ------
    NameError
        If ``clf_name`` is not defined in the notebook namespace.
    KeyError
        If ``clf_name`` has no grid in ``dict_models``.
    """
    # Resolve the class by plain name lookup instead of eval(): the result is
    # the same for every valid estimator name, but an arbitrary string can no
    # longer be executed as code.
    try:
        estimator_cls = globals()[clf_name]
    except KeyError:
        # Keep the exception type eval() raised for unknown names.
        raise NameError("name %r is not defined" % (clf_name,))

    # NOTE(review): GridSearchCV is imported from sklearn.grid_search at the
    # top of the notebook; newer scikit-learn releases provide it from
    # sklearn.model_selection instead — confirm the installed version.
    clf = GridSearchCV(estimator_cls(), get_pars(clf_name), cv=k_fcv)
    clf.fit(train_fea, train_target)
    return clf, clf.best_params_

def predict_base_learner(clf, test_fea):
    """Return the predictions of a fitted learner for ``test_fea``.

    Parameters
    ----------
    clf : object
        Any object exposing ``predict`` (here: the fitted search object
        returned by ``train_base_learner``).
    test_fea : array-like / DataFrame
        Features to predict on.

    Returns
    -------
    Whatever ``clf.predict(test_fea)`` returns, unchanged.
    """
    return clf.predict(test_fea)

def _min_max_scale(col):
    """Scale one column to [0, 1]; a constant column becomes all zeros."""
    lo = col.min()
    span = col.max() - lo
    if span == 0:
        # Avoid 0/0 -> NaN for constant columns (would poison model fitting).
        return (col - lo).astype(float)
    return (col - lo) / span


def cleansing_all(data, lst_one, lst_two, lst_gender):
    """Clean a raw data frame and min-max scale every remaining column.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Steps, in order:
      1. drop the examination-date column (u'体检日期');
      2. fill missing values in ``lst_one`` columns with the column mean;
      3. fill missing values in ``lst_two`` columns with 0;
      4. encode ``lst_gender`` columns: u'女' -> 0, u'男' -> 1, u'??' -> 0;
      5. drop every row that still contains a missing value;
      6. min-max scale each column to [0, 1] (optional normalization step).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain the u'体检日期' column.
    lst_one : list
        Column names to mean-fill.
    lst_two : list
        Column names to zero-fill.
    lst_gender : list
        Column name(s) holding the gender labels.

    Returns
    -------
    pandas.DataFrame
        New, cleaned and scaled frame.

    Raises
    ------
    KeyError
        If u'体检日期' or any listed column is missing from ``data``.

    Notes
    -----
    Step 5 can remove rows, and step 6 uses the min/max of the frame it is
    given, so two frames cleaned separately are scaled independently.
    """
    cleaned = data.copy()  # never mutate the caller's frame
    del cleaned[u'体检日期']

    for key in lst_one:
        cleaned[key] = cleaned[key].fillna(cleaned[key].mean())
    for key in lst_two:
        cleaned[key] = cleaned[key].fillna(0)

    cleaned[lst_gender] = cleaned[lst_gender].replace([u'女', u'男', u'??'], [0, 1, 0])
    cleaned = cleaned.dropna()

    # Normalization (optional)
    return cleaned.apply(_min_max_scale)

In [22]:
# Load data (both CSV files are read with the gb2312 encoding)
data_train = pd.read_csv(load_train_path, sep=',', encoding="gb2312")
data_test = pd.read_csv(load_test_path, sep=',', encoding="gb2312")

# Set features and methods
# Configure the feature groups and the estimators here
lst_fea = [lst_enzyme, lst_protein, lst_fat, lst_kidney, lst_liver, lst_redcell, lst_whitecell, lst_bloodcell, lst_age, lst_gender]
lst_clf = ['LinearRegression', 'ElasticNet']

# Table that stores the predictions: one column per (feature group, estimator) pair
data_pre = pd.DataFrame()

# Cleansing the data
# lst_one columns are mean-filled and lst_two columns are zero-filled by cleansing_all
lst_one = lst_enzyme + lst_protein + lst_fat + lst_kidney + lst_redcell + lst_whitecell + lst_bloodcell
lst_two = lst_liver
# NOTE(review): cleansing_all min-max scales every column of the frame it is
# given, so the training target is scaled too and the predictions below are
# on that scaled range; train and test are also scaled independently — confirm
# this is intended.
new_train = cleansing_all(data_train, lst_one, lst_two, lst_gender)
new_test = cleansing_all(data_test, lst_one, lst_two, lst_gender)

# Learn and predict, fitting one model per feature group and estimator
for i_fea in lst_fea:
    for j_clf in lst_clf:
        now_clf, now_par = train_base_learner(j_clf, k_fcv, new_train[i_fea], new_train[lst_target])
        now_pre = predict_base_learner(now_clf, new_test[i_fea])
        # Column label: the group's column names joined by '|', then '_<estimator name>'
        now_name = '|'.join(i_fea) + '_' + j_clf
        # Flatten the predictions to 1-D before storing them as a column
        data_pre[now_name] = list(now_pre.reshape(now_pre.shape[0]))

data_pre

Unnamed: 0,*天门冬氨酸氨基转换酶|*丙氨酸氨基转换酶|*碱性磷酸酶|*r-谷氨酰基转换酶_LinearRegression,*天门冬氨酸氨基转换酶|*丙氨酸氨基转换酶|*碱性磷酸酶|*r-谷氨酰基转换酶_ElasticNet,*总蛋白|白蛋白|*球蛋白|白球比例_LinearRegression,*总蛋白|白蛋白|*球蛋白|白球比例_ElasticNet,甘油三酯|总胆固醇|高密度脂蛋白胆固醇|低密度脂蛋白胆固醇_LinearRegression,甘油三酯|总胆固醇|高密度脂蛋白胆固醇|低密度脂蛋白胆固醇_ElasticNet,尿素|肌酐|尿酸_LinearRegression,尿素|肌酐|尿酸_ElasticNet,乙肝表面抗原|乙肝表面抗体|乙肝e抗原|乙肝e抗体|乙肝核心抗体_LinearRegression,乙肝表面抗原|乙肝表面抗体|乙肝e抗原|乙肝e抗体|乙肝核心抗体_ElasticNet,红细胞计数|血红蛋白|红细胞压积|红细胞平均体积|红细胞平均血红蛋白量|红细胞平均血红蛋白浓度|红细胞体积分布宽度_LinearRegression,红细胞计数|血红蛋白|红细胞压积|红细胞平均体积|红细胞平均血红蛋白量|红细胞平均血红蛋白浓度|红细胞体积分布宽度_ElasticNet,白细胞计数|中性粒细胞%|淋巴细胞%|单核细胞%|嗜酸细胞%|嗜碱细胞%_LinearRegression,白细胞计数|中性粒细胞%|淋巴细胞%|单核细胞%|嗜酸细胞%|嗜碱细胞%_ElasticNet,血小板计数|血小板平均体积|血小板体积分布宽度|血小板比积_LinearRegression,血小板计数|血小板平均体积|血小板体积分布宽度|血小板比积_ElasticNet,年龄_LinearRegression,年龄_ElasticNet,性别_LinearRegression,性别_ElasticNet
0,0.093329,0.072453,0.144030,0.072453,0.065798,0.072453,0.070040,0.072453,0.071663,0.072453,0.080928,0.072453,0.089587,0.072453,0.075420,0.072453,0.079027,0.072453,0.078519,0.072453
1,0.091085,0.072453,0.162111,0.072453,0.090129,0.072453,0.083040,0.072453,0.071663,0.072453,0.083930,0.072453,0.085657,0.072453,0.075403,0.072453,0.074515,0.072453,0.078519,0.072453
2,0.081552,0.072453,0.159775,0.072453,0.077478,0.072453,0.074467,0.072453,0.071663,0.072453,0.086600,0.072453,0.088403,0.072453,0.069780,0.072453,0.048570,0.072453,0.078519,0.072453
3,0.082471,0.072453,0.149546,0.072453,0.133132,0.072453,0.067292,0.072453,0.077965,0.072453,0.075828,0.072453,0.074828,0.072453,0.074854,0.072453,0.077899,0.072453,0.066145,0.072453
4,0.076325,0.072453,0.152901,0.072453,0.052587,0.072453,0.080269,0.072453,0.071663,0.072453,0.081052,0.072453,0.079299,0.072453,0.072036,0.072453,0.066618,0.072453,0.066145,0.072453
5,0.075943,0.072453,0.172562,0.072453,0.056106,0.072453,0.080638,0.072453,0.071663,0.072453,0.079694,0.072453,0.077912,0.072453,0.076545,0.072453,0.072259,0.072453,0.078519,0.072453
6,0.063050,0.072453,0.162024,0.072453,0.076930,0.072453,0.056045,0.072453,0.083282,0.072453,0.074057,0.072453,0.078038,0.072453,0.065988,0.072453,0.073387,0.072453,0.066145,0.072453
7,0.078918,0.072453,0.173123,0.072453,0.069991,0.072453,0.067262,0.072453,0.080479,0.072453,0.083310,0.072453,0.083153,0.072453,0.075588,0.072453,0.075643,0.072453,0.066145,0.072453
8,0.076404,0.072453,0.141973,0.072453,0.071069,0.072453,0.068191,0.072453,0.080515,0.072453,0.080086,0.072453,0.092152,0.072453,0.070856,0.072453,0.094820,0.072453,0.066145,0.072453
9,0.098332,0.072453,0.144306,0.072453,0.089465,0.072453,0.084382,0.072453,0.079129,0.072453,0.076390,0.072453,0.078092,0.072453,0.077977,0.072453,0.090308,0.072453,0.066145,0.072453
