In [None]:
import pandas as pd
import imblearn
import numpy as np
import traceback
pd.set_option('display.max_rows', 20,'max_info_columns', 9999,'display.max_columns', 9999)
import matplotlib.pyplot as plt
import copy
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing,metrics
import datetime as dt
import gc
from sklearn import svm,linear_model
from collections import Counter
from sklearn.model_selection import train_test_split
import csv
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,Imputer

from sklearn.ensemble import RandomForestRegressor,VotingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn import neighbors
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedKFold


from sklearn.feature_selection import RFE 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cross_validation import  cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
import xlrd
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition files; change this single constant to run
# the notebook on another machine.
DATA_DIR='/home/tsl-yu/文档/天池-精准医疗/'

# Labelled rows (the '血糖' column is used as the target further down).
train=pd.read_csv(DATA_DIR+'d_train_20180102.csv')

# Rows to predict; concatenated under the training rows later on.
test=pd.read_csv(DATA_DIR+'d_test_A_20180102.csv')

# Reference spreadsheet; presumably the `std` argument of new_feature() —
# it is not used by any visible cell, confirm before removing.
mer=pd.read_excel(DATA_DIR+'std.xlsx')

## 性别处理函数 

In [None]:
def sex(df):
    """Encode the '性别' column in place as integers: '男' -> 0, '女' -> 1.

    Values that are not in the mapping become NaN (Series.map semantics).
    A frame without a '性别' column is returned untouched.

    Returns the same DataFrame object that was passed in.
    """
    mapping={'男':0,'女':1}
    if '性别' in df.columns:
        df['性别']=df['性别'].map(mapping)
    return df

## 年龄范围选择 

In [None]:
def select_age(df,n=10,m=90):
    """Return the rows whose '年龄' lies strictly between n and m.

    Both bounds are exclusive; rows with a missing age are dropped because
    NaN fails both comparisons.  The original index labels are preserved.
    """
    age=df['年龄']
    # One combined mask on the full frame: the previous df[a][b] form applied a
    # mask built on the unfiltered frame to the already filtered one, which
    # forces pandas to re-align it (and warn).
    return df[(age>n)&(age<m)]

## label范围选择 

In [None]:
def select_label(df,q=98):
    """Drop the rows whose '血糖' is at or above the q-th percentile.

    Parameters
    ----------
    df : DataFrame with a numeric '血糖' column.
    q  : percentile (0-100) used as the exclusive upper cut-off; 98 by default.

    Rows with a missing label are dropped (NaN < cutoff is False), but they no
    longer turn the cut-off itself into NaN, which used to empty the result.
    """
    cutoff=np.nanpercentile(df['血糖'].values,q)
    return df[df['血糖']<cutoff]

## 年龄阶段划分函数 

In [None]:
def age_split(df,n=5):
    """Add an 'age_cata' column holding the equal-width bin number of '年龄'.

    The age range is cut into n bins with pd.cut and the integer bin codes
    (labels=False) are stored in place.  Frames without a '年龄' column are
    returned unchanged.
    """
    if '年龄' in df.columns:
        df['age_cata']=pd.cut(df['年龄'],n,labels=False)
    return df

## 空值填充

In [None]:
def nul_fill(df):
    """Fill missing values of every numeric column with that column's mean.

    The frame is modified in place and also returned.  Columns are left as
    they are when they
      * are not numeric (a mean is undefined; this used to raise),
      * contain no missing value, or
      * are entirely missing (there is no mean to fill with).
    """
    for col in df.columns:
        series=df[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue
        missing=series.isnull()
        if not missing.any() or missing.all():
            continue
        df[col]=series.fillna(np.nanmean(series.values))

    return df

## 日期处理函数 

In [None]:
def pro_date(df):
    """Replace the '体检日期' column by 'month', 'day' and 'weekday' columns.

    Dates are parsed with the day/month/year format '%d/%m/%Y'.  'weekday'
    follows pandas' dayofweek convention (Monday == 0).  The frame is changed
    in place and returned; frames without the date column are returned
    unchanged.
    """
    if '体检日期' not in df.columns:
        return df

    # Vectorised parse; also avoids the previous loop variable named `str`.
    dates=pd.to_datetime(df['体检日期'],format='%d/%m/%Y')
    df['month']=dates.dt.month
    df['day']=dates.dt.day
    df['weekday']=dates.dt.dayofweek
    df.drop('体检日期',axis=1,inplace=True)
    return df

## 去掉空值比例超过0.7的列的函数

In [None]:
def drop_null(df,thresh=0.7):
    """Drop every column whose share of missing values exceeds `thresh`.

    Parameters
    ----------
    df     : input DataFrame (not modified).
    thresh : maximum tolerated fraction of nulls per column, 0.7 by default.

    Returns the input object itself when nothing has to be dropped, otherwise
    a new frame without the sparse columns.
    """
    null_counts=df.isnull().sum()
    sparse=null_counts[null_counts>len(df)*thresh].index
    if len(sparse)==0:
        return df
    return df.drop(sparse,axis=1)

##  以年龄、性别为依据，分组填充

In [None]:
def fill_nul(df,n=27):
    """Fill missing values with group means, grouping by age bin and sex.

    '年龄' is cut into n equal-width bins; inside every (age bin, '性别')
    group the missing values are filled by nul_fill().  Rows whose age or sex
    is missing belong to no group and are left as they are.

    The frame is filled in place and returned.  Raises KeyError when '年龄'
    or '性别' is absent.
    """
    # Keep the bin codes in a separate Series so no helper column is ever
    # added to the caller's frame.
    age_bin=pd.cut(df['年龄'],n,labels=False).rename('split')

    for _,group in df.groupby([age_bin,'性别']):
        # group.index holds index *labels*, so write back with .loc; the old
        # .iloc write was only right for a default 0..n-1 index.
        df.loc[group.index]=nul_fill(group.copy())
    return df

## 对是否是高血糖 进行判断产生label

In [None]:
def b_g(label,x=10):
    """Binarise blood-glucose values: 1 where the value is > x, otherwise 0.

    `label` is any iterable of numbers; the result is a NumPy array with one
    entry per input value.  Values equal to x (and NaN) map to 0.
    """
    return np.array([1 if value>x else 0 for value in label])
    

正常血糖范围为3.9-6.1

## 利用标准数据产生新特征 

In [None]:
def new_feature(data,std):
    """Add deviation-from-reference features for every column shared with `std`.

    For the i-th column of `data` that also exists in `std`, two columns are
    appended to a copy of `data`:
      * '<i>std'  : value minus the reference value,
      * '<i>_std' : that difference divided by the reference value.
    The reference is std[col][1] for rows with '性别' == 1 and std[col][0]
    for every other row.

    `data` itself is not modified; the enriched copy is returned.
    """
    df=data.copy()
    use_second_row=(df['性别']==1).values
    # Iterate over the original columns only, not the ones added below.
    for i,col in enumerate(data.columns):
        if col not in std.columns:
            continue
        reference=np.where(use_second_row,std[col][1],std[col][0])
        # Whole-column assignment replaces the chained df[a][idx]=... writes,
        # which may act on a temporary copy.
        df[str(i)+'std']=df[col]-reference
        df[str(i)+'_std']=df[str(i)+'std']/reference
    return df

## 交叉验证函数 

In [None]:
def cv_function(model,train,label,test,n=10,random_state=None):
    """Shuffled K-fold cross-validation for a regressor, scored by MSE.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X).
    train : training features; a DataFrame is converted to an array with
            missing values replaced by -999.
    label : training targets (Series or array-like).
    test  : features to predict with every fold's model; converted like `train`.
    n     : number of folds.
    random_state : forwarded to KFold so the shuffling can be reproduced;
            None (default) keeps the previous unseeded behaviour.

    Returns
    -------
    (l, err) : list with one prediction array for `test` per fold, and the
               list of per-fold mean squared errors on the held-out part.

    Exceptions now propagate.  They used to be swallowed by a bare `except`
    that printed 'error' and returned None, which made every caller fail on
    tuple-unpacking with the real cause hidden.
    """
    if isinstance(train,pd.DataFrame):
        train=train.fillna(-999).values
    if isinstance(test,pd.DataFrame):
        test=test.fillna(-999).values
    label=np.asarray(label)

    kf=KFold(n_splits=n,shuffle=True,random_state=random_state)
    l=[]
    err=[]
    for tr_index,te in kf.split(train,label):
        model.fit(train[tr_index],label[tr_index])
        fold_pred=model.predict(train[te])
        err.append(np.mean((fold_pred-label[te])**2))
        print (err[-1])

        # The model of every fold also predicts the external test rows.
        l.append(model.predict(test))

    print('mean error   ',np.mean(err))
    return l,err

##  交叉验证，分类评测结果

In [None]:
def cla_cv_function(model,train,label,test,n=10,random_state=None):
    """Shuffled K-fold cross-validation for a classifier, scored by F1.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X).
    train : training features; a DataFrame is converted to an array with
            missing values replaced by -999.
    label : training class labels (Series or array-like).
    test  : features to predict with every fold's model; converted like `train`.
    n     : number of folds.
    random_state : forwarded to KFold so the shuffling can be reproduced;
            None (default) keeps the previous unseeded behaviour.

    Returns
    -------
    (l, err) : list with one prediction array for `test` per fold, and the
               list of per-fold F1 scores.  The name `err` is kept for
               callers, but higher is better here.

    The confusion matrix of every fold is printed.  Exceptions propagate
    instead of being replaced by a printed 'error' and a None return value.
    """
    if isinstance(train,pd.DataFrame):
        train=train.fillna(-999).values
    if isinstance(test,pd.DataFrame):
        test=test.fillna(-999).values
    label=np.asarray(label)

    kf=KFold(n_splits=n,shuffle=True,random_state=random_state)
    l=[]
    err=[]
    for tr_index,te in kf.split(train,label):
        model.fit(train[tr_index],label[tr_index])
        fold_pred=model.predict(train[te])
        err.append(metrics.f1_score(label[te],fold_pred))
        print (metrics.confusion_matrix(label[te],fold_pred))

        # The model of every fold also predicts the external test rows.
        l.append(model.predict(test))

    # The collected values are F1 scores, so the summary line says so.
    print('mean f1   ',np.mean(err))
    return l,err

##  训练数据构建

In [None]:
#train=select_age(train)
#train=select_label(train)

In [None]:
#train=train[train['血糖']<30]

In [None]:
# Target vector: the '血糖' column of the training frame.
label=train['血糖']

# Training features (target removed) stacked on top of the test rows,
# renumbered 0..N-1, with the 'id' column removed.
train1=pd.concat([train.drop('血糖',axis=1),test],axis=0,ignore_index=True).drop('id',axis=1)

In [None]:
# Encode '性别' as 0/1.
train1=sex(train1)
#train1=age_split(train1,8)
# Replace the exam date by month / day / weekday columns.
train1=pro_date(train1)
# Remove columns that are more than 70% null.
train1=drop_null(train1)

In [None]:
# Group-mean imputation by (age bin, sex): first with the default 27 age bins,
# then with 20 wider bins for values still missing after the first pass.
train1=fill_nul(train1)
train1=fill_nul(train1,20)

In [None]:
# Whatever is still missing gets the column median.
train1=train1.fillna(train1.median(axis=0))

In [None]:
# Scaler used in the next cell to standardise the features.
ss=StandardScaler()

In [None]:
# Standardised copy of train1. train1 holds training AND test rows, so the
# scaler statistics are computed over both.
train2=ss.fit_transform(train1)

In [None]:
# Split the labelled rows at a blood-glucose value of 7.  Rows equal to 7 go
# to the low group, matching b_g(label,7), which only marks values > 7 as
# positive; with the previous `<7` / `>7` pair they fell into neither group.
index_7=label[label<=7].index
index7_=label[label>7].index

In [None]:
# Features/targets of the two groups.  train2 is indexed with label's index
# values — assumes label still has the default 0..n-1 index (no row filter
# applied to `train`); TODO confirm if a filter cell is re-enabled.
x1,y1=train2[index_7],label[index_7]
x2,y2=train2[index7_],label[index7_]

In [None]:
# Binary target: 1 where 血糖 > 7, else 0.
label2=b_g(label,7)

In [None]:
# Number of rows in the high (> 7) group.
y2.shape

## 数据分析 

In [None]:
# Distribution of the target column '血糖' in the training frame.
plt.figure(figsize=(5,5))
sns.distplot(train['血糖'])

 实际分析发现血糖分布：
   大于6.1             911
   大于10              139
   大于15              27
   大于20              4

In [None]:
# 95th percentile of the target.
np.percentile(train['血糖'],95)

In [None]:
# Number of labelled rows.
label.shape

## 对不均匀数据进行采样 

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsemble

In [None]:
# Combined over/under-sampler used to rebalance the binary target.
# NOTE(review): n_neighbors / m / ratio are constructor arguments of an older
# imbalanced-learn release — confirm against the installed version.
smote_enn =SMOTEENN(random_state=0,n_neighbors=6,n_jobs=-1,m=3,ratio=0.4)

In [None]:
# Resample the labelled rows against the binary target (血糖 > 10).
# The number of labelled rows comes from `label` instead of a hard-coded
# count, so the cell stays valid when a row filter changes the training size.
x_sam,y_sam=smote_enn.fit_sample(train1.iloc[:len(label)],b_g(label,10))

In [None]:
# Class balance after resampling.
sns.countplot(y_sam)

##  XGB 模型

In [None]:
# Hold out 10% of the labelled rows for validation.  The labelled rows are
# the first len(label) rows of train1 (training rows were concatenated first).
train_x,test_x,train_y,test_y=train_test_split(train1.iloc[:len(label)],label,test_size=0.1,random_state=0)

train_=xgb.DMatrix(train_x,train_y)
test_=xgb.DMatrix(test_x,test_y)

# Both sets are evaluated each round; early stopping watches the last entry.
watchlist=[(train_,'train'),(test_,'test')]

In [None]:
    # Booster parameters handed to xgb.train in the next cell.
    # NOTE(review): 'missing' and 'colsample_bybooster' do not look like
    # documented booster parameters, and 'reg:linear' / 'silent' are names from
    # older xgboost releases — confirm they have the intended effect.
    params = {
            'booster' : 'gbtree',
            'objective': 'reg:linear',
            'eta': 0.1,
            'gamma' : 0.0,
            'max_depth' :4,
            'min_child_weight' :1,
            'eval_metric':'rmse',
            'seed': 11,
            'missing': -999,
            'colsample_bytree' :1,
             'colsample_bybooster':1,
          
            'silent' : 1,
            
            
            }

In [None]:
# Train up to 500 rounds, stopping after 30 rounds without improvement.
model = xgb.train(params,train_,num_boost_round=500 ,evals=watchlist,early_stopping_rounds=30)

In [None]:
# Feature-importance table of the trained booster, most important first.
score=model.get_score()

impt=pd.DataFrame(list(score.items()),columns=['name','importance'])
impt=impt.sort_values(by ='importance',axis=0,ascending=False).reset_index(drop=True)

In [None]:
# Age distribution over training and test rows together.
sns.distplot((train1['年龄']))

In [None]:
# Predict the unlabelled (test) rows, i.e. everything after the first
# len(label) rows of train1.
out1=model.predict(xgb.DMatrix(train1.iloc[len(label):,:]))

In [None]:
# Distribution of the predictions for the test rows.
sns.distplot(out1)

In [None]:
# Same plot as the previous cell (duplicate cell).
sns.distplot(out1)

In [None]:
# Count of `out` values above 6.1.
# NOTE(review): `out` is first assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run.
out[out>6.1].shape

In [None]:
# Count of predicted test values above 6.1.
out1[out1>6.1].shape

In [None]:
# Compare two prediction vectors; `out` comes from a later cell (see note on
# execution order above its first use).
sns.jointplot(x=out,y=out1,kind='reg')

In [None]:
# Mean squared error after exponentiating both sides.
# NOTE(review): presumably left over from a log-target experiment; `out` and
# `test_y` must have the same length here — confirm.
np.mean((np.exp(out)-np.exp(test_y))**2)

In [None]:
# Write `out` as a single headerless column.
pd.DataFrame(out).to_csv('re_test_a.csv',index=False,header=False)

In [None]:
# Rows (train + test) and columns of the feature frame.
train1.shape

In [None]:
# Gradient-boosted classifier; -999 is declared as the missing-value marker.
xrg=xgb.XGBClassifier(max_depth=6,learning_rate=0.1,
n_estimators=261,nthread=-1,gamma=0.0,colsample_bytree=1,
missing=-999,colsample_bylevel=1)

In [None]:
# Cross-validate the SVC on the resampled data; every fold also predicts the
# labelled rows of train2 (row count taken from `label`, not hard-coded).
# NOTE(review): `svc` is created in a later cell, and x_sam was built from the
# unscaled train1 while the rows predicted here come from the standardised
# train2 — confirm both are intended.
k,err=cla_cv_function(svc,x_sam,y_sam,train2[:len(label)],n=10)

In [None]:
# Class counts predicted by the second fold's model.
Counter(k[1])

In [None]:
# Blend the per-fold predictions with weights proportional to 1/score.
# An array is used for the weights: dividing a Python list by sum(list)
# raises TypeError whenever the scores are plain floats.
# NOTE(review): when `err` holds F1 scores (cla_cv_function), 1/score gives the
# largest weight to the worst fold — confirm this is intended.
err1=1/np.asarray(err,dtype=float)
err1=err1/err1.sum()

out=sum(k[i]*err1[i] for i in range(len(k)))

In [None]:
# Write the blended output as a single headerless column.
pd.DataFrame(out).to_csv('classify_add_7_8_10.csv',index=False,header=False)

In [None]:
# NOTE(review): index_cv_function is not defined in any visible cell, and the
# row count 5529 differs from the 5641/5642 used elsewhere — confirm both.
l_index,_=index_cv_function(xrg,train1.iloc[:5529],label,n=10)

In [None]:
# For each of the 10 index sets, print the mean 血糖 of its rows above 15.
for i in range(10):
    print(train.iloc[l_index[i]]['血糖'][train.iloc[l_index[i]]['血糖']>15].mean())

## Lgb 模型 

In [None]:
# Hold out 10% of the labelled rows of the standardised matrix.  The slice
# length is taken from `label`; a hard-coded count that differs from
# len(label) makes train_test_split fail on inconsistent lengths.
train_x1,test_x1,train_y1,test_y1=train_test_split(train2[:len(label),:],label,test_size=0.1,random_state=0)

train_1=lgb.Dataset(train_x1,label=train_y1)#,feature_name=list(train_x1.columns))
test_1=lgb.Dataset(test_x1,label=test_y1,)#feature_name=list(test_x1.columns))

In [None]:
# LightGBM parameters for lgb.train in the next cell.
# NOTE(review): 'gamma', 'scale_pos_weight' and 'is_unbalance' look like
# xgboost / classification settings while the metric here is rmse — confirm
# they have any effect for this task.
param = {
    'max_depth':4,
    'num_leaves':16,
    'learning_rate':0.06,
    'gamma'        :0,
    'scale_pos_weight':1,
    'num_threads':-1,
    'colsample_bytree':1,
    'metric':'rmse',
    'subsample':1

}
param['is_unbalance']='true'

In [None]:
# Train up to 500 rounds with early stopping after 25 stale rounds.
# Rebinds `model`, which previously held the xgboost booster.
model=lgb.train(param,train_1,num_boost_round=500,valid_sets=[train_1,test_1],early_stopping_rounds=25)

In [None]:
# scikit-learn style LightGBM regressor used for the cross-validated run below.
lrg=lgb.LGBMRegressor( num_leaves=16, max_depth=4, learning_rate=0.06,
                      n_estimators=249, max_bin=255, subsample_for_bin=200000, objective=None,
                      min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, 
                      subsample=1.0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,
                      reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True
)

In [None]:
# 10-fold CV on the labelled rows; every fold predicts the remaining (test)
# rows.  The split point is len(label) so features and targets always match.
lk,lerr=cv_function(lrg,train1.iloc[:len(label)],label,train1.iloc[len(label):],n=10)

In [None]:
# Blend the fold predictions with weights proportional to 1/MSE, normalised to
# sum to one.  An array is used because list / float is not defined for plain
# Python floats.
lerr1=1/np.asarray(lerr,dtype=float)
lerr1=lerr1/lerr1.sum()

out=sum(lk[i]*lerr1[i] for i in range(len(lk)))

In [None]:
# Write the blended LightGBM predictions as a single headerless column.
pd.DataFrame(out).to_csv('lgb_cv_qu1.csv',index=False,header=False)

## xgb 产生新特征 

In [None]:
# Boosted regressor whose trees are used below to derive new features.
gbdt=xgb.XGBRegressor(max_depth=4,learning_rate=0.06,
n_estimators=180,nthread=-1,gamma=0.0,colsample_bytree=1,
missing=-999,colsample_bylevel=1)

In [None]:
# Fit on the labelled rows only (count taken from `label`).
gbdt.fit(train1.iloc[:len(label)].fillna(-999),label)

In [None]:
# Output of gbdt.apply for every row (train and test) — presumably the leaf
# index per tree, to be used as new features; confirm against the xgboost docs.
tre=pd.DataFrame(gbdt.apply(train1.fillna(-999)))

## 支持向量机模型 

In [None]:
# Support-vector classifier with probability estimates enabled.
svc= svm.SVC(probability=True)

In [None]:
# Train on the resampled data.
svc.fit(x_sam,y_sam)

In [None]:
# Class predictions for the labelled rows (count taken from `label`).
prob2=svc.predict(train1.iloc[:len(label)])

In [None]:
# Accuracy against the binary target (血糖 > 10).
metrics.accuracy_score(b_g(label,10),prob2)

In [None]:
# Recall against the binary target (血糖 > 10).
metrics.recall_score(b_g(label,10),prob2)

In [None]:
# F1 against the binary target (血糖 > 10).
metrics.f1_score(b_g(label,10),prob2)

In [None]:
# Confusion matrix against the binary target (血糖 > 10).
metrics.confusion_matrix(b_g(label,10),prob2)

In [None]:
# Class predictions for the unlabelled (test) rows, i.e. everything after the
# first len(label) rows.
out_10=svc.predict(train1.iloc[len(label):])

In [None]:
# Predicted class counts on the test rows.
Counter(out_10)

In [None]:
# True class counts (血糖 > 10) on the labelled rows, for comparison.
Counter(b_g(label,10))

## voting

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Base classifiers for the soft-voting ensemble built in the next cell.
# NOTE(review): loss='log' is the SGDClassifier spelling of older scikit-learn
# releases — confirm against the installed version.
svc= svm.SVC(probability=True)
knn=neighbors.KNeighborsClassifier()
xgbc=xgb.XGBClassifier(max_depth=6,learning_rate=0.1,
n_estimators=261,nthread=-1,gamma=0.0,colsample_bytree=1,
missing=-999,colsample_bylevel=1)
sgd=SGDClassifier(loss='log')

In [None]:
# Soft voting combines the four members' predicted probabilities.
voting_clf = VotingClassifier( estimators=[("svc",svc),('knn',knn),('xgbc',xgbc),('sgd',sgd)],voting='soft',)

In [None]:
# Train the ensemble on the resampled data.
voting_clf.fit(x_sam,y_sam)

In [None]:
# Ensemble class predictions for the labelled rows (count taken from `label`).
prob2=voting_clf.predict(train1.iloc[:len(label)].values)

In [None]:
# Ensemble class probabilities for the unlabelled (test) rows.  Rebinds
# out_10, which previously held hard class predictions from the SVC.
out_10=voting_clf.predict_proba(train1.iloc[len(label):].values)

## KNN  算法 

In [None]:
# Fresh k-nearest-neighbours classifier with default settings.
knn=neighbors.KNeighborsClassifier()

In [None]:
# Train on the resampled data.
knn.fit(x_sam,y_sam)

In [None]:
# Predict the labelled rows (count taken from `label`).
# NOTE(review): knn was fitted on x_sam, built from the unscaled train1, but
# predicts on the standardised train2 — confirm which feature space is meant.
out_knn=knn.predict(train2[:len(label)])

In [None]:
# Predicted class counts.
Counter(out_knn)

## 线性模型 

In [None]:
# Ridge regression; only one candidate alpha (0.001) is supplied.
reg = linear_model.RidgeCV(alphas=[0.001])

In [None]:
# Fit on the standardised labelled rows; the slice length follows `label` so
# features and targets always have the same number of rows.
reg.fit(train2[:len(label),:],label)

In [None]:
# Regularisation strength selected by RidgeCV.
reg.alpha_

In [None]:
# 10-fold CV of the ridge model; every fold predicts the test rows.
k,err=cv_function(reg,train2[:len(label),:],label,train2[len(label):,:],n=10)

In [None]:
# Predict the test rows with the ridge model.  `reg` is fitted on the
# standardised matrix train2, so the prediction uses the same feature space
# (it previously used the unscaled train1), and the split point follows
# `label` instead of the stale hard-coded 5529.
out_reg=reg.predict(train2[len(label):,:])

In [None]:
# Write the ridge predictions as a single headerless column.
pd.DataFrame(out_reg).to_csv('qujizhi_linear.csv',index=False,header=False)

### elastic net


In [None]:
# Elastic net with 5-fold internal CV over the listed L1/L2 mixing ratios.
regr=linear_model.ElasticNetCV(cv=5,l1_ratio=[.01,.1, .5, .7, .9, .95, .99, 1])

In [None]:

# Fit on the standardised labelled rows against log1p of the target.
regr.fit(train2[:len(label),:],np.log1p(label))

In [None]:
# Regularisation strength selected by ElasticNetCV.
regr.alpha_

In [None]:
# 10-fold CV of the elastic net on the log1p target; the reported errors are
# therefore in log space.
k,err=cv_function(regr,train2[:len(label),:],np.log1p(label),train2[len(label):,:],n=10)

## kernel ridge 

In [None]:
# Kernel ridge regression with regularisation strength 100 (default kernel).
krr=KernelRidge(alpha=100)

In [None]:
# 10-fold CV of kernel ridge on the log1p target (errors are in log space).
k,err=cv_function(krr,train2[:len(label),:],np.log1p(label),train2[len(label):,:],n=10)

## lasso 

In [None]:
# Lasso with internal CV over four candidate alphas.
lasso=linear_model.LassoCV(alphas=[0.01,0.1,1,10])

In [None]:
# Fit on the standardised labelled rows against log1p of the target.
lasso.fit(train2[:len(label),:],np.log1p(label))

In [None]:
# Regularisation strength selected by LassoCV.
lasso.alpha_

In [None]:
# 10-fold CV of the lasso on the log1p target (errors are in log space).
k,err=cv_function(lasso,train2[:len(label),:],np.log1p(label),train2[len(label):,:],n=10)

In [None]:
# 1-based positions of the rows whose second-column value in out_10 exceeds
# 0.4 (out_10 must be the 2-D probability array from the voting classifier).
(np.argwhere(out_10[:,1]>0.4)+1)