In [2]:
import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from catboost import CatBoostClassifier
from xgboost import XGBClassifier 

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

In [3]:
# Directory that holds the competition CSV files.
path = '/home/kesci/data/competition_A/'

# Read the training set, the test set and the example submission, in that order.
train_df, test_df, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'submission_example.csv')
)

# Report both frame shapes, then preview the first training rows.
print('Train Shape:{}\nTest Shape:{}'.format(train_df.shape, test_df.shape))
train_df.head()

In [4]:
# Summary statistics of the training frame.
train_df.describe()

In [5]:
# Column labels of the training frame.
train_df.columns

In [6]:
# Number of missing values in each training column.
print(train_df.isna().sum())

In [10]:
## Fill missing values in the numeric columns with the column median
## (each frame uses its own median).
feature1 = ['最低血压','腰围','最高血压','体重指数','肥胖腰围','身高','体重','好胆固醇','总胆固醇','坏胆固醇','体育活动']
for i in feature1:
    train_df[i] = train_df[i].fillna(train_df[i].median())
    test_df[i] = test_df[i].fillna(test_df[i].median())

## Fill missing values in the categorical columns with the column mode.
## Series.mode() returns a Series; passing that Series to fillna() aligns it
## on the index, so only the rows labelled 0, 1, ... could ever be filled and
## every other NaN silently fell through to the 0.0 fallback. Use the first
## mode as a scalar instead, and fall back to 0.0 only when no mode exists
## (i.e. the column is entirely missing).
# NOTE(review): '肝炎' is the target column used for training below; imputing
# missing labels is questionable — confirm this is intended.
feature2 = ['收入','未婚','视力不佳','高血压','慢性疲劳','肝炎','教育','家族肝炎'] # ,'ALF'
for i in feature2:
    train_modes = train_df[i].mode()
    train_fill = train_modes.iloc[0] if not train_modes.empty else 0.0
    train_df[i] = train_df[i].fillna(train_fill)

# NOTE(review): this list has '糖尿病' where feature2 has '肝炎', so '糖尿病'
# is imputed in the test frame only — confirm the asymmetry is intended.
feature3 =['收入','未婚','视力不佳','高血压','慢性疲劳','糖尿病','教育','家族肝炎'] # ,'ALF'
for i in feature3:
    test_modes = test_df[i].mode()
    test_fill = test_modes.iloc[0] if not test_modes.empty else 0.0
    test_df[i] = test_df[i].fillna(test_fill)

# Column groups used by the encoding and scaling cells below.
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD']
str_columns = ['性别','区域','教育','未婚','护理来源','视力不佳','饮酒','高血压',
                '家庭高血压', '糖尿病', '家族糖尿病','家族肝炎', '慢性疲劳','肥胖腰围','收入'] # ,'ALF'

In [12]:
# One histogram panel of '年龄' per distinct value of the '肝炎' column.
g = sns.FacetGrid(data=train_df, col='肝炎')
g.map(plt.hist, '年龄', bins=20)

In [13]:
# Label-encode the categorical columns.
for i in tqdm(str_columns):
    lbl = LabelEncoder()
    train_values = train_df[i].astype(str)
    test_values = test_df[i].astype(str)
    # Fit once on the values of BOTH frames. Refitting the encoder on the
    # test frame alone gives a category a different integer code in test
    # than in train whenever the two frames do not contain exactly the same
    # set of values, which makes the model's inputs inconsistent.
    lbl.fit(pd.concat([train_values, test_values]))
    train_df[i] = lbl.transform(train_values)
    test_df[i]  = lbl.transform(test_values)

In [14]:
# Min-max normalise the numeric columns.
# The scaler is fitted on the training frame only and the same fitted
# transform is applied to the test frame. Fitting a second scaler on the
# test frame would map identical raw values to different scaled values in
# train and test. Test values outside the training range may fall outside
# [0, 1] after scaling.
scaler = MinMaxScaler()
train_df[num_columns] = scaler.fit_transform(train_df[num_columns])
test_df[num_columns]  = scaler.transform(test_df[num_columns])

In [15]:
# Every training column except the target ('肝炎') and the row identifier ('ID')
# is used as a model feature.
all_columns = [col for col in train_df.columns if col not in {'肝炎', 'ID'}]

# Plain arrays for the model: training features / labels and test features.
train_x = train_df[all_columns].values
train_y = train_df['肝炎'].values
test_x = test_df[all_columns].values

# Start the prediction column of the submission frame at zero.
submission['hepatitis'] = 0

In [16]:
# Stratified K-fold training. The test-set probability of the positive class
# is averaged over the folds and written to submission['hepatitis'].
n_splits = 10  # single source of truth for the fold count AND the averaging divisor
kfold = StratifiedKFold(n_splits=n_splits, shuffle=False)
model = CatBoostClassifier(
    iterations=500,
    od_type='Iter',
    od_wait=120,
    max_depth=8,
    learning_rate=0.01,
    l2_leaf_reg=9,
    random_seed=2020,
    fold_len_multiplier=1.1,
    loss_function='Logloss',
    logging_level='Verbose'
    )

# Accumulate into a local value and assign to the submission frame once at
# the end. Adding in place to submission['hepatitis'] made this cell
# non-idempotent: re-running it stacked a second set of fold averages on top
# of the first, producing values outside [0, 1].
test_pred = 0
for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]
    # The held-out part of the fold is passed as eval_set with use_best_model=True.
    model.fit(X_train,Y_train, eval_set=(X_valid, Y_valid),use_best_model=True)
    # Column 1 of predict_proba is taken as the positive-class probability.
    test_pred = test_pred + model.predict_proba(test_x)[:,1] / n_splits
submission['hepatitis'] = test_pred

In [19]:
!wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
submission.to_csv('submission.csv',index=False)
!./kesci_submit -token 'b2252605d4bcb662' -file '/home/kesci/work/sub_result.csv'