In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

## Preprocessing

In [5]:
def train_preprocess(df):
    """Clean the labelled training frame and standardise its features.

    The ``id`` column is dropped, every row containing a NaN is removed, the
    last remaining column is taken as the target and all columns before it as
    features. Each feature is then standardised with the mean and the sample
    standard deviation computed on the cleaned training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data containing an ``id`` column, with the target in the
        last column. The caller's frame is not modified.

    Returns
    -------
    df_X : pandas.DataFrame
        Standardised feature columns.
    df_Y : pandas.Series
        Target column (last column of the cleaned frame).
    train_mean : pandas.Series
        Per-feature mean, to be reused when scaling the test set.
    train_std : pandas.Series
        Per-feature standard deviation, with zeros replaced by 1 so it can be
        reused as a divisor on the test set.

    Raises
    ------
    KeyError
        If ``df`` has no ``id`` column.
    """
    # Drop id column, then drop rows with missing values
    cleaned = df.drop("id", axis=1).dropna()
    # Divide X,Y: target is the last column, features are everything before it
    df_X = cleaned.iloc[:, :-1]
    df_Y = cleaned.iloc[:, -1]
    # Normalization statistics come from the training rows only
    train_mean = df_X.mean()
    train_std = df_X.std()
    # A constant feature has std == 0; dividing by it would turn the whole
    # column into NaN here and into inf/NaN on the test set. Use 1 instead so
    # such a column is simply centred (same convention as StandardScaler).
    train_std = train_std.mask(train_std == 0, 1.0)
    df_X = (df_X - train_mean) / train_std
    return df_X, df_Y, train_mean, train_std

In [6]:
# Load the labelled training set, then clean and standardise it. The returned
# mean/std are kept so the test set can be scaled with the same statistics.
train = pd.read_csv("../data/train.csv")
(train_X, train_Y, train_mean, train_std) = train_preprocess(train)

In [13]:
# Inspect the standardised training features returned by train_preprocess.
train_X

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
0,2.993989,4.484377,-1.150888,-1.155294,-1.013082,-1.233235,-1.179124,-1.274245,1.120810,-0.971312,4.546444
1,0.838401,-0.318966,0.748939,0.218978,1.217478,-0.588464,0.520688,0.520054,-1.982278,1.017564,-0.259124
2,0.420347,-0.447463,0.571658,0.602257,0.513381,0.175656,1.022181,0.626337,-1.206506,1.017564,-0.630452
3,-0.759347,-0.400551,-0.902336,-0.638837,-1.058935,-0.818821,-0.759433,-0.984771,1.411725,1.017564,-0.520306
4,-0.303408,-0.447463,0.200509,0.011463,0.417848,-0.147420,0.133137,0.133548,-1.206506,1.017564,-0.437695
...,...,...,...,...,...,...,...,...,...,...,...
82,-0.833813,-0.453582,1.279732,0.552399,1.987262,0.838602,0.009448,1.238166,0.151095,1.017564,-0.455219
83,-0.035592,-0.447463,0.597136,-0.139705,1.372513,-0.542910,-0.718065,0.255227,1.508697,-0.971312,-0.602081
84,-0.823362,1.451438,0.518830,0.236650,0.732572,-0.847868,-0.134043,0.204298,-0.624677,-0.971312,0.717176
85,-1.505311,1.210761,-0.057481,-0.283399,0.232138,-0.561603,-0.892867,-0.272906,-0.721649,1.017564,0.238205


## Model Selection

In [11]:
# Base estimator for the search: an XGBoost classifier with library defaults
clf = xgb.XGBClassifier()

# Candidate values for the six XGBoost hyper-parameters being tuned
parameters = dict(
    # number of boosting rounds
    n_estimators = [100, 150, 200],
    # maximum depth of each tree
    max_depth = [3, 6, 10],
    # learning rate (shrinkage applied per boosting step)
    learning_rate = [0.01, 0.1, 0.2],
    # fraction of training samples drawn for each tree
    subsample = [0.7, 0.8, 0.9, 1],
    # fraction of columns drawn when constructing each tree
    colsample_bytree = [0.5, 0.9, 1],
    # minimum sum of instance weight (hessian) needed in a child: 1..8
    min_child_weight = range(1, 9),
)

In [21]:
# Exhaustive search over every combination in `parameters`, scored by
# accuracy and evaluated on 4 parallel jobs.
grid_search = GridSearchCV(
    estimator = clf,
    param_grid = parameters,
    scoring = 'accuracy',
    n_jobs = 4,
)
grid_search.fit(train_X, train_Y)

# Report the winning combination and its score
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
best scrores:  0.9307189542483659


## Train

In [22]:
def trainingXGB(X, Y, **params):
    """Fit an XGBoost classifier on ``(X, Y)`` and return the fitted model.

    The default hyper-parameters are the values previously hard-coded in this
    function. They match the best combination printed by the grid search in
    the model-selection section.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Training features.
    Y : array-like or pandas.Series
        Training labels.
    **params
        Optional ``XGBClassifier`` keyword arguments. Each one overrides the
        corresponding default, so ``trainingXGB(X, Y, **grid_search.best_params_)``
        retrains with the latest search result without editing this function.

    Returns
    -------
    xgboost.XGBClassifier
        The fitted classifier.
    """
    settings = {
        "n_estimators": 100,
        "max_depth": 3,
        "learning_rate": 0.01,
        "subsample": 0.8,
        "colsample_bytree": 0.5,
        "min_child_weight": 3,
    }
    # Caller-supplied values win over the defaults
    settings.update(params)
    model = xgb.XGBClassifier(**settings)
    model.fit(X, Y)
    return model

In [23]:
# Fit the final classifier on the full standardised training set
trained_xgb = trainingXGB(X = train_X, Y = train_Y)

## Predict

In [24]:
def test_preprocess(df,mean,std):
    # Drop id column
    df = df.drop("id",axis = 1)
    # Normalization
    df = (df-mean)/std
    return df

In [25]:
# Load the unlabelled test set and scale it with the training statistics.
# NOTE(review): the training file is read from "../data/", while this path is
# relative to the working directory — confirm where test.csv actually lives.
test = pd.read_csv("test.csv")
test_X = test_preprocess(test,train_mean,train_std)
pre_Y = trained_xgb.predict(test_X)

# write into csv
# ids and predictions stay (n, 1) column vectors, as in the original cell,
# so anything that reads these names afterwards sees the same shapes.
ids = np.expand_dims(np.array(test['id']), axis = 1)
pre_Y = np.expand_dims(pre_Y, axis = 1)
# Build the submission column by column. Stacking both arrays with
# np.concatenate forced ids and labels into one common dtype (e.g. float ids
# turned the labels into floats); separate columns keep their own dtypes.
res = pd.DataFrame({'id': ids[:, 0], 'label': pre_Y[:, 0]})
res.to_csv('./xgb_submission.csv',index = False)