In [4]:
import numpy as np
import pandas as pd

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [6]:
test=pd.read_csv('test.csv')
train=pd.read_csv('train.csv')

In [7]:
train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [8]:
total=pd.concat([train,test],axis=0)
total.reset_index(drop=True, inplace=True)
total.columns=['id','HLADR','neuCD64','CD3','CD8','CD4','NK','CD19','CD45','Age','Sex','monoCD64','label']

In [None]:
# cd45白细胞共同抗原
# CD3 T淋巴细胞
# CD4T 细胞是辅助性T淋巴细胞，其主要功能是增强吞噬细胞介导的抗感染作用和增强B细胞介导的体液免疫应答
# CD8 T细胞是抑制/杀伤性T淋巴细胞，主要功能是特异性直接杀伤靶细胞
# 机体维持正常的免疫功能状态有赖于T淋巴细胞亚群维持一定的比例，尤其是CD4/CD8比值相对稳定
# CD64的表达增高,能够作为判断在肾移植术后患者是否感染的指标
# AGE 分组
# NK natural killer cell 自然杀伤细胞
# cd3 955～2860 # 低于955降低的话细胞免疫功能减弱，机体易感染
# CD19  正常 90-560个
# CD45 1530-3700
# 1.4<CD4/CD8<2.0  # 比值高于2.0有自身免疫疾病/器官移植排斥反应
# 1.0<CD4/CD8<2.87
# 450<CD4<1440 # 增高发生自身免疫疾病
# 320<CD8<1250 # 降低发生自身免疫疾病
# 年龄分等级 10-20岁 20-30岁 30-40岁 40-50岁 50岁以上 60岁

In [9]:
total['CD4/CD8']=total['CD4']/total['CD8']
total['CD4/CD8']=total['CD4/CD8'].apply(lambda x: 0 if (x<=2) & (x>=1) else 1)
total['CD4']=total['CD4'].apply(lambda x: 0 if (x<=1440) & (x>=450) else 1)
total['CD8']=total['CD8'].apply(lambda x: 0 if (x<=1250) & (x>=320) else 1)
total['CD3']=total['CD3'].apply(lambda x: 0 if (x<=2860) & (x>=955) else 1)
total['CD19']=total['CD19'].apply(lambda x: 0 if (x<=560) & (x>=90) else 1)
total['CD45']=total['CD45'].apply(lambda x: 0 if (x<=3700) & (x>=1530) else 1)
total['Age']=total['Age'].apply(lambda x: int(x/10))

In [10]:
total=total.loc[:,['id','HLADR','neuCD64','CD3','CD8','CD4','NK','CD19','CD45','Age','Sex','monoCD64','CD4/CD8', 'label']]

In [11]:
X_train=total.iloc[:87,1:-1]
y_train=total.iloc[:87,-1]

In [12]:
X_test=total.iloc[87:,1:]

In [13]:
# Input contains NaN, infinity or a value too large for dtype('float64')
X_train[X_train.isnull().T.any()]

Unnamed: 0,HLADR,neuCD64,CD3,CD8,CD4,NK,CD19,CD45,Age,Sex,monoCD64,CD4/CD8
39,,,0,0,0,68.46,0,0,2,0,,1


In [14]:
X_train=X_train.drop(39)
y_train=y_train.drop(39)

In [15]:
y_train.columns=['label']
y_train=pd.DataFrame(y_train)

In [16]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
y_res.groupby(["label"]).size()

label
0.0    57
1.0    57
dtype: int64

In [17]:
parameters = {'criterion':('gini','entropy'), 
              'min_samples_split':[2,5,10],
              'min_samples_leaf':[1,2,5],
              'max_features':["auto", "sqrt", "log2"],
              'oob_score':[True,False],
              'class_weight':["balanced","balanced_subsample"]}

estimator = RandomForestClassifier(100,random_state = 88)
clf_rf = GridSearchCV(estimator, parameters,cv = 5)
clf_rf.fit(X_res,y_res)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=88),
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ('gini', 'entropy'),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 5, 10],
                         'oob_score': [True, False]})

In [24]:
clf_bestrf = clf_rf.best_estimator_

In [26]:
predict=clf_bestrf.predict(X_test.iloc[:,:-1])
predict=pd.DataFrame(predict)
predict.columns=["label"]
predict.to_csv('predict.csv',index=True,index_label="id")