# Target: positivity

In [28]:
import pandas as pd

data = pd.read_csv('../알츠하이머/data/modeling.csv')

In [29]:
# data[(data.positivity==1)]

In [30]:
data.corr().iloc[:,1].sort_values(ascending=False)

positivity        1.000000
PQ                0.867611
GCP               0.861187
PC                0.848809
GCA               0.848655
Composite*        0.844886
FC                0.840276
LTC               0.829676
CN                0.796282
PUT               0.777077
THA               0.769440
OC                0.763990
Diagnosis code    0.409148
APOE              0.392635
GDS               0.366383
CDR-SOB           0.299935
MTC               0.283585
CDR               0.221290
Age               0.216661
Onset age         0.186080
Eduction          0.130179
ID                0.018108
Sex              -0.017150
SNSB             -0.063811
SGDepS           -0.118978
BS               -0.187940
Pons             -0.270811
Mid              -0.276745
MMSE             -0.354964
Name: positivity, dtype: float64

# 전체 데이터 사용

In [31]:
put = data.drop(['positivity','ID'], axis=1)
target = data['positivity']

In [32]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(put, target, test_size=0.25, stratify=target)

## RF

In [33]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=150, random_state=0)
rfc.fit(train_x, train_y)

RandomForestClassifier(n_estimators=150, random_state=0)

In [34]:
pred_rfc = rfc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_rfc))
print('recall_score',recall_score(test_y,pred_rfc,average='macro'))
print('precision_score', precision_score(test_y,pred_rfc,average='macro'))
print('f1_score',f1_score(test_y,pred_rfc,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_rfc))

accuracy_score: 0.9761904761904762
recall_score 0.98
precision_score 0.9722222222222222
f1_score 0.9755102040816326
confusion_matrix [[17  0]
 [ 1 24]]


## XGB

In [35]:
from xgboost.sklearn import XGBClassifier
xgbc= XGBClassifier(n_estimators=100)
xgbc.fit(train_x, train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [36]:
pred_xgb = xgbc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_xgb))
print('recall_score',recall_score(test_y,pred_xgb,average='macro'))
print('precision_score', precision_score(test_y,pred_xgb,average='macro'))
print('f1_score',f1_score(test_y,pred_xgb,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_xgb))

accuracy_score: 0.9761904761904762
recall_score 0.98
precision_score 0.9722222222222222
f1_score 0.9755102040816326
confusion_matrix [[17  0]
 [ 1 24]]


# 다소 높은 상관관계이상 데이터만 사용

In [37]:
corr = data.corr()
lt = corr[(corr['positivity']>=0.4)|(corr['positivity']<=-0.4)].index
lt = list(lt)

In [38]:
lt

['positivity',
 'Diagnosis code',
 'FC',
 'LTC',
 'PC',
 'OC',
 'GCA',
 'GCP',
 'PQ',
 'CN',
 'PUT',
 'THA',
 'Composite*']

In [39]:
put2 = data[lt]
target = data['positivity']

In [40]:
train_x, test_x, train_y, test_y = train_test_split(put2, target, test_size=0.2, stratify=target)

In [41]:
rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)

RandomForestClassifier()

In [42]:
pred_rfc = rfc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_rfc))
print('recall_score',recall_score(test_y,pred_rfc,average='macro'))
print('precision_score', precision_score(test_y,pred_rfc,average='macro'))
print('f1_score',f1_score(test_y,pred_rfc,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_rfc))

accuracy_score: 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
confusion_matrix [[14  0]
 [ 0 20]]


In [43]:
xgbc= XGBClassifier()
xgbc.fit(train_x, train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [44]:
pred_xgb = xgbc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_xgb))
print('recall_score',recall_score(test_y,pred_xgb,average='macro'))
print('precision_score', precision_score(test_y,pred_xgb,average='macro'))
print('f1_score',f1_score(test_y,pred_xgb,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_xgb))

accuracy_score: 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
confusion_matrix [[14  0]
 [ 0 20]]


##  상관계수 높은 것들만 사용

In [45]:
corr = data.corr()
lt = corr[(corr['positivity']>=0.7)|(corr['positivity']<=-0.7)].index
lt = list(lt)

In [46]:
lt

['positivity',
 'FC',
 'LTC',
 'PC',
 'OC',
 'GCA',
 'GCP',
 'PQ',
 'CN',
 'PUT',
 'THA',
 'Composite*']

In [47]:
data.corr().loc['positivity'].sort_values(ascending=False)

positivity        1.000000
PQ                0.867611
GCP               0.861187
PC                0.848809
GCA               0.848655
Composite*        0.844886
FC                0.840276
LTC               0.829676
CN                0.796282
PUT               0.777077
THA               0.769440
OC                0.763990
Diagnosis code    0.409148
APOE              0.392635
GDS               0.366383
CDR-SOB           0.299935
MTC               0.283585
CDR               0.221290
Age               0.216661
Onset age         0.186080
Eduction          0.130179
ID                0.018108
Sex              -0.017150
SNSB             -0.063811
SGDepS           -0.118978
BS               -0.187940
Pons             -0.270811
Mid              -0.276745
MMSE             -0.354964
Name: positivity, dtype: float64

In [48]:
put2 = data[lt]
target = data['positivity']

In [49]:
train_x, test_x, train_y, test_y = train_test_split(put2, target, test_size=0.2, stratify=target)

In [50]:
rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)

RandomForestClassifier()

In [51]:
pred_rfc = rfc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_rfc))
print('recall_score',recall_score(test_y,pred_rfc,average='macro'))
print('precision_score', precision_score(test_y,pred_rfc,average='macro'))
print('f1_score',f1_score(test_y,pred_rfc,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_rfc))

accuracy_score: 0.9411764705882353
recall_score 0.9392857142857143
precision_score 0.9392857142857143
f1_score 0.9392857142857143
confusion_matrix [[13  1]
 [ 1 19]]


In [52]:
xgbc= XGBClassifier()
xgbc.fit(train_x, train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [53]:
pred_xgb = xgbc.predict(test_x)

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
print('accuracy_score:',accuracy_score(test_y,pred_xgb))
print('recall_score',recall_score(test_y,pred_xgb,average='macro'))
print('precision_score', precision_score(test_y,pred_xgb,average='macro'))
print('f1_score',f1_score(test_y,pred_xgb,average='macro'))
print('confusion_matrix',confusion_matrix(test_y,pred_xgb))

accuracy_score: 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
confusion_matrix [[14  0]
 [ 0 20]]


In [54]:
lt

['positivity',
 'FC',
 'LTC',
 'PC',
 'OC',
 'GCA',
 'GCP',
 'PQ',
 'CN',
 'PUT',
 'THA',
 'Composite*']