# 데이터 전처리

In [21]:
import pandas as pd

data = pd.read_csv('../알츠하이머/data/FS_raw.csv')

In [22]:
data.positivity.unique()

array(['BAPL1', 'BAPL2', 'BAPL3'], dtype=object)

In [23]:
# 필요없는 컬럼 제거
df = data.copy()
df = df.drop(['ID','PET ligand type', 'Onset age'], axis=1)

In [24]:
# object형 데이터 인코딩
df['positivity'] = df['positivity'].apply(lambda x: 0 if x=='BAPL1' else 1)
df.APOE = df.APOE.apply(lambda x: 1 if x in ['E3/E4', 'E4/E4'] else 0)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 60 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   positivity      168 non-null    int64  
 1   Sex             168 non-null    int64  
 2   Age             168 non-null    int64  
 3   Eduction        168 non-null    int64  
 4   Diagnosis code  168 non-null    int64  
 5   APOE            168 non-null    int64  
 6   SNSB            168 non-null    int64  
 7   MMSE            168 non-null    int64  
 8   CDR             168 non-null    float64
 9   CDR-SOB         168 non-null    float64
 10  GDS             168 non-null    int64  
 11  SGDepS          168 non-null    int64  
 12  (SUV)FC-l       168 non-null    float64
 13  (SUV)FC-r       168 non-null    float64
 14  (SUV)LTC-l      168 non-null    float64
 15  (SUV)LTC-r      168 non-null    float64
 16  (SUV)PC-l       168 non-null    float64
 17  (SUV)PC-r       168 non-null    flo

## H0: 0/1/2/3/4/5

In [5]:
ndf = df.copy()
ndf_corr = pd.DataFrame(ndf.corr().iloc[:,4].sort_values(ascending=False))
ndf_corr = ndf_corr.rename(columns={'Diagnosis code':'corr'})

strong_corr = ndf_corr[(ndf_corr['corr']>0.4)|(ndf_corr['corr']<=-0.4)]
lt = strong_corr.index

nndf = ndf[lt]
X = nndf.drop('Diagnosis code', axis=1)
y = nndf['Diagnosis code']

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [25]:
df.positivity.unique()

array([0, 1], dtype=int64)

In [27]:
df.corr().iloc[:,4].sort_values(ascending=False)

Diagnosis code    1.000000
GDS               0.795828
CDR-SOB           0.667014
(SUV)PUT-l        0.459694
(SUV)PUT-r        0.445789
(SUV)GCA-l        0.442903
(SUV)GCA-r        0.440731
(SUV)GCP-l        0.439247
(SUV)GCP-r        0.429696
(SUV)PQ-r         0.427482
(SUV)PQ-l         0.415662
(SUV)FC-r         0.414658
(SUV)LTC-r        0.410479
positivity        0.409148
CDR               0.405876
(SUV)FC-l         0.402696
(SUV)CN-r         0.396309
(SUV)PC-r         0.394875
(SUV)LTC-l        0.394690
(SUV)CN-l         0.384433
(SUV)PC-l         0.376564
(SUV)OC-l         0.372480
(SUV)OC-r         0.372480
APOE              0.321481
(SUV)THA-l        0.232711
(SUV)THA-r        0.228025
SGDepS            0.116523
(SUV)CBL-l        0.108709
Eduction          0.102318
(SUV)MTC-l        0.073680
(SUV)MTC-r        0.052372
Age               0.039415
(Vol)GCP-r        0.020688
(Vol)CBL-l        0.007633
(Vol)GCA-r       -0.020246
(Vol)GCP-l       -0.021651
(Vol)CBL-r       -0.029384
(

In [6]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=8, n_estimators=150, random_state=42)
rfc.fit(train_x, train_y)

RandomForestClassifier(max_depth=8, n_estimators=150, random_state=42)

In [7]:
pred= rfc.predict(test_x)

pred_proba = rfc.predict_proba(test_x)

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
print(accuracy_score(test_y, pred))
print(f1_score(test_y, pred, average='micro'))

0.7941176470588235
0.7941176470588235


In [9]:
from sklearn.model_selection import GridSearchCV

# 모델 선택
model = RandomForestClassifier(random_state=42)

# 그리드 정의
param_grid = {
    'n_estimators': [75,100,125,150,175],  # 첫 번째 하이퍼파라미터 값들의 리스트
    'max_depth': [None,7, 8, 9, 10,20],  # 두 번째 하이퍼파라미터 값들의 리스트
}

# 그리드 서치 수행
grid_search = GridSearchCV(model, param_grid, cv=5)  # 5-겹 교차 검증을 사용
grid_search.fit(X, y)

# 최적의 하이퍼파라미터 조합과 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 성능:", grid_search.best_score_)



최적의 하이퍼파라미터: {'max_depth': 7, 'n_estimators': 100}
최고 성능: 0.7385026737967915


## 가설1: 0,1,2,3,4 / 5

In [10]:
ndf = df.copy()
ndf['Diagnosis code'] = ndf['Diagnosis code'].map(lambda x: 1 if x==5 else 0)
ndf_corr = pd.DataFrame(ndf.corr().iloc[:,4].sort_values(ascending=False))
ndf_corr = ndf_corr.rename(columns={'Diagnosis code':'corr'})

strong_corr = ndf_corr[(ndf_corr['corr']>0.4)|(ndf_corr['corr']<=-0.4)]
lt = strong_corr.index

nndf = ndf[lt]
X = nndf.drop('Diagnosis code', axis=1)
y = nndf['Diagnosis code']

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [11]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rfc.fit(train_x, train_y)

RandomForestClassifier(random_state=42)

In [12]:
pred= rfc.predict(test_x)
pred_proba = rfc.predict_proba(test_x)

In [13]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
print(accuracy_score(test_y, pred))
print(f1_score(test_y, pred, average='micro'))
print(roc_auc_score(test_y, pred_proba[:,1]))

1.0
1.0
1.0


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 모델 선택
model = RandomForestClassifier(random_state=42)

# 그리드 정의
param_grid = {
    'n_estimators': [75,100,125,150,175],  # 첫 번째 하이퍼파라미터 값들의 리스트
    'max_depth': [None,7, 8, 9, 10,20],  # 두 번째 하이퍼파라미터 값들의 리스트
}

# 그리드 서치 수행
grid_search = GridSearchCV(model, param_grid, cv=5)  # 5-겹 교차 검증을 사용
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터 조합과 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 성능:", grid_search.best_score_)

최적의 하이퍼파라미터: {'max_depth': None, 'n_estimators': 125}
최고 성능: 0.8943019943019943


In [15]:
best_model = grid_search.best_estimator_
best_model.fit(X, y)
grid_pred = best_model.predict(test_x)
grid_pred_proba = rfc.predict_proba(test_x)
print(accuracy_score(test_y, grid_pred))
print(f1_score(test_y, grid_pred, average='micro'))
print(roc_auc_score(test_y, grid_pred_proba[:,1]))

1.0
1.0
1.0


In [16]:
len(lt)

18

## 가설2: 0,1,2,3 / 4,5

In [17]:
ndf = df.copy()
ndf['Diagnosis code'] = ndf['Diagnosis code'].map(lambda x: 1 if x in [4,5] else 0)
ndf_corr = pd.DataFrame(ndf.corr().iloc[:,4].sort_values(ascending=False))
ndf_corr = ndf_corr.rename(columns={'Diagnosis code':'corr'})

strong_corr = ndf_corr[(ndf_corr['corr']>0.4)|(ndf_corr['corr']<=-0.4)]
lt = strong_corr.index

nndf = ndf[lt]
X = nndf.drop('Diagnosis code', axis=1)
y = nndf['Diagnosis code']

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=150, max_depth=8)
rfc.fit(train_x, train_y)

RandomForestClassifier(max_depth=8, n_estimators=150)

In [19]:
pred= rfc.predict(test_x)
pred_proba = rfc.predict_proba(test_x)

In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
print(accuracy_score(test_y, pred))
print(f1_score(test_y, pred, average='micro'))
print(roc_auc_score(test_y, pred_proba[:,1]))

0.8235294117647058
0.8235294117647058
0.9221453287197232


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 모델 선택
model = RandomForestClassifier(random_state=42)

# 그리드 정의
param_grid = {
    'n_estimators': [140, 145,150,155, 160],  # 첫 번째 하이퍼파라미터 값들의 리스트
    'max_depth': [None,7, 8, 9, 10,20],  # 두 번째 하이퍼파라미터 값들의 리스트
}

# 그리드 서치 수행
grid_search = GridSearchCV(model, param_grid, cv=5)  # 5-겹 교차 검증을 사용
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터 조합과 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 성능:", grid_search.best_score_)

최적의 하이퍼파라미터: {'max_depth': None, 'n_estimators': 140}
최고 성능: 0.9028490028490029


## 가설3: 0,1,2/3,4,5

In [22]:
ndf = df.copy()
ndf['Diagnosis code'] = ndf['Diagnosis code'].map(lambda x: 1 if x in [3,4,5] else 0)
ndf_corr = pd.DataFrame(ndf.corr().iloc[:,4].sort_values(ascending=False))
ndf_corr = ndf_corr.rename(columns={'Diagnosis code':'corr'})

strong_corr = ndf_corr[(ndf_corr['corr']>0.4)|(ndf_corr['corr']<=-0.4)]
lt = strong_corr.index

nndf = ndf[lt]
X = nndf.drop('Diagnosis code', axis=1)
y = nndf['Diagnosis code']

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [23]:
rfc = RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42)
rfc.fit(train_x, train_y)

pred= rfc.predict(test_x)
pred_proba = rfc.predict_proba(test_x)

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
print(accuracy_score(test_y, pred))
print(f1_score(test_y, pred, average='micro'))
print(roc_auc_score(test_y, pred_proba[:,1]))

0.9705882352941176
0.9705882352941176
1.0


In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 모델 선택
model = RandomForestClassifier(random_state=42)

# 그리드 정의
param_grid = {
    'n_estimators': [30,35,40,45,50,55,60,65],  # 첫 번째 하이퍼파라미터 값들의 리스트
    'max_depth': [None,7, 8, 9, 10,20],  # 두 번째 하이퍼파라미터 값들의 리스트
}

# 그리드 서치 수행
grid_search = GridSearchCV(model, param_grid, cv=5)  # 5-겹 교차 검증을 사용
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터 조합과 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 성능:", grid_search.best_score_)

최적의 하이퍼파라미터: {'max_depth': None, 'n_estimators': 30}
최고 성능: 0.9475783475783475


## 가설4: 0,1,2/3,4/5

In [25]:
def f(x):
    if x==5:
        return 2
    elif x in [3,4]:
        return 1
    else:
        return 0
    
ndf = df.copy()
ndf['Diagnosis code'] = ndf['Diagnosis code'].map(f)
ndf_corr = pd.DataFrame(ndf.corr().iloc[:,4].sort_values(ascending=False))
ndf_corr = ndf_corr.rename(columns={'Diagnosis code':'corr'})

strong_corr = ndf_corr[(ndf_corr['corr']>0.4)|(ndf_corr['corr']<=-0.4)]
lt = strong_corr.index

nndf = ndf[lt]
X = nndf.drop('Diagnosis code', axis=1)
y = nndf['Diagnosis code']

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [26]:
from sklearn.model_selection import train_test_split

# 데이터 준비
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

rfc = RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42)
rfc.fit(train_x, train_y)

pred= rfc.predict(test_x)
pred_proba = rfc.predict_proba(test_x)

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
print(accuracy_score(test_y, pred))
print(f1_score(test_y, pred, average='micro'))

0.7647058823529411
0.7647058823529412


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 모델 선택
model = RandomForestClassifier(random_state=42)

# 그리드 정의
param_grid = {
    'n_estimators': [1,2,3,4,5,6,7,8,9,10],  # 첫 번째 하이퍼파라미터 값들의 리스트
    'max_depth': [None,7, 8, 9, 10,20],  # 두 번째 하이퍼파라미터 값들의 리스트
}

# 그리드 서치 수행
grid_search = GridSearchCV(model, param_grid, cv=5)  # 5-겹 교차 검증을 사용
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터 조합과 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 성능:", grid_search.best_score_)

최적의 하이퍼파라미터: {'max_depth': None, 'n_estimators': 9}
최고 성능: 0.8732193732193732


In [28]:
best_model = grid_search.best_estimator_
grid_pred = best_model.predict(test_x)
grid_pred_proba = best_model.predict_proba(test_x)
print(accuracy_score(test_y, grid_pred))
print(f1_score(test_y, grid_pred, average='micro'))
print(roc_auc_score(test_y, grid_pred_proba, multi_class='ovr'))

0.7941176470588235
0.7941176470588235
0.8838326105567486


In [29]:
grid_pred_proba

array([[0.        , 0.11111111, 0.88888889],
       [0.        , 1.        , 0.        ],
       [0.        , 0.11111111, 0.88888889],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.22222222, 0.77777778],
       [0.        , 0.11111111, 0.88888889],
       [0.33333333, 0.66666667, 0.        ],
       [0.22222222, 0.77777778, 0.        ],
       [0.        , 0.44444444, 0.55555556],
       [0.11111111, 0.55555556, 0.33333333],
       [0.11111111, 0.22222222, 0.66666667],
       [0.        , 0.        , 1.        ],
       [0.22222222, 0.77777778, 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.88888889, 0.11111111],
       [0.        , 0.11111111, 0.88888889],
       [0.        , 0.77777778, 0.22222222],
       [0.        , 0.44444444, 0.55555556],
       [0.        , 0.22222222, 0.77777778],
       [0.        , 0.        , 1.        ],
       [0.