# 폐암 환자 생존율 예측

In [1]:
# List the contents of the current working directory in long format.
!ls -l

total 552
-rw-r--r-- 1 root root  49082 Feb  9 00:36 housing.csv
-rw-r--r-- 1 root root   4551 Feb  9 00:36 iris.csv
-rw-r--r-- 1 root root  23279 Feb  9 00:36 pima-indians-diabetes.csv
drwxr-xr-x 1 root root   4096 Feb  4 15:26 sample_data
-rw-r--r-- 1 root root  87776 Feb  9 00:36 sonar.csv
-rw-r--r-- 1 root root  21257 Feb  9 00:36 ThoraricSurgery.csv
-rw-r--r-- 1 root root 361279 Feb  9 00:36 wine.csv


### 데이터 탐색

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Load the thoracic-surgery dataset. header=None tells pandas that the file
# has no header row, so the columns are labelled with integers.
df = pd.read_csv(
    'ThoraricSurgery.csv',
    header=None,
)
# Preview the first rows.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,293,1,3.8,2.8,0,0,0,0,0,0,12,0,0,0,1,0,62,0
1,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60,0
2,8,2,3.19,2.5,1,0,0,0,1,0,11,0,0,1,1,0,66,1
3,14,2,3.98,3.06,2,0,0,0,1,1,14,0,0,0,1,0,80,1
4,17,2,2.21,1.88,0,0,1,0,0,0,12,0,0,0,1,0,56,0


In [6]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       470 non-null    int64  
 1   1       470 non-null    int64  
 2   2       470 non-null    float64
 3   3       470 non-null    float64
 4   4       470 non-null    int64  
 5   5       470 non-null    int64  
 6   6       470 non-null    int64  
 7   7       470 non-null    int64  
 8   8       470 non-null    int64  
 9   9       470 non-null    int64  
 10  10      470 non-null    int64  
 11  11      470 non-null    int64  
 12  12      470 non-null    int64  
 13  13      470 non-null    int64  
 14  14      470 non-null    int64  
 15  15      470 non-null    int64  
 16  16      470 non-null    int64  
 17  17      470 non-null    int64  
dtypes: float64(2), int64(16)
memory usage: 66.2 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
count,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0
mean,235.5,3.095745,3.281638,4.568702,0.780851,0.065957,0.144681,0.065957,0.687234,0.165957,11.73617,0.074468,0.004255,0.017021,0.821277,0.004255,62.534043,0.148936
std,135.821574,0.722309,0.871395,11.767857,0.535375,0.248472,0.352154,0.248472,0.464114,0.372439,0.702243,0.262811,0.065163,0.129488,0.383529,0.065163,8.706902,0.356405
min,1.0,1.0,1.44,0.96,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0
25%,118.25,3.0,2.6,1.96,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,1.0,0.0,57.0,0.0
50%,235.5,3.0,3.16,2.4,1.0,0.0,0.0,0.0,1.0,0.0,12.0,0.0,0.0,0.0,1.0,0.0,62.0,0.0
75%,352.75,3.0,3.8075,3.08,1.0,0.0,0.0,0.0,1.0,0.0,12.0,0.0,0.0,0.0,1.0,0.0,69.0,0.0
max,470.0,8.0,6.3,86.3,2.0,1.0,1.0,1.0,1.0,1.0,14.0,1.0,1.0,1.0,1.0,1.0,87.0,1.0


### 데이터셋 만들기

In [8]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [9]:
# Seed the global random generators of TensorFlow and NumPy with one shared
# value. The same `seed` is reused as random_state for the train/test split.
seed = 2021
tf.random.set_seed(seed)
np.random.seed(seed)

In [10]:
# Every column except the last is used as a feature; the last column is the
# label to predict.
# NOTE(review): column 0 spans 1..470 across 470 rows in describe() and looks
# like a record identifier rather than a measurement - confirm whether it
# should really be fed to the model as a feature.
features = df.iloc[:, :-1].values
target = df.iloc[:, -1].values

# Hold out 25% of the rows for testing, stratified on the label.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    stratify=target,
    random_state=seed,
)
X_train.shape, X_test.shape

((352, 17), (118, 17))

### 모델 정의

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [12]:
# Small fully connected network for binary classification:
#   - hidden layer with 30 ReLU units (30 is the hidden-layer node count)
#   - output layer with a single sigmoid unit, since this is binary classification
# The input width follows the number of feature columns. It is read from
# X_train instead of being hard-coded, so the model stays in sync with the
# feature matrix if columns are added or dropped in the split cell.
n_features = X_train.shape[1]
model = Sequential([
    Dense(30, input_shape=(n_features,), activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                540       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 571
Trainable params: 571
Non-trainable params: 0
_________________________________________________________________


### 모델 컴파일(실행환경 설정)

In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### 모델 학습

In [14]:
# Train for 30 epochs with mini-batches of 10 samples, holding out 20% of the
# training data for validation.
# The History object returned by fit() is kept in `history` so the per-epoch
# loss/accuracy values can be inspected afterwards, and so its bare repr is
# not echoed as the cell's output.
history = model.fit(
    X_train,
    Y_train,
    epochs=30,
    batch_size=10,
    validation_split=0.2,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f85136f4240>

### 정확도 평가

In [15]:
# Score the trained model on the held-out test split.
# Despite its name, `acc` holds the list [loss, accuracy] produced by
# evaluate() for this model (one loss plus the single 'accuracy' metric);
# the accuracy itself is acc[1].
acc = model.evaluate(x=X_test, y=Y_test)
acc



[0.46611106395721436, 0.8474576473236084]

In [16]:
# acc[1] is the accuracy entry of the evaluate() result; report it to 4 decimals.
print(f'Accuracy: {acc[1]:.4f}')

Accuracy: 0.8475
