In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 분류모델
## 의사결정나무 분류모델

In [2]:
from sklearn.tree import DecisionTreeClassifier
# 학습용, 평가용 데이터 구분 패키지
from sklearn.model_selection import train_test_split

* 학습용 : 평가용 = 7:3 또는 8:2
* train : validation : test(new data) = 60 : 15 : 25 → validation은 학습 도중 모델을 중간 점검하는 용도

### 데이터 불러오기

In [3]:
# Location of the source CSV (Titanic dataset in the datasciencedojo/datasets GitHub repo)
path = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

In [4]:
# Load the CSV into a DataFrame
raw = pd.read_csv(path)
# Work on a copy so the untouched original stays available in `raw`
df = raw.copy()

### 데이터 확인하기

In [5]:
# Preview the first two rows
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
# Print (rows, columns), then per-column non-null counts and dtypes
print(df.shape)
df.info()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Basic summary statistics
# Reading order: look at the mean first -> compare it with the median (50%) -> check max and min
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# List the column names
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### 결측치 확인하기

In [9]:
# Check for missing values in Age and count them
df['Age'].isnull().sum()

177

In [10]:
# Rule of thumb: rows with missing values can simply be dropped only when they make up
# less than 5% of the data.
# Compute the missing-Age percentage from `raw` (not modified by the cells shown here)
# instead of hard-coding 177 / 891, so the figure follows the data and stays the same
# even if this cell is re-run after Age has been imputed in `df`.
float(raw['Age'].isnull().mean() * 100)

19.865319865319865

In [11]:
# Simple imputation: replace missing Age values with the column mean.
# NOTE(review): the mean is computed on the full dataset before the train/test split,
# so test rows influence the imputed value — consider deriving it from the training split only.
age_mean = df['Age'].mean()
# Assign the result back instead of calling fillna(inplace=True) on `df.Age`:
# the in-place call on a selected column is chained assignment, which pandas 2.x warns
# about and which does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(age_mean)

In [12]:
# Re-check the non-null counts after imputing Age
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
# Embarked: frequency of each category (value_counts() leaves out missing values by default)
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [14]:
# Mode imputation: fill missing Embarked values with the most frequent category.
embarked_mode = df['Embarked'].mode()[0]
# Assign the result back instead of calling fillna(inplace=True) on `df.Embarked`:
# the in-place call on a selected column is chained assignment, which pandas 2.x warns
# about and which does not update `df` under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

In [15]:
# Re-check the non-null counts after imputing Embarked
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
# Sex -> integer labels 0/1. This is label encoding (one integer code per category),
# not one-hot encoding.
from sklearn.preprocessing import LabelEncoder
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
df['Sex']

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int32

In [17]:
# Label-encode Embarked the same way as Sex.
# LabelEncoder is already imported in the Sex-encoding cell, so it is not imported again here.
# NOTE(review): Embarked is a nominal category; integer codes imply an ordering. That is
# workable for a decision tree, but confirm before reusing these codes with other models.
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])
df['Embarked']

0      2
1      0
2      2
3      2
4      2
      ..
886    2
887    2
888    2
889    0
890    1
Name: Embarked, Length: 891, dtype: int32

In [None]:
# Derived feature
# SibSp & Parch >> Family: the sum of the SibSp and Parch counts for each passenger.
# The feature-selection cell further down selects a 'Family' column, so it must be
# created here; otherwise that selection raises KeyError.
df['Family'] = df['SibSp'] + df['Parch']


In [20]:
# Drop the Cabin column (only 204 of 891 rows have a value).
# Reassign instead of using inplace=True, and ignore a missing column, so that re-running
# this cell after Cabin is already gone does not raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

### 분석 데이터셋 준비
* X : 독립변수 (설명변수)
* y : 종속변수 (예측하려는 결과, Survived)

In [18]:
# List the columns available before selecting features.
# NOTE(review): the saved output still shows 'Cabin' because this cell was executed (In [18])
# before the drop cell placed above it (In [20]); re-run the notebook top to bottom to refresh it.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# Feature matrix: the selected predictor columns ('Family' must already exist in df).
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked', 'Family']
X = df[feature_cols]
# Target vector: the Survived label.
y = df['Survived']

### 데이터 분할하기
* 8 : 2

In [None]:
# Hold out 20% of the rows as the test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# Check the size of each split via .shape (same order: X_train, X_test, y_train, y_test)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

### 모델링
* 의사결정나무 모델 가져오기

In [None]:
# Decision tree classifier with a fixed random_state
dt = DecisionTreeClassifier(random_state=42)

# Fit (train) the model on the training split; this produces the trained model
dt.fit(X_train, y_train)

In [None]:
# Use the trained model to predict labels for the test data
y_pred = dt.predict(X_test)

In [None]:
# Goal: minimize the error
# The difference between y_pred and y_real (the true labels) should be as close to 0 as possible!


### 모델 성능 비교 - 정확도 측정

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy: compare the true test labels with the model's predictions
acc = accuracy_score(y_test, y_pred)
# End the cell with the value so the score is actually displayed
# (the original computed `acc` but never showed it)
acc
