## 의사결정트리(decision tree)
- 선택을 해야하는 상황에 주로 쓰임.
- 모델을 만드는데 있어서 '트리'라는 자료 구조를 사용하여 의사를 결정한다.
<br><br>

### 1. 조건
- 1) 연봉
- 2) 출퇴근시간
- 3) 무료 커피
<br><br>

### 2. 의사 결정 전략
- 1) 교사학습: 답을 알려주고 예측함
- 2) 비교사학습: 
<br><br>

### 3. 최고의 분할 선택
- purity(순도)  / 순수(pure): 단일 클래스로 이루어진 부분집합
- 의사결정트리에서는 1번 분기할 때 마다 2개씩 나뉘어진다.(트리처럼)
- 순도를 최소화 하는 선(구분할 수 있는 선?)을 찾기 위해 학습을 한다.
- 정보 획득량이 최대인 구분선을 찾는 것이 알고리즘을 만드는 법.
- 정보 획득, 순도 계산 방법
<br><br>

### 4. 정보 획득, 순도 계산 방법
- 순도 측정 방법: 대표적으로 엔트로피(entropy)
- 엔트로피가 1에 가까울수록 많이 복잡하다
- 0에 가까울수록 순수하다


## Titanic

In [73]:
import pandas as pd
import numpy as np

In [74]:
train = pd.read_csv('train.csv')
train.head()
train.shape

(891, 12)

In [75]:
test = pd.read_csv('test.csv')
test.shape

(418, 11)

In [76]:
print(train.head())
print(train.info())
# cabin 결측값이 많네 

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [77]:
train.describe()
# PassengerId는 survived와 관련이 없군.
# 

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [78]:
train.isnull().sum()
# train의 Age 열에 대한 평균값으로 NaN 대체하여 Age_mean열에 저장.

train['Age_mean'] = train['Age']

train.Age[train.Age.isnull()] = train.Age.mean()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [79]:
# NaN값 대체 방법
# train 데이터에 적용했던 것 그대로 test 데이터에 적용해야해!! 검증은 test로 하니까.
train.Age_mean.fillna(train['Age'].mean, inplace = True)
train.Age_mean.isnull().sum()

0

In [148]:
test['Age_mean'] = test.Age
test.Age_mean.fillna(test['Age'].mean, inplace = True)
test.Age_mean.isnull().sum()

0

In [149]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Age_mean         0
Embarked_S       0
Embarked_C       0
Embarked_Q       0
FamilySize       0
Family           0
Family_S         0
Family_M         0
Family_L         0
Gender           0
dtype: int64

In [150]:
# 성별에 따른 데이터 빈도 수 계산
train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [151]:
# female = True / male = False 저장
train['Gender'] = train.Sex == 'female'
test['Gender'] = test.Sex == 'female'

In [152]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [153]:
train['Embarked_S'] = train['Embarked'] == 'S'
train['Embarked_C'] = train['Embarked'] == 'C'
train['Embarked_Q'] = train['Embarked'] == 'Q'

In [154]:
train['Embarked_S'].sum() # 644건이 True
train['Embarked_C'].sum() # 168
train['Embarked_Q'].sum() # 77

77

In [155]:
test['Embarked_S'] = test['Embarked'] == 'S'
test['Embarked_C'] = test['Embarked'] == 'C'
test['Embarked_Q'] = test['Embarked'] == 'Q'

In [156]:
train['FamilySize'] = train.SibSp+train.Parch+1
train['FamilySize'].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64

In [157]:
test['FamilySize'] = test.SibSp+test.Parch+1

In [158]:
train.head()
train.Family = train.FamilySize
train.FamilySize

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Name: FamilySize, Length: 891, dtype: int64

In [159]:
test['Family'] = test['FamilySize']

In [160]:
train.loc[train.FamilySize ==1, 'Family'] = 'S'
train.loc[(train.FamilySize >= 2) & (train.FamilySize < 5), 'Family'] = 'M'
train.loc[train.FamilySize >=5, 'Family'] = 'L'

In [161]:
test.loc[test.FamilySize ==1, 'Family'] = 'S'
test.loc[(test.FamilySize >= 2) & (test.FamilySize < 5), 'Family'] = 'M'
test.loc[test.FamilySize >=5, 'Family'] = 'L'

In [162]:
train[['Family','FamilySize']].head()

Unnamed: 0,Family,FamilySize
0,M,2
1,M,2
2,S,1
3,M,2
4,S,1


In [163]:
train['Family_S'] = train['Family'] == 'S'
train['Family_M'] = train['Family'] == 'M'
train['Family_L'] = train['Family'] == 'L'

In [164]:
test['Family_S'] = test['Family'] == 'S'
test['Family_M'] = test['Family'] == 'M'
test['Family_L'] = test['Family'] == 'L'

In [165]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Age_mean,Embarked_S,Embarked_C,Embarked_Q,FamilySize,Family,Family_S,Family_M,Family_L,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,22,True,False,False,2,M,False,True,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,38,False,True,False,2,M,False,True,False,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,26,True,False,False,1,S,True,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,35,True,False,False,2,M,False,True,False,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,35,True,False,False,1,S,True,False,False,False


In [166]:
train['Family'].value_counts()

S    537
M    292
L     62
Name: Family, dtype: int64

In [167]:
test['Family'].value_counts()

S    253
M    145
L     20
Name: Family, dtype: int64

In [168]:
train['Pclass'] = train['Pclass'].astype('category')
train['Pclass']

0      3
1      1
2      3
3      1
4      3
      ..
886    2
887    1
888    3
889    1
890    3
Name: Pclass, Length: 891, dtype: category
Categories (3, int64): [1, 2, 3]

In [169]:
# 어떤 피쳐를 쓸 것인가?
train.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_mean',
       'Embarked_S', 'Embarked_C', 'Embarked_Q', 'FamilySize', 'Family',
       'Family_S', 'Family_M', 'Family_L', 'Gender'],
      dtype='object')

In [170]:
# 독립변수(입력데이터)
fn = ['Gender','Age_mean','Embarked_S', 'Embarked_C', 'Embarked_Q','Family_S', 'Family_M', 'Family_L']
xTrain = train[fn]
xTrain.head()

Unnamed: 0,Gender,Age_mean,Embarked_S,Embarked_C,Embarked_Q,Family_S,Family_M,Family_L
0,False,22,True,False,False,False,True,False
1,True,38,False,True,False,False,True,False
2,True,26,True,False,False,True,False,False
3,True,35,True,False,False,False,True,False
4,False,35,True,False,False,True,False,False


In [171]:
# 종속변수(출력데이터)
yLabel = train['Survived']

In [172]:
# 문제: xTrain, 정답: yLabel → 모델링(의사결정 알고리듬) → 모델
# 테스트 입력데이터: xTest ----------------------------> 입력 => 생존 여부 출력(예측결과) => 

In [174]:
xTest = test[fn]

In [176]:
# 모델링
from sklearn.tree import DecisionTreeClassifier
# 터미널 노드까지 내려가지 말고 3에서 잘라
model = DecisionTreeClassifier(max_depth = 3, random_state=2020)
model

DecisionTreeClassifier(max_depth=3, random_state=2020)

In [178]:
model.fit(xTrain, yLabel)

TypeError: float() argument must be a string or a number, not 'method'