# Classification and Regression Tree (CART)

## Titanic Dataset
- Kaggle knowledge competition: https://www.kaggle.com/c/titanic

In [1]:
# read in the data
import pandas as pd
url = 'data/titanic.csv'
titanic = pd.read_csv(url)

# encode female as 0 and male as 1
titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})

# fill in the missing values for age with the median age
titanic.Age.fillna(titanic.Age.median(), inplace=True)

# create a DataFrame of dummy variables for Embarked
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked')
embarked_dummies.drop(embarked_dummies.columns[0], axis=1, inplace=True)

# concatenate the original DataFrame and the dummy DataFrame
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# print the updated DataFrame
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,0,1


In [2]:
# define X and y
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
X = titanic[feature_cols]
y = titanic.Survived

- class sklearn.tree.DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)[source]¶

In [11]:
# fit a classification tree with max_depth=3 on all data
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_leaf_nodes=10, max_depth=4, random_state=1234)
treeclf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1234,
            splitter='best')

In [12]:
treeclf.decision_path(X).todense()

matrix([[1, 0, 1, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 1, 0],
        ...,
        [1, 1, 0, ..., 0, 1, 0],
        [1, 0, 1, ..., 0, 0, 0],
        [1, 0, 1, ..., 0, 0, 0]])

## CART 그림 도출 - 1 (추천 방법)

- `sklearn.tree.export_graphviz`를 이용하여 **.dot** 파일을 생성
- 생성된 dot 파일을 텍스트 편집기에서 불러들인 후, 이를 복사하여 [WebGraphviz.com](http://webgraphviz.com/)에 붙여넣으면 의사결정나무 그림을 얻을 수 있음

In [14]:
# Alternatively, you can create tree images
# using dot file and WebGraphviz (http://webgraphviz.com/)
from sklearn.tree import export_graphviz

with open("titanic_tree.dot", "w") as f:
    f = export_graphviz(treeclf, 
                        out_file = f, 
                        feature_names = X.columns,
                        class_names = ["Dead", "Survived"], # 클래스가 [0,1]이므로 이 순서를 맞춰서 이름을 할당
                        filled = True,
                        rounded = True)

## CART 그림 도출 - 2

- **GraphViz** 프로그램 다운로드 후 설치: http://www.graphviz.org/
- 환경변수(PATH)에 GraphViz 실행파일 경로를 추가: ex) 'C:\Program Files (x86)\Graphviz2.38\bin'를 환경변수에 추가
- **pydotplus** 패키지 설치: pip install pydotplus

In [None]:
# !pip install pydotplus

In [None]:
# create a Graphviz file and make pdf or png files
# You must install GraphViz
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydotplus
from IPython.display import Image 
dot_data = StringIO()
export_graphviz(treeclf, out_file= dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_pdf("titanic_tree.pdf")
# graph.write_png("titanic_tree.png")

## Hands-on

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=123)

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.375,
                                                  random_state=123)

In [18]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(445, 5)
(267, 5)
(179, 5)


#### Decision Tree 모델 사용

In [23]:
from sklearn.metrics import accuracy_score

max_depth_set = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
dts = []

for max_depth in max_depth_set:
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    
    y_train_pred = dt.predict(X_train)
    y_val_pred = dt.predict(X_val)
    # print(accuracy_score(y_val, y_val_pred))
    
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    
    print(max_depth, '\t', accuracy_train, '\t', accuracy_val)
    
    dts.append(dt)

3 	 0.7910112359550562 	 0.8052434456928839
4 	 0.8179775280898877 	 0.8277153558052435
5 	 0.8269662921348314 	 0.8202247191011236
6 	 0.8561797752808988 	 0.7940074906367042
7 	 0.8674157303370786 	 0.7415730337078652
8 	 0.8808988764044944 	 0.7677902621722846
9 	 0.898876404494382 	 0.7602996254681648
10 	 0.903370786516854 	 0.7602996254681648
11 	 0.9078651685393259 	 0.7565543071161048
12 	 0.9123595505617977 	 0.7528089887640449


- Training Accuracy는 상승하지만, Validation Accuracy는 더 이상 좋아지 않는다.
- 최적의 Decision Tree는 4이다. 

In [35]:
best_dt = dts[1] # C=4
print(best_dt)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


#### Logistic Regression 사용

In [26]:
from sklearn.linear_model import LogisticRegression

C_set = [0.01, 0.1, 1.0, 10.0, 100.0]
lrs = []

for C in C_set:
    lr = LogisticRegression(C=C, random_state=1234)
    lr.fit(X_train, y_train)
    
    y_train_pred = lr.predict(X_train)
    y_val_pred = lr.predict(X_val)
    # print(accuracy_score(y_val, y_val_pred))
    
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    
    print(C, '\t', accuracy_train, '\t', accuracy_val)
    
    lrs.append(lr)

0.01 	 0.6943820224719102 	 0.6779026217228464
0.1 	 0.7887640449438202 	 0.7902621722846442
1.0 	 0.7775280898876404 	 0.7902621722846442
10.0 	 0.7842696629213484 	 0.7940074906367042
100.0 	 0.7820224719101123 	 0.797752808988764




- C가 100으로 했을 때 Validation Accuracy가 가장 최적합이다.

In [34]:
best_lr = lrs[4] # C=100.0
print(best_lr)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1234, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [32]:
print(accuracy_score(y_test, best_dt.predict(X_test)))
print(accuracy_score(y_test, best_lr.predict(X_test)))

0.8379888268156425
0.8100558659217877


### Refitting
- 실전에서 사용할 때 Refitting을 해서 적용하다.

In [36]:
X_con = pd.concat([X_train, X_val])
y_con = pd.concat([y_train, y_val])

In [38]:
# Refitting
best_dt.fit(X_con, y_con)
best_lr.fit(X_con, y_con)



LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1234, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
print(accuracy_score(y_test, best_dt.predict(X_test)))
print(accuracy_score(y_test, best_lr.predict(X_test)))

0.8435754189944135
0.8100558659217877


### Make the go-live model

In [42]:
best_dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Homework
1. k-NN도 여러 가지 세팅으로 Training and Validation
2. Test set에 베스트 모델을 평가