# 지도 학습 - Classification

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import font_manager, rc, cm
import matplotlib as mpl

# Matplotlib text setup, applied in one call on the shared rcParams object.
mpl.rcParams.update({
    'axes.unicode_minus': False,   # work around the broken minus-sign rendering
    'font.family': 'NanumGothic',  # font used for Korean (Hangul) text
})

import sklearn
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Titanic

## 데이터 가져오기

In [103]:
# URL of the Titanic passenger CSV (read over HTTP by pandas).
file_path = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

# `raw` keeps the frame exactly as loaded; `titanic` is the working copy that
# the following cells modify.
raw = pd.read_csv(file_path)
titanic = raw.copy()
print(titanic.shape)
titanic.head(2)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [104]:
# Column dtypes and non-null counts, used to spot columns with missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [105]:
# Summary statistics
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## EDA
* 원본 컬럼을 함부로 삭제하거나 덮어쓰지 않기 (전처리 결과는 새 컬럼으로 추가하기)

### 결측치 처리

In [106]:
# Age: fill missing values with the column mean, writing the result to a new
# column (Age_new) rather than overwriting Age.
# NOTE(review): the mean is computed over all rows, i.e. before the train/test
# split, so test rows influence the imputed value — confirm this is acceptable.
age_mean = titanic.Age.mean()
titanic['Age_new'] = titanic.Age.fillna(age_mean)

In [107]:
# Embarked: fill missing values with the most frequent value (mode), writing the
# result to a new column (Embarked_new).
# mode() returns a Series; [0] selects its first entry.
embarked_mode = titanic.Embarked.mode()[0]
titanic['Embarked_new'] = titanic.Embarked.fillna(embarked_mode)

In [108]:
titanic.info() # confirm that the missing values were filled in

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int64  
 1   Survived      891 non-null    int64  
 2   Pclass        891 non-null    int64  
 3   Name          891 non-null    object 
 4   Sex           891 non-null    object 
 5   Age           714 non-null    float64
 6   SibSp         891 non-null    int64  
 7   Parch         891 non-null    int64  
 8   Ticket        891 non-null    object 
 9   Fare          891 non-null    float64
 10  Cabin         204 non-null    object 
 11  Embarked      889 non-null    object 
 12  Age_new       891 non-null    float64
 13  Embarked_new  891 non-null    object 
dtypes: float64(3), int64(5), object(6)
memory usage: 97.6+ KB


In [109]:
# Drop the original columns that still contain missing values:
#   - Age and Embarked have imputed replacements (Age_new / Embarked_new)
#   - Cabin is discarded outright
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
titanic = titanic.drop(columns=['Age', 'Cabin', 'Embarked'], errors='ignore')

In [110]:
titanic.info() # missing values handled; the data types still need converting (encoding)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int64  
 1   Survived      891 non-null    int64  
 2   Pclass        891 non-null    int64  
 3   Name          891 non-null    object 
 4   Sex           891 non-null    object 
 5   SibSp         891 non-null    int64  
 6   Parch         891 non-null    int64  
 7   Ticket        891 non-null    object 
 8   Fare          891 non-null    float64
 9   Age_new       891 non-null    float64
 10  Embarked_new  891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [111]:
# Label-encode the Sex and Embarked_new columns into integer codes.
# LabelEncoder is imported here because, besides the binary Sex column, other
# columns need encoding as well.
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted for each (source column -> new label column) pair.
for source_col, label_col in (('Sex', 'Sex_label'), ('Embarked_new', 'Embarked_label')):
    titanic[label_col] = LabelEncoder().fit_transform(titanic[source_col])

In [112]:
# Derived feature: FamilySize is the sum of SibSp and Parch.
titanic['FamilySize'] = titanic.SibSp + titanic.Parch

In [113]:
# Check that the label-encoded and derived columns were added.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Sex             891 non-null    object 
 5   SibSp           891 non-null    int64  
 6   Parch           891 non-null    int64  
 7   Ticket          891 non-null    object 
 8   Fare            891 non-null    float64
 9   Age_new         891 non-null    float64
 10  Embarked_new    891 non-null    object 
 11  Sex_label       891 non-null    int32  
 12  Embarked_label  891 non-null    int32  
 13  FamilySize      891 non-null    int64  
dtypes: float64(2), int32(2), int64(6), object(4)
memory usage: 90.6+ KB


In [114]:
titanic.head(2) # encoding done >> next step: prepare the modelling data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Age_new,Embarked_new,Sex_label,Embarked_label,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,22.0,S,1,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,38.0,C,0,0,1


In [115]:
# Prepare the modelling data
# Independent variables (features)
# Earlier variant that also included 'PassengerId', kept for reference
# (note: it indexes a frame named `df`, not `titanic`):
# X = df[['PassengerId', 'Pclass', 'Sex_label', 'Age_new', 'Fare', 'Embarked_label', 'FamilySize']]
X = titanic[['Pclass', 'Sex_label', 'Age_new', 'Fare', 'Embarked_label', 'FamilySize']]
# Dependent variable (target)
y = titanic['Survived']

In [116]:
# Train/test split: 20% of the rows are held out, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shape of each of the four pieces on one line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(712, 6) (712,) (179, 6) (179,)


## 모델링 - 의사결정나무

In [117]:
# First model: decision tree (random_state fixed for reproducibility)
model = DecisionTreeClassifier(random_state=42)

# Fit on the training split
model.fit(X_train, y_train)

# Predict on the test split
y_pred = model.predict(X_test)

In [118]:
# Model evaluation - measure accuracy
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
# Printed as a percentage with two decimals (the Korean label means "accuracy").
print('정확도 : {:.2f}%'.format(acc*100))

정확도 : 75.42%


* PassengerId 포함 시 : 75.98%
* PassengerId 제외 시 : 75.42%

## 모델링 - 서포트벡터머신
* 라벨 인코딩은 순서가 없는 범주에 임의의 크기(순서)를 부여하는 문제가 있기 때문에 원-핫 인코딩을 해주어야 한다.
* 서포트 벡터 머신에 맞는 인코딩 방법이 따로 있는 것 같음. 이 부분은 추가로 연습하고 공부하기.

### 추가 전처리

In [119]:
# Review the current columns before the extra preprocessing.
# (This only prints a summary of `titanic`; nothing is reloaded here.)
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Sex             891 non-null    object 
 5   SibSp           891 non-null    int64  
 6   Parch           891 non-null    int64  
 7   Ticket          891 non-null    object 
 8   Fare            891 non-null    float64
 9   Age_new         891 non-null    float64
 10  Embarked_new    891 non-null    object 
 11  Sex_label       891 non-null    int32  
 12  Embarked_label  891 non-null    int32  
 13  FamilySize      891 non-null    int64  
dtypes: float64(2), int32(2), int64(6), object(4)
memory usage: 90.6+ KB


In [121]:
# One-hot encode Sex and Embarked_new.
sex_onehot = pd.get_dummies(titanic.Sex)
embarked_onehot = pd.get_dummies(titanic.Embarked_new)

# Append the dummy columns to the frame.
# Any dummy columns left over from a previous execution of this cell are dropped
# first; otherwise a re-run would append duplicate 'female'/'male'/'C'/'Q'/'S'
# columns and later column selections would return duplicated features.
onehot_cols = [*sex_onehot.columns, *embarked_onehot.columns]
titanic = pd.concat(
    [titanic.drop(columns=onehot_cols, errors='ignore'), sex_onehot, embarked_onehot],
    axis=1,
)
print(titanic.shape)
titanic.head(2)

(891, 19)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Age_new,Embarked_new,Sex_label,Embarked_label,FamilySize,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,22.0,S,1,2,1,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,38.0,C,0,0,1,True,False,True,False,False


In [122]:
# Prepare the modelling data set
# Independent variables (features): the one-hot columns are used here instead of
# the label-encoded Sex_label / Embarked_label columns.
# Earlier variant with more columns, kept for reference
# (note: it indexes a frame named `df`, not `titanic`):
# X = df[['PassengerId', 'Pclass', 'Fare', 'Age_new', 'FamilySize', 'Sex_label', 'Embarked_label', 'female', 'male', 'C', 'Q', 'S']]
X = titanic[['Pclass', 'Fare', 'Age_new', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
# Dependent variable (target)
y = titanic.Survived

# training / test data: 20% held out, fixed seed
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(712, 9) (712,) (179, 9) (179,)


In [123]:
# Second model: support vector machine
from sklearn import svm

# Linear kernel.
# NOTE(review): the features are passed in unscaled (e.g. Fare and Pclass have
# very different ranges); SVMs are generally sensitive to feature scale, so
# consider standardising the inputs — to be confirmed.
model = svm.SVC(kernel='linear', random_state=42)

# Fit on the training split
model.fit(X_train, y_train)

# Predict on the test split
y_pred = model.predict(X_test)

In [124]:
# Model evaluation - measure accuracy
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
# Printed as a percentage with two decimals (the Korean label means "accuracy").
print('정확도 : {:.2f}%'.format(acc*100))

정확도 : 78.21%


* Linear 커널을 사용하지 않은 경우
    * FamilySize 포함 시 : 59.78%
    * PassengerId 제외 시 : 65.92% / Sex와 Embarked를 라벨 인코딩한 경우 : 65.92%
* Linear 커널을 사용한 경우
    * 78.21%

In [125]:
# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# Computed for the current y_pred against the test labels.
confusion_matrix(y_test, y_pred)

array([[88, 17],
       [22, 52]], dtype=int64)

In [126]:
# Text report (precision / recall / f1-score / support) for the current y_pred.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       105
           1       0.75      0.70      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



## 모델링 - 랜덤포레스트

In [130]:
# Prepare the modelling data set
# Independent variables (features)
# Candidate features: 'PassengerId', 'Pclass', 'Fare', 'Age_new', 'FamilySize', 'Sex_label', 'Embarked_label', 'female', 'male', 'C', 'Q', 'S'
X = titanic[['Pclass', 'Fare', 'Age_new', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
# Dependent variable (target)
y = titanic.Survived

# training / test data: 20% held out, fixed seed
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(712, 9) (712,) (179, 9) (179,)


In [131]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees, depth capped at 3, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=50, max_depth=3, random_state=42
).fit(X_train, y_train)

# Predict on the test split
y_pred = model.predict(X_test)

In [132]:
# Model evaluation - measure accuracy
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
# Printed as a percentage with two decimals (the Korean label means "accuracy").
print('정확도 : {:.2f}%'.format(acc*100))

정확도 : 79.89%


* label 인코딩 사용 : 77.65%
* one-hot 인코딩 사용 : 79.89%

# Iris

In [133]:
# URL of the Iris CSV from the seaborn-data repository (read over HTTP by pandas).
# Note: this rebinds `file_path` and `raw`, which previously held the Titanic data.
file_path = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'

# `raw` keeps the frame exactly as loaded; `iris` is the working copy.
raw = pd.read_csv(file_path)
iris = raw.copy()
print(iris.shape)
iris.head(2)

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [134]:
# Column dtypes and non-null counts.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [135]:
# Summary statistics
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## EDA

In [136]:
# Normalise the independent variables with min-max scaling.
from sklearn.preprocessing import MinMaxScaler

feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Fit once on all four features. MinMaxScaler scales each feature independently,
# so the values match column-by-column fitting, but the fitted scaler now holds
# the parameters of every feature (not just the last column) and can be reused
# for transform / inverse_transform.
# NOTE(review): the scaler is fitted on the full data set, before the train/test
# split — confirm that this is acceptable for the evaluation below.
scaler = MinMaxScaler()
iris[feature_cols] = scaler.fit_transform(iris[feature_cols])

In [137]:
# Preview the scaled feature values.
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,setosa
1,0.166667,0.416667,0.067797,0.041667,setosa


In [138]:
# Encode the dependent variable with label encoding
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# The species column is overwritten with the encoder's integer output.
iris['species'] = encoder.fit_transform(iris['species'])

In [139]:
# Preview the frame after encoding the species column.
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0


In [140]:
# Prepare the modelling data set
# Independent variables: every column except the last one
X = iris.iloc[:, :-1]
# Dependent variable (target)
y = iris.species

# training / test data: 20% held out, fixed seed
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(120, 4) (120,) (30, 4) (30,)


## 모델링 - KNN (K-최근접 이웃)

In [141]:
# Model: KNN (k-nearest neighbours)
from sklearn.neighbors import KNeighborsClassifier

# Uses the 3 nearest neighbours.
model = KNeighborsClassifier(n_neighbors=3)

# Fit on the training split
model.fit(X_train, y_train)

# Predict on the test split
y_pred = model.predict(X_test)

In [142]:
# Model evaluation - measure accuracy
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
# Printed as a percentage with two decimals (the Korean label means "accuracy").
print('정확도 : {:.2f}%'.format(acc*100))

정확도 : 100.00%


### 추가 분석

In [143]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

# Computed for the current y_pred against the test labels.
confusion_matrix(y_test, y_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]], dtype=int64)

In [144]:
from sklearn.metrics import classification_report

# Text report (precision / recall / f1-score / support) for the current y_pred.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## 모델링 - 로지스틱 회귀

In [145]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the constructor's default settings.
model = LogisticRegression()

# Fit on the training split
model.fit(X_train, y_train)

# Predict on the test split
y_pred = model.predict(X_test)

In [146]:
# Model evaluation - measure accuracy
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
# Printed as a percentage with two decimals (the Korean label means "accuracy").
print('정확도 : {:.2f}%'.format(acc*100))

정확도 : 96.67%
