# K-NN (타이타닉 데이터)
- 스케일러별 결과 비교

In [1]:
# 필요 라이브러리 import
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression

## 1. 데이터 불러오기

In [2]:
# Show floats with two decimal places in DataFrame output.
pd.options.display.float_format = '{:.2f}'.format

# Titanic training data; the first CSV column is used as the row index.
file_url = "https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/titanic_train.csv"
df_raw = pd.read_csv(file_url, index_col=0)

In [3]:
# Preview the first rows of the raw data.
df_raw.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Column dtypes and non-null counts, to see which columns have missing values.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


## 2. 전처리 - 결측치 대체, 변수 변환 및 제거, 훈련/시험 set 분리

In [5]:
# Custom preprocessing function
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw Titanic passenger table and return a model-ready frame.

    The work is done on a copy, so the DataFrame passed in is not modified.

    Steps:
      * lower-case all column names;
      * fill missing ``embarked`` with "S" and missing ``fare`` with 0;
      * extract the title (the letters directly before a ".") from
        ``name``; titles seen fewer than 10 times, and names with no
        title at all, are pooled into "Rare";
      * fill missing ``age`` with the mean age of the row's title group;
      * add ``cabin_class`` (1 if a cabin value is present, else 0) and
        ``family_class`` (1 if ``sibsp + parch`` > 0, else 0);
      * drop ticket, name, title, cabin, sibsp, parch and family;
      * one-hot encode ``sex`` and ``embarked``, dropping the first level.

    Args:
        df: Raw passenger data containing (in any letter case) the columns
            name, sex, age, sibsp, parch, ticket, fare, cabin and embarked.

    Returns:
        A new DataFrame with the engineered features and dummy columns.
    """
    # Copy first: everything below assigns columns, and the caller's frame
    # must stay usable as raw data.
    df = df.copy()
    df.columns = df.columns.str.lower()

    # Simple constant imputations.
    df['embarked'] = df['embarked'].fillna("S")
    df['fare'] = df['fare'].fillna(0)

    # Title from the name; expand=False yields a Series instead of a
    # one-column DataFrame. Names without a title are treated as "Rare".
    title = df['name'].str.extract(r'([A-Za-z]+)\.', expand=False).fillna('Rare')
    title_counts = title.value_counts()
    rare_titles = title_counts[title_counts < 10].index
    df['title'] = title.where(~title.isin(rare_titles), 'Rare')

    # Impute age with the mean age of passengers sharing the same title.
    df['age'] = df['age'].fillna(df.groupby('title')['age'].transform('mean'))

    # Binary indicators: cabin recorded / travelling with any family member.
    df['cabin_class'] = df['cabin'].notna().astype('int64')
    df['family'] = df['sibsp'] + df['parch']
    df['family_class'] = (df['family'] > 0).astype('int64')

    # Remove the columns that were only needed to derive the features above.
    df_clean = df.drop(
        columns=['ticket', 'name', 'title', 'cabin', 'sibsp', 'parch', 'family']
    )
    return pd.get_dummies(df_clean, columns=['sex', 'embarked'], drop_first=True)

In [6]:
# Run the preprocessing function on the raw data and preview the result.
df_clean1 = pre_processing(df_raw)
df_clean1.head()

Unnamed: 0_level_0,survived,pclass,age,fare,cabin_class,family_class,sex_male,embarked_Q,embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,7.25,0,1,1,0,1
2,1,1,38.0,71.28,1,1,0,0,0
3,1,3,26.0,7.92,0,0,0,0,1
4,1,1,35.0,53.1,1,1,0,0,1
5,0,3,35.0,8.05,0,0,1,0,1


In [7]:
# Check dtypes and non-null counts of the preprocessed frame.
df_clean1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   survived      891 non-null    int64  
 1   pclass        891 non-null    int64  
 2   age           891 non-null    float64
 3   fare          891 non-null    float64
 4   cabin_class   891 non-null    int64  
 5   family_class  891 non-null    int64  
 6   sex_male      891 non-null    uint8  
 7   embarked_Q    891 non-null    uint8  
 8   embarked_S    891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 51.3 KB


In [8]:
# Summary statistics per column; shows how different the feature ranges are.
df_clean1.describe()

Unnamed: 0,survived,pclass,age,fare,cabin_class,family_class,sex_male,embarked_Q,embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.38,2.31,29.76,32.2,0.23,0.4,0.65,0.09,0.73
std,0.49,0.84,13.28,49.69,0.42,0.49,0.48,0.28,0.45
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,21.77,7.91,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,30.0,14.45,0.0,0.0,1.0,0.0,1.0
75%,1.0,3.0,35.9,31.0,0.0,1.0,1.0,0.0,1.0
max,1.0,3.0,80.0,512.33,1.0,1.0,1.0,1.0,1.0


In [9]:
# Split into training and test sets: every column except the target
# 'survived' is a feature; 20% of the rows are held out, with a fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    df_clean1.drop(columns=['survived']),
    df_clean1['survived'],
    random_state=100,
    test_size=0.2,
)

## 3. 표준화 스케일링

In [10]:
# Fit the standardiser on the training features only, then apply that same
# fitted transformation to both splits.
st_scaler1 = StandardScaler().fit(X1_train)
X1_train_st_scaled = st_scaler1.transform(X1_train)
X1_test_st_scaled = st_scaler1.transform(X1_test)
X1_test_st_scaled

array([[ 0.84505322, -2.11458232, -0.43457715, ..., -1.38062393,
        -0.29780029,  0.60981061],
       [ 0.84505322, -0.83725886, -0.48379579, ..., -1.38062393,
         3.35795504, -1.6398534 ],
       [ 0.84505322, -0.23616546, -0.46887187, ...,  0.7243102 ,
        -0.29780029,  0.60981061],
       ...,
       [ 0.84505322, -0.38643881, -0.29477854, ...,  0.7243102 ,
        -0.29780029,  0.60981061],
       [ 0.84505322,  0.16717501, -0.18713123, ...,  0.7243102 ,
        -0.29780029,  0.60981061],
       [ 0.84505322, -1.5886256 , -0.34344275, ..., -1.38062393,
        -0.29780029, -1.6398534 ]])

In [11]:
# K-NN with the classifier's default settings, trained on the standardised
# training features and used to predict the standardised test features.
knn1_st = KNeighborsClassifier().fit(X1_train_st_scaled, y1_train)
st_pred1 = knn1_st.predict(X1_test_st_scaled)
st_pred1

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1], dtype=int64)

In [12]:
# Test-set accuracy of the K-NN model trained on standardised features.
accuracy_score(y_true=y1_test, y_pred=st_pred1)

0.7988826815642458

## 7. 결과 비교

In [None]:
# Min-max scale a frame and summarise the scaled columns.
# NOTE(review): df_train2 is not defined in any visible cell and this cell
# has no execution count — confirm whether it is leftover from another
# notebook or should operate on this notebook's training split.
mm_scaler = MinMaxScaler()
mm_scaled = mm_scaler.fit_transform(df_train2)
df_mm_scaled = pd.DataFrame(mm_scaled, columns = df_train2.columns)
df_mm_scaled.describe()

In [13]:
# Compare the K-NN accuracy under each scaler with logistic regression.
# NOTE(review): mm_pred1, rb_pred1 and lr_pred1 are not defined in the
# visible cells (the recorded run of this cell ends in a NameError); the
# min-max, robust and logistic-regression cells must be run first.
pd.options.display.float_format = '{:.4f}'.format
result1_pred = [st_pred1, mm_pred1, rb_pred1, lr_pred1]
result1 = pd.Series(
    [accuracy_score(y1_test, pred) for pred in result1_pred],
    index=["standard", "min-max", "robust", "logistic"],
    name="accuracy_score",
)
result1

NameError: name 'mm_pred1' is not defined

In [None]:
# Compare the scalers for a given k
def result1_k(k):
    """Score a k-neighbour K-NN classifier on each scaled feature set.

    For the standard, min-max and robust scaled train/test arrays defined
    at notebook level, a ``KNeighborsClassifier(k)`` is fitted on the
    training part and its test accuracy is computed against ``y1_test``.

    Args:
        k: Number of neighbours passed to ``KNeighborsClassifier``.

    Returns:
        A DataFrame with one row per scaler ('standard', 'min-max',
        'robust') and a single column labelled ``k`` holding the accuracy.
    """
    scaled_sets = {
        'standard': (X1_train_st_scaled, X1_test_st_scaled),
        'min-max': (X1_train_mm_scaled, X1_test_mm_scaled),
        'robust': (X1_train_rb_scaled, X1_test_rb_scaled),
    }

    scores = []
    for train_x, test_x in scaled_sets.values():
        model = KNeighborsClassifier(k)
        model.fit(train_x, y1_train)
        scores.append(accuracy_score(y1_test, model.predict(test_x)))

    return pd.DataFrame(scores, index=list(scaled_sets), columns=[k])

In [None]:
# Accuracy of each scaler with k = 10 neighbours.
result1_k(10)

In [None]:
# Compare the scalers while varying k
def result1_k_range(a, b):
    """Tabulate K-NN test accuracy per scaler for every k in ``a..b``.

    Each k is scored by ``result1_k`` (one column per k, one row per
    scaler); the per-k tables are transposed and stacked so the result has
    one row per k and the columns 'standard', 'min-max' and 'robust'.

    Args:
        a: First value of k (inclusive).
        b: Last value of k (inclusive).

    Returns:
        A DataFrame indexed by k with one accuracy column per scaler, or
        an empty DataFrame when ``a > b``.
    """
    # Collect the rows first and concatenate once, rather than growing a
    # DataFrame inside the loop (which re-copies it on every iteration).
    rows = [result1_k(k).T for k in range(a, b + 1)]
    if not rows:
        # pd.concat rejects an empty list; keep the empty-range result.
        return pd.DataFrame()
    return pd.concat(rows)

In [None]:
# Accuracy of each scaler for k = 1 through 20.
result1_k_range(1, 20)