In [88]:
import numpy as np
import pandas as pd

# Load the Titanic training set and preview its first rows.
TRAIN_CSV_PATH = './data/titanic_train.csv'
titanic = pd.read_csv(TRAIN_CSV_PATH)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [89]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- 정확도(accuracy)의 문제점: 이진 분류에서는 레이블 분포 등 데이터의 구성에 따라, 정확도만으로는 모델의 성능이 실제보다 왜곡되어 보일 수 있다.

In [90]:
from sklearn.base import BaseEstimator

class UserClassifier(BaseEstimator):
    """Rule-based classifier that predicts from the ``Sex`` column only.

    Nothing is learned from the training data; ``predict`` applies one fixed
    rule.  NOTE(review): presumably meant as a naive baseline for the accuracy
    discussion in this notebook — confirm.
    """

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``.

        Returning the estimator follows the scikit-learn ``fit`` convention;
        callers that ignore the return value are unaffected.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``Sex`` equals 1, and 1 for every other row.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing a ``Sex`` column.  Assumes the column is
            already label-encoded as integers — TODO confirm with the caller.

        Returns
        -------
        numpy.ndarray
            One-dimensional integer array with one prediction per row of ``X``.
        """
        # Vectorised form of "if Sex == 1 then 0 else 1".
        return np.where(np.asarray(X['Sex']) == 1, 0, 1)
    
        

In [91]:
# Separate the target column from the remaining feature columns.
TARGET_COLUMN = 'Survived'
titanic_label = titanic[TARGET_COLUMN]
titanic_data = titanic.drop(columns=[TARGET_COLUMN])

In [62]:
# Remove features that are not needed (PassengerId, Name, Ticket).
unused_columns = ['PassengerId', 'Name', 'Ticket']
titanic_data = titanic_data.drop(columns=unused_columns)
titanic_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [63]:
# Preprocessing: fill missing values column by column.
# - Age      : column mean (computed before anything is filled)
# - Cabin    : placeholder 'N'
# - Embarked : placeholder 'N'
fill_values = {
    'Age': titanic_data['Age'].mean(),
    'Cabin': 'N',
    'Embarked': 'N',
}
for column_name, replacement in fill_values.items():
    titanic_data[column_name] = titanic_data[column_name].fillna(replacement)

# Only the final expression of a cell is displayed, so show the last column filled.
titanic_data['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [64]:
# Re-check non-null counts after filling; every column should now report all rows as non-null.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     891 non-null    object 
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [81]:
# Label encoding (Cabin, Sex, Embarked)
# Note: wrapping this logic in a for loop over the columns gives the same
# result as the instructor's version.
from sklearn.preprocessing import LabelEncoder

# Keep only the first character of each cabin value.  Read it from
# titanic_data, not from the raw `titanic` frame: the raw column still holds
# NaN, which would overwrite the 'N' placeholders filled in earlier.
titanic_data['Cabin'] = titanic_data['Cabin'].str[:1]

# Category values to encode for each column ('N' is the missing-value placeholder).
cabin = ['C', 'B', 'D', 'E', 'A', 'F', 'G', 'T', 'N']
sex = ['male', 'female']
embarked = ['S', 'C', 'Q', 'N']

# Fit a fresh encoder per category list and print the integer codes it assigns.
encoded_labels = []
for categories in (cabin, sex, embarked):
    encoder = LabelEncoder()
    encoder.fit(categories)
    encoded_labels.append(encoder.transform(categories))
    print(encoded_labels[-1])

cabin_labels, sex_labels, embarked_labels = encoded_labels


[2 1 3 4 0 5 6 8 7]
[1 0]
[3 0 2 1]


In [92]:
from sklearn.preprocessing import LabelEncoder

# Feature-transformation helpers: drop unused columns, fill missing values, and label-encode Cabin, Sex, Embarked
def drop_features(frm):
    """Drop the PassengerId, Name and Ticket columns from ``frm``.

    The frame is modified in place and the same object is returned.  Columns
    that are already absent are skipped instead of raising ``KeyError``, so
    the function can be applied to a frame that was cleaned earlier or called
    again on its own output.

    Parameters
    ----------
    frm : pandas.DataFrame
        Frame to clean; mutated by this call.

    Returns
    -------
    pandas.DataFrame
        The same ``frm`` object, without the three columns.
    """
    frm.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True, errors='ignore')
    return frm


def pre_processing(frm):
    """Fill missing values in the Age, Cabin and Embarked columns of ``frm``.

    Age is filled with the column mean; Cabin and Embarked are filled with the
    placeholder ``'N'``.  Each column is assigned back to the frame rather
    than using ``frm[col].fillna(..., inplace=True)``: that chained in-place
    form operates on a temporary Series, emits a warning on recent pandas
    versions and leaves the frame unchanged under Copy-on-Write.

    Parameters
    ----------
    frm : pandas.DataFrame
        Frame containing ``Age``, ``Cabin`` and ``Embarked`` columns; its
        columns are replaced by this call.

    Returns
    -------
    pandas.DataFrame
        The same ``frm`` object with the three columns filled.
    """
    frm['Age'] = frm['Age'].fillna(frm['Age'].mean())
    frm['Cabin'] = frm['Cabin'].fillna('N')
    frm['Embarked'] = frm['Embarked'].fillna('N')
    return frm


def label_encoder(frm):
    """Label-encode the Cabin, Sex and Embarked columns of ``frm``.

    Cabin is first reduced to its leading character.  Each of the three
    columns is then replaced by the integer codes of a ``LabelEncoder``
    fitted on that same column.  The frame is updated and returned.
    """
    frm['Cabin'] = frm['Cabin'].str[:1]
    for column_name in ('Cabin', 'Sex', 'Embarked'):
        # A separate encoder per column, fitted on the column it transforms.
        column_encoder = LabelEncoder().fit(frm[column_name])
        frm[column_name] = column_encoder.transform(frm[column_name])
    return frm

def transform_features(frm):
    """Run the full feature pipeline on ``frm`` and return the result.

    The steps are applied in this order, each receiving the previous step's
    return value: ``drop_features``, ``pre_processing``, ``label_encoder``.
    """
    pipeline_steps = (drop_features, pre_processing, label_encoder)
    for apply_step in pipeline_steps:
        frm = apply_step(frm)
    return frm

In [93]:
# Run the whole pipeline: drop unused columns, fill missing values, label-encode.
# NOTE(review): drop_features uses inplace=True and every helper returns the frame it was
# given, so titanic_data_feature appears to be the same object as titanic_data — confirm
# before relying on titanic_data still holding the untransformed values.
titanic_data_feature = transform_features(titanic_data)

In [94]:
# Verify the transformed frame: no nulls should remain, and Sex, Cabin and Embarked should be integer-coded.
titanic_data_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int32  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     891 non-null    int32  
 7   Embarked  891 non-null    int32  
dtypes: float64(2), int32(3), int64(3)
memory usage: 45.4 KB
