- 타이타닉 데이터 : https://dacon.io/competitions/open/235539/overview/description

In [29]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.layers import Dense
from tensorflow import keras

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

- Column
    - PassengerID : 탑승객 고유 아이디
	- Survival : 탑승객 생존 유무 (0: 사망, 1: 생존)
	- Pclass : 등실의 등급
	- Name : 이름
	- Sex : 성별
	- Age : 나이
	- Sibsp : 함께 탐승한 형제자매, 아내, 남편의 수
	- Parch : 함께 탐승한 부모, 자식의 수
	- Ticket :티켓 번호
	- Fare : 티켓의 요금
	- Cabin : 객실번호
	- Embarked : 배에 탑승한 항구 이름 ( C = Cherbourn, Q = Queenstown, S = Southampton)

#### 데이터 전처리

In [30]:
df_train = pd.read_csv('./Data/train.csv')
df_test = pd.read_csv('./Data/test.csv')

In [31]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
# 결측치 확인
# Age, Cabin, Embarked 칼럼의 결측치 존재
df_train.info()

# Pclass 별 평균값으로 Age 칼럼의 결측치 채우기
df_train.pivot_table(values = 'Age', index = 'Pclass', aggfunc = 'mean')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,38.233441
2,29.87763
3,25.14062


In [33]:
train_Age_null = df_train[df_train.Age.isnull()]
train_firstclass = train_Age_null[train_Age_null.Pclass == 1]
train_second = train_Age_null[train_Age_null.Pclass == 2]
train_third = train_Age_null[train_Age_null.Pclass == 3]

train_firstclass = train_firstclass.fillna(value = '38')
train_secondclass = train_second.fillna(value = '30')
train_thirdclass = train_third.fillna(value = '25')

train_drop_na = df_train.dropna(subset = ['Age'])

train_concat = pd.concat([train_drop_na, train_firstclass, train_secondclass, train_thirdclass])
df_train = train_concat
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 888
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    object 
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        362 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 90.5+ KB


In [34]:
# 이상치 확인
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,0.0,31.0
max,891.0,1.0,3.0,8.0,6.0,512.3292


In [35]:
df_train['Age'] = df_train['Age'].astype('int')
# Age 연령대별 분류
for i in range(len(df_train)):
    if (df_train.loc[i,'Age'] > 0) & (df_train.loc[i,'Age'] <= 10) :
        df_train.loc[i,'Agee'] = '유아'
    elif (df_train.loc[i,'Age'] > 10) & (df_train.loc[i,'Age'] <= 20) :
        df_train.loc[i,'Agee'] = '10대'
    elif (df_train.loc[i, 'Age'] > 20) & (df_train.loc[i,'Age'] <= 30) :
        df_train.loc[i, 'Agee'] = '20대'
    elif (df_train.loc[i,'Age'] > 30) & (df_train.loc[i,'Age'] <= 40) :
        df_train.loc[i, 'Agee'] = '30대'
    elif (df_train.loc[i,'Age'] > 40) & (df_train.loc[i, 'Age'] <= 50) :
        df_train.loc[i, 'Agee'] = '40대'
    elif (df_train.loc[i, 'Age'] > 50) & (df_train.loc[i, 'Age'] <= 60) :
        df_train.loc[i, 'Agee'] = '50대'
    elif (df_train.loc[i, 'Age'] > 60) & (df_train.loc[i, 'Age'] <= 70) :
        df_train.loc[i, 'Agee'] = '60대'
    elif (df_train.loc[i, 'Age'] > 70) & (df_train.loc[i, 'Age'] <= 80) :
        df_train.loc[i,'Agee'] = '70대'
    elif (df_train.loc[i,'Age'] > 80) & (df_train.loc[i, 'Age'] <= 90) :
        df_train.loc[i, 'Agee'] = '80대'
    elif (df_train.loc[i, 'Age'] > 90) & (df_train.loc[i,'Age'] <= 100) :
        df_train.loc[i, 'Agee'] = '90대'

In [36]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agee
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,20대
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,30대
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,20대
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,30대
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,30대


In [37]:
df_train = df_train.drop(columns='Cabin')

In [38]:
df_train[df_train['Agee'].isnull()=='True']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Agee


In [39]:
# 결측치 제거
df_train.dropna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Agee
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,S,20대
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C,30대
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,S,20대
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,S,30대
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,S,30대
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,25,0,0,2629,7.2292,C,20대
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,25,8,2,CA. 2343,69.5500,S,20대
868,869,0,3,"van Melkebeke, Mr. Philemon",male,25,0,0,345777,9.5000,S,20대
878,879,0,3,"Laleff, Mr. Kristo",male,25,0,0,349217,7.8958,S,20대


In [40]:
# EDA
# 

#### RandomForest

In [21]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(df_train, df_test, random_state=42)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth = 3, n_jobs=-1, random_state = 42)
rf.fit(train_input, train_target)

ValueError: Found input variables with inconsistent numbers of samples: [891, 418]