In [61]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Directory prefix (note the trailing slash) that later cells prepend to CSV file names.
data_path = '/content/drive/MyDrive/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
import pandas as pd
import numpy as np

In [63]:
# Read the train and test splits from the data directory.
# `data_path` already ends with a slash, so plain concatenation yields the full path.
train = pd.read_csv(data_path + 'titanic_train.csv')
test = pd.read_csv(data_path + 'titanic_test.csv')

In [64]:
# Preview the first five rows of the training frame.
train.head(n = 5)

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.05,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S


In [65]:
# Categorical variable analysis: show only the object-dtype columns.
train.select_dtypes(include = 'object')

Unnamed: 0,name,gender,ticket,cabin,embarked
0,"Artagaveytia, Mr. Ramon",male,PC 17609,,C
1,"Morley, Mr. William",male,364506,,S
2,"Kink-Heilmann, Mr. Anton",male,315153,,S
3,"Hiltunen, Miss. Marta",female,250650,,S
4,"Anderson, Mr. Harry",male,19952,E12,S
...,...,...,...,...,...
911,"Lesurer, Mr. Gustave J",male,PC 17755,B101,C
912,"Ryan, Mr. Patrick",male,371110,,Q
913,"Coleff, Mr. Peju",male,349210,,S
914,"Rekic, Mr. Tido",male,349249,,S


In [66]:
# Labels of the object-dtype columns; later cells use `cols` to slice both
# splits and to tell the encoder which columns to transform.
cols = train.select_dtypes(include = ['object']).columns
cols

Index(['name', 'gender', 'ticket', 'cabin', 'embarked'], dtype='object')

In [67]:
# Work on independent copies of the selected columns so the raw frames
# `train` and `test` are not modified by the feature engineering below.
train_ft, test_ft = (frame[cols].copy() for frame in (train, test))
(train_ft.shape, test_ft.shape)

((916, 5), (393, 5))

In [68]:
# Missing-value count per column in the training features.
train_ft.isna().sum()

Unnamed: 0,0
name,0
gender,0
ticket,0
cabin,706
embarked,0


In [69]:
# Missing-value count per column in the test features.
test_ft.isna().sum()

Unnamed: 0,0
name,0
gender,0
ticket,0
cabin,308
embarked,2


In [70]:
# Total number of missing cells in each split, as a (train, test) pair.
tuple(frame.isna().sum().sum() for frame in (train_ft, test_ft))

(706, 310)

In [71]:
# Replace missing cabin values with the placeholder category 'etc'.
cabin_missing = train_ft['cabin'].isna()
train_ft.loc[cabin_missing, 'cabin'] = 'etc'
train_ft['cabin']

Unnamed: 0,cabin
0,etc
1,etc
2,etc
3,etc
4,E12
...,...
911,B101
912,etc
913,etc
914,etc


In [72]:
# The most frequent embarkation port is taken from the TRAINING split, so the
# imputation value used for the test split is learned from training data only.
embarked_mode = train_ft['embarked'].mode().iloc[0]
# Fill both columns in one pass: placeholder for cabin, training mode for embarked.
test_ft = test_ft.fillna({'cabin': 'etc', 'embarked': embarked_mode})
test_ft['cabin']

Unnamed: 0,cabin
0,C87
1,etc
2,etc
3,etc
4,etc
...,...
388,etc
389,etc
390,etc
391,etc


In [73]:
# Re-check after imputation: total missing cells per split, as a (train, test) pair.
tuple(frame.isna().sum().sum() for frame in (train_ft, test_ft))

(0, 0)

In [74]:
from sklearn.preprocessing import OneHotEncoder
# Fit the one-hot encoder on the training features only. Categories that
# appear only in other data are ignored (encoded as all zeros) at transform
# time because of handle_unknown = 'ignore'.
# NOTE(review): `cols` includes `name` and `ticket`, which look near-unique per
# passenger, so most generated columns likely cannot match unseen rows —
# consider dropping or engineering these columns before encoding.
enc = OneHotEncoder(handle_unknown = 'ignore').fit(train_ft[cols])
enc

In [75]:
# One-hot encode the training features with the fitted encoder.
# The encoded frame reuses train_ft's index explicitly: pd.concat(axis = 1)
# aligns rows by index label, so without this a non-default index on train_ft
# (e.g. after filtering or shuffling rows) would silently produce misaligned
# rows filled with NaN.
# NOTE: this cell is not re-runnable as-is — after it executes, the `cols`
# columns are dropped from train_ft, so a second run raises KeyError.
tmp = pd.DataFrame(
    enc.transform(train_ft[cols]).toarray(),
    columns = enc.get_feature_names_out(),
    index = train_ft.index,
)
# Append the encoded columns, then remove the original string columns.
train_ft = pd.concat([train_ft, tmp], axis = 1).drop(columns = cols)
train_ft

Unnamed: 0,"name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward","name_Abbott, Mrs. Stanton (Rosa Hunt)","name_Abelseth, Miss. Karen Marie","name_Abelseth, Mr. Olaus Jorgensen","name_Abelson, Mr. Samuel","name_Abrahamsson, Mr. Abraham August Johannes","name_Adahl, Mr. Mauritz Nils Martin","name_Aks, Mrs. Sam (Leah Rosen)","name_Albimona, Mr. Nassef Cassem",...,cabin_F G73,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_T,cabin_etc,embarked_C,embarked_Q,embarked_S
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [76]:
# One-hot encode the test features with the encoder fitted on the training
# split, so both splits end up with the same columns in the same order.
# The encoded frame reuses test_ft's index explicitly: pd.concat(axis = 1)
# aligns rows by index label, so without this a non-default index on test_ft
# would silently produce misaligned rows filled with NaN.
# NOTE: this cell is not re-runnable as-is — after it executes, the `cols`
# columns are dropped from test_ft, so a second run raises KeyError.
tmp = pd.DataFrame(
    enc.transform(test_ft[cols]).toarray(),
    columns = enc.get_feature_names_out(),
    index = test_ft.index,
)
# Append the encoded columns, then remove the original string columns.
test_ft = pd.concat([test_ft, tmp], axis = 1).drop(columns = cols)
test_ft

Unnamed: 0,"name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward","name_Abbott, Mrs. Stanton (Rosa Hunt)","name_Abelseth, Miss. Karen Marie","name_Abelseth, Mr. Olaus Jorgensen","name_Abelson, Mr. Samuel","name_Abrahamsson, Mr. Abraham August Johannes","name_Adahl, Mr. Mauritz Nils Martin","name_Aks, Mrs. Sam (Leah Rosen)","name_Albimona, Mr. Nassef Cassem",...,cabin_F G73,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_T,cabin_etc,embarked_C,embarked_Q,embarked_S
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [77]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler with its default target range spelled out.
scaler = MinMaxScaler(feature_range = (0, 1))

In [78]:
# Learn the per-column min/max from the training features and rescale them,
# keeping the original column labels and row index.
# NOTE(review): the inputs are one-hot indicators, so this rescaling
# presumably leaves the values unchanged — confirm whether it is needed.
train_ft = pd.DataFrame(
    scaler.fit_transform(train_ft),
    columns = train_ft.columns,
    index = train_ft.index,
)
train_ft.head()

Unnamed: 0,"name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward","name_Abbott, Mrs. Stanton (Rosa Hunt)","name_Abelseth, Miss. Karen Marie","name_Abelseth, Mr. Olaus Jorgensen","name_Abelson, Mr. Samuel","name_Abrahamsson, Mr. Abraham August Johannes","name_Adahl, Mr. Mauritz Nils Martin","name_Aks, Mrs. Sam (Leah Rosen)","name_Albimona, Mr. Nassef Cassem",...,cabin_F G73,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_T,cabin_etc,embarked_C,embarked_Q,embarked_S
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [79]:
# Apply the scaling learned on the training split to the test features
# (transform only — the scaler is not refitted here).
test_ft = pd.DataFrame(
    scaler.transform(test_ft),
    columns = test_ft.columns,
    index = test_ft.index,
)
test_ft.head()

Unnamed: 0,"name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward","name_Abbott, Mrs. Stanton (Rosa Hunt)","name_Abelseth, Miss. Karen Marie","name_Abelseth, Mr. Olaus Jorgensen","name_Abelson, Mr. Samuel","name_Abrahamsson, Mr. Abraham August Johannes","name_Adahl, Mr. Mauritz Nils Martin","name_Aks, Mrs. Sam (Leah Rosen)","name_Albimona, Mr. Nassef Cassem",...,cabin_F G73,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_T,cabin_etc,embarked_C,embarked_Q,embarked_S
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [80]:
# Label vector: the `survived` column of the raw training frame.
target = train.loc[:, 'survived']
target

Unnamed: 0,survived
0,0
1,0
2,0
3,1
4,1
...,...
911,1
912,0
913,0
914,0


In [81]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_ft,
    target,
    test_size = 0.2,
    random_state = 42,
)
tuple(part.shape for part in (x_train, x_valid, y_train, y_valid))

((732, 1757), (184, 1757), (732,), (184,))

In [82]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training portion of the split.
model = LogisticRegression(random_state = 42).fit(x_train, y_train)
model

In [83]:
# Predict labels for the held-out validation rows.
pred = model.predict(x_valid)

In [84]:
from sklearn.metrics import accuracy_score
# Fraction of validation rows whose predicted label matches the true label.
accuracy_score(y_true = y_valid, y_pred = pred)

0.8315217391304348

In [85]:
# Refit the same estimator on ALL training rows (train + validation portions)
# before predicting on the test split.
model.fit(X = train_ft, y = target)

In [86]:
# Predict labels for the test features; this overwrites the validation
# predictions previously stored in `pred`.
pred = model.predict(test_ft)
pred

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,

In [87]:
# Save the test predictions as a single-column CSV (no index column) in the
# current working directory.
# NOTE(review): the file contains only `target`, with no passenger identifier —
# confirm this matches the expected submission format.
submission = pd.DataFrame({'target': pred})
submission.to_csv('이동규_로지스틱모델.csv', index = False)