In [100]:
# 1️⃣ 데이터 분리 (X, y)
# 2️⃣ 결측치 최소 처리 (drop or fillna)
# 3️⃣ 인코딩 (문자열 → 숫자)
# 4️⃣ 기본 모델 학습 (Logistic Regression or Neural Net)
# 5️⃣ 성능 확인 (train/test split)

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

In [101]:
pd.set_option('display.max_columns', None)  # 전체 컬럼 출력
pd.set_option('display.max_rows', None) 

In [102]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [103]:
X = train.drop(columns=['Survived','Name','Cabin','Ticket'], axis=1)
y = train['Survived']

In [104]:
X = X.fillna(0)
test = test.fillna(0)

In [105]:
X['Fare'] = np.log1p(X['Fare'])
test['Fare'] = np.log1p(test['Fare'])

In [106]:
int_cols = ['Age', 'Fare', 'SibSp', 'Parch']
scaler = StandardScaler()
scaler.fit(X[int_cols]) # mean, sd 학습

X[int_cols] = scaler.transform(X[int_cols])
test[int_cols] = scaler.transform(test[int_cols])

In [107]:
X = pd.get_dummies(X, columns=['Sex','Embarked'])
test = pd.get_dummies(test, columns=['Sex','Embarked'])

In [108]:
test, _ = test.align(X, join='right', axis=1, fill_value=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
print(type(X_train))
print(type(y_train))
print(X_train.shape)
print(y_train.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(712, 12)
(712,)


In [110]:
X_train.dtypes

PassengerId      int64
Pclass           int64
Age            float64
SibSp          float64
Parch          float64
Fare           float64
Sex_female        bool
Sex_male          bool
Embarked_0        bool
Embarked_C        bool
Embarked_Q        bool
Embarked_S        bool
dtype: object

In [113]:
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu', name='layer1'),
    layers.Dense(32, activation='relu', name='layer2'),
    layers.Dense(1, activation='sigmoid', name='layer3')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4930 - loss: 8.5191 - val_accuracy: 0.5866 - val_loss: 0.6519
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5702 - loss: 1.0821 - val_accuracy: 0.5810 - val_loss: 1.0000
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6025 - loss: 0.7345 - val_accuracy: 0.6816 - val_loss: 0.6183
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6643 - loss: 0.6414 - val_accuracy: 0.7151 - val_loss: 0.5955
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6475 - loss: 0.6612 - val_accuracy: 0.6201 - val_loss: 0.6465
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6896 - loss: 0.5991 - val_accuracy: 0.6927 - val_loss: 0.5636
Epoch 7/20
[1m23/23[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x31f967710>

In [114]:
val_loss, val_acc = model.evaluate(X_val, y_val)
print("Validation Accuracy:", val_acc)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7933 - loss: 0.4616 
Validation Accuracy: 0.7932960987091064


In [115]:
pred = model.predict(test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [117]:
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': (pred > 0.5).astype(int).flatten()
})

submission.to_csv('submission/submission_v2_feature_engineering.csv', index=False)