In [1]:
# 1️⃣ 데이터 분리 (X, y)
# 2️⃣ 결측치 최소 처리 (drop or fillna)
# 3️⃣ 인코딩 (문자열 → 숫자)
# 4️⃣ 기본 모델 학습 (Logistic Regression or Neural Net)
# 5️⃣ 성능 확인 (train/test split)

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
pd.set_option('display.max_columns', None)  # 전체 컬럼 출력
pd.set_option('display.max_rows', None) 

In [3]:
train = pd.read_csv('train.csv')

In [4]:
X = train.drop(columns=['Survived','Name','Cabin','Ticket'], axis=1)
y = train['Survived']

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [5]:
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)
X_train = pd.get_dummies(X_train, columns=['Sex','Embarked'])
X_val = pd.get_dummies(X_val, columns=['Sex','Embarked'])

X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)

In [6]:
print(type(X_train))
print(type(y_train))
print(X_train.shape)
print(y_train.shape)


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(712, 12)
(712,)


In [7]:
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu', name='layer1'),
    layers.Dense(32, activation='relu', name='layer2'),
    layers.Dense(1, activation='sigmoid', name='layer3')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5112 - loss: 7.6163 - val_accuracy: 0.5698 - val_loss: 3.8505
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5421 - loss: 2.2645 - val_accuracy: 0.5978 - val_loss: 1.0363
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5758 - loss: 0.8383 - val_accuracy: 0.6425 - val_loss: 0.7271
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6531 - loss: 0.6764 - val_accuracy: 0.6816 - val_loss: 0.6573
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6657 - loss: 0.6256 - val_accuracy: 0.6034 - val_loss: 0.7546
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6615 - loss: 0.6452 - val_accuracy: 0.6592 - val_loss: 0.5779
Epoch 7/20
[1m23/23[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x31ac7a610>

In [8]:
val_loss, val_acc = model.evaluate(X_val, y_val)
print("Validation Accuracy:", val_acc)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7709 - loss: 0.5335 
Validation Accuracy: 0.7709497213363647


In [9]:
test = pd.read_csv('test.csv')

In [10]:
test = test.fillna(0)
test = pd.get_dummies(test)
X_train, test = X_train.align(test, join='left', axis=1, fill_value=0)

In [11]:
pred = model.predict(test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [12]:
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': (pred > 0.5).astype(int).flatten()
})

In [15]:
submission.to_csv('submission/submission_v1.csv', index=False)