In [3]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [4]:
# File paths
path = 'Z:/VS Project/HDAT/PCOR/homework/titanic/data/'
data = pd.read_csv(path + 'train.csv')
data1 = pd.read_csv(path + 'test.csv')
data2 = pd.read_csv(path + 'submission.csv')

In [5]:
# Data preprocessing
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

In [6]:
# Preparing training data
X_train = data[features]
y_train = data[target]

In [7]:
# Preprocessing pipelines for numerical and categorical data
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [8]:
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
y_train = y_train.values

In [11]:
# Preparing test data
X_test = data1[features]
X_test_preprocessed = preprocessor.transform(X_test)

In [12]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [13]:
# 모델 생성
model = Sequential([
    # 첫 번째 은닉층: 뉴런 수를 증가시키고, L2 정규화 적용
    Dense(64, activation='relu', input_shape=(X_train_preprocessed.shape[1],), kernel_regularizer=l2(0.001)),
    Dropout(0.3),  # 과적합 방지를 위한 드롭아웃 적용
    # 두 번째 은닉층
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    # 세 번째 은닉층
    Dense(16, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    # 출력층
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# 옵티마이저: Adam, 학습률을 점진적으로 감소시키는 스케줄러 적용
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [25]:
# Train the model
model.fit(X_train_preprocessed, y_train, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8182 - loss: 0.4435 - val_accuracy: 0.8771 - val_loss: 0.3526
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8268 - loss: 0.4185 - val_accuracy: 0.8715 - val_loss: 0.3540
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8491 - loss: 0.3908 - val_accuracy: 0.8883 - val_loss: 0.3572
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8362 - loss: 0.3978 - val_accuracy: 0.8771 - val_loss: 0.3600
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8199 - loss: 0.4328 - val_accuracy: 0.8715 - val_loss: 0.3573
Epoch 6/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8222 - loss: 0.4369 - val_accuracy: 0.8771 - val_loss: 0.3615
Epoch 7/50
[1m23/23[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x2c43b559010>

In [26]:
# Generate predictions for test set
predictions = model.predict(X_test_preprocessed)
predictions = (predictions > 0.5).astype(int).reshape(-1)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [27]:
# Load the submission file to compare predicted results
true_labels = data2['Survived'].values

In [28]:
# Evaluate the model's accuracy based on the provided submission file
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy of the model on the test set is: {accuracy}")

Accuracy of the model on the test set is: 0.9521531100478469


In [19]:
# predictions 배열과 PassengerId를 결합하여 제출 파일 생성
submission_df = pd.DataFrame({
    'PassengerId': data1['PassengerId'],
    'Survived': predictions
})

In [20]:
# 제출 파일 저장 경로
submission_path = path + 'submission.csv'

# 제출 파일을 CSV 형식으로 저장
submission_df.to_csv(submission_path, index=False)