In [1]:
"""
Random Forest

Missing Value Imputation Strategies:
spending columns → 0
age → median (of the training set)
categorical → 'Unknown'

New Features:
None

Deleted Features:
Name, Cabin, PassengerId
"""

"\nRandom Forest\n\nMissing Value Imputation Strategies:\nspending columns → 0\nage → median (of the training set)\ncategorical → 'Unknown'\n\nNew Features:\nNone\n\nDeleted Features:\nName, Cabin, PassengerId\n"

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Read the raw train / test tables from the local data folder.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Feature frame: every training column except the target.
X = train.drop(columns="Transported")
# Target cast to integers for the classifier.
y = train["Transported"].astype(int)

# Preprocess a copy so the original `test` frame stays untouched.
X_test = test.copy()

In [4]:
# Column groups, one imputation rule each.
spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cat_cols = ['HomePlanet','CryoSleep','Destination','Cabin','VIP']

# The age median is taken from the training rows and reused for the test rows.
age_median = X['Age'].median()

for frame in (X, X_test):
    # spending columns → 0
    frame[spend_cols] = frame[spend_cols].fillna(0)
    # Age → training median
    frame['Age'] = frame['Age'].fillna(age_median)
    # categorical → 'Unknown'
    frame[cat_cols] = frame[cat_cols].fillna('Unknown')


In [5]:
# These columns are discarded from both frames instead of being encoded.
drop_cols = ['Name', 'Cabin']
X, X_test = (frame.drop(columns=drop_cols) for frame in (X, X_test))

In [6]:
# Keep the test identifiers for the submission file; the id column itself is not a feature.
pid_test = test["PassengerId"]

# Drop the identifier and one-hot encode the remaining non-numeric columns.
X = pd.get_dummies(X.drop(columns=["PassengerId"]))
X_test = pd.get_dummies(X_test.drop(columns=["PassengerId"]))

# Align train / test columns on the training layout: dummy columns missing
# from the test frame are added and filled with 0.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)


In [7]:

# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Random forest baseline: 300 trees with no depth cap, at least 10 samples
# per leaf, sqrt(n_features) candidate features per split, and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=10,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

rf.fit(X_train, y_train)

# Accuracy on the held-out validation split.
val_pred = rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_pred))

Validation Accuracy: 0.7872340425531915


In [9]:
# Predict the test rows with the fitted forest.
test_pred = rf.predict(X_test)

# Pair each test id with its prediction, cast back from integer labels to booleans.
submission = pd.DataFrame({
    "PassengerId": pid_test,
    "Transported": test_pred.astype(bool)
})

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (otherwise a fresh checkout fails here).
submission_path = Path("./submission/v1_rf.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)


In [10]:
# Label each importance score from the fitted forest with its training
# column name, then rank from most to least important.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_imp = importances.sort_values(ascending=False)

# Show the 15 highest-ranked features.
print(feat_imp.head(15))

CryoSleep_True               0.166847
Spa                          0.134766
CryoSleep_False              0.123047
RoomService                  0.113485
VRDeck                       0.110282
FoodCourt                    0.096630
ShoppingMall                 0.078278
Age                          0.062138
HomePlanet_Earth             0.045645
HomePlanet_Europa            0.034221
HomePlanet_Mars              0.012632
Destination_55 Cancri e      0.007452
Destination_TRAPPIST-1e      0.005696
Destination_PSO J318.5-22    0.002751
CryoSleep_Unknown            0.002606
dtype: float64
