In [1]:
# [1] Steps:
# 1. Understand the shape of the data (histograms, box plots, etc.)
# 2. Data cleaning
# 3. Data exploration
# 4. Feature engineering
# 5. Data preprocessing for the model
# 6. Basic model building
# 7. Model tuning
# 8. Ensemble model building
# 9. Results

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Read the training and test CSV files from the current working directory.
train_csv_path = './titanic_train.csv'
test_csv_path = './titanic_test.csv'
titanic_data = pd.read_csv(train_csv_path)
titanic_test = pd.read_csv(test_csv_path)

In [4]:
# Remove the 'Cabin' column from both the training and the test frame.
cabin_column = ['Cabin']
titanic_data = titanic_data.drop(columns=cabin_column)
titanic_test = titanic_test.drop(columns=cabin_column)

In [5]:
# Replace missing 'Age' values with the median age of the respective frame.
#
# The filled column is assigned back to the DataFrame rather than calling
# fillna(inplace=True) on the df['Age'] selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and silently stops updating the
# DataFrame once copy-on-write is the default (pandas 3.0).
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test['Age'].median())

In [6]:
# Replace missing 'Embarked' values with the most frequent value (the mode)
# of the respective frame; mode() returns a Series, so [0] picks its first entry.
#
# The filled column is assigned back to the DataFrame rather than calling
# fillna(inplace=True) on the df['Embarked'] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and silently stops updating the
# DataFrame once copy-on-write is the default (pandas 3.0).
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])
titanic_test['Embarked'] = titanic_test['Embarked'].fillna(titanic_test['Embarked'].mode()[0])

In [7]:
# Cast the 'Survived' target column to strings; all other columns keep their dtype.
titanic_data = titanic_data.astype({'Survived': str})

In [8]:
# Encode the categorical 'Sex' and 'Embarked' columns as integer codes in both
# the training and the test frame.
#
# DataFrame.replace() with a str -> int mapping relies on pandas silently
# downcasting the resulting object column to integers. That downcasting is
# deprecated (FutureWarning since pandas 2.2) and, once removed, leaves the
# columns as object dtype. An explicit per-column map avoids it.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}


def _encode_categories(column, codes):
    """Return `column` with every value found in `codes` swapped for its code.

    Values that are not keys of `codes` (including NaN and codes that were
    already applied) are passed through unchanged, so re-running the cell
    does not alter the data.
    """
    return column.map(lambda value: codes.get(value, value))


for frame in (titanic_data, titanic_test):
    frame['Sex'] = _encode_categories(frame['Sex'], SEX_CODES)
    frame['Embarked'] = _encode_categories(frame['Embarked'], EMBARKED_CODES)

In [9]:
# Remove the 'Name' and 'Ticket' columns from both frames.
dropped_columns = ['Name', 'Ticket']
titanic_data = titanic_data.drop(columns=dropped_columns)
titanic_test = titanic_test.drop(columns=dropped_columns)

In [10]:
# Separate the target (Y) from the feature matrix (X).
# NOTE(review): no earlier cell drops 'PassengerId', so it presumably stays in
# X as a model feature - confirm this is intended.
target_column = 'Survived'
Y = titanic_data[target_column]
X = titanic_data.drop(columns=target_column)

In [11]:
# Fill any remaining missing values with the per-column median of each frame.
feature_medians = X.median()
X = X.fillna(feature_medians)
test_medians = titanic_test.median()
titanic_test = titanic_test.fillna(test_medians)

In [12]:
# Hold out 20% of the labelled rows for validation; random_state fixes the shuffle.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

In [13]:
# Initialise the RandomForestClassifier and fit it on the training split.
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 2}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, Y_train)

In [14]:
# Accuracy of the fitted model on the rows it was trained on.
X_train_prediction = rf_model.predict(X=X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
train_accuracy_label = 'Accuracy score of training data (Random Forest): '
print(train_accuracy_label, training_data_accuracy)

Accuracy score of training data (Random Forest):  0.952247191011236


In [15]:
# Accuracy of the fitted model on the held-out validation split.
X_test_prediction = rf_model.predict(X=X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
test_accuracy_label = 'Accuracy score of test data (Random Forest): '
print(test_accuracy_label, test_data_accuracy)

Accuracy score of test data (Random Forest):  0.7653631284916201


In [16]:
# 5-fold cross-validation over the full labelled data; report the mean fold score.
cross_val_scores = cross_val_score(estimator=rf_model, X=X, y=Y, cv=5)
mean_cv_accuracy = cross_val_scores.mean()
print('Cross-validated accuracy:', mean_cv_accuracy)

Cross-validated accuracy: 0.8037097482894986


In [17]:
# Predict survival labels for the unlabelled test frame with the fitted forest.
test_predictions_rf = rf_model.predict(X=titanic_test)

In [18]:
# Assemble the submission table: one row per test passenger, pairing its id
# with the predicted label.
submission_columns = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': test_predictions_rf,
}
submission_rf = pd.DataFrame(submission_columns)

In [19]:
# Write the submission table to disk without the DataFrame index column.
submission_path = 'titanic_submission_rf.csv'
submission_rf.to_csv(path_or_buf=submission_path, index=False)