In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Load the training data and the data to be scored for the submission
titanic_data = pd.read_csv('./titanic_train.csv')
test_data = pd.read_csv('./titanic_test.csv')

# Feature Engineering: add "FamilySize" combining "SibSp" and "Parch"
# (the + 1 presumably counts the passenger themself -- confirm against the data dictionary)
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Drop columns that are not used as model features
titanic_data = titanic_data.drop(columns=['Cabin', 'Name', 'Ticket'])
test_data = test_data.drop(columns=['Cabin', 'Name', 'Ticket'])

# Handle missing values.
# The fill values are computed once, from the training data only, and reused
# for the test data so both frames are imputed with the same constants the
# model sees during fitting.
age_fill = titanic_data['Age'].mean()
embarked_fill = titanic_data['Embarked'].mode()[0]
fare_fill = titanic_data['Fare'].mean()

# DataFrame.fillna with a {column: value} mapping returns a new frame.
# This replaces `df[col].fillna(..., inplace=True)`, which operates on a
# temporary Series: it warns on pandas 2.x and has no effect on the parent
# frame once Copy-on-Write is enabled.
titanic_data = titanic_data.fillna({'Age': age_fill, 'Embarked': embarked_fill})
test_data = test_data.fillna({'Age': age_fill, 'Embarked': embarked_fill, 'Fare': fare_fill})

# Convert categorical columns using one-hot encoding
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'])
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])

# Separate the target column from the feature matrix
Y = titanic_data['Survived']
X = titanic_data.drop(columns=['PassengerId', 'Survived'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Fit a 100-tree random forest on the training portion (fit() returns the estimator itself)
rf_model = RandomForestClassifier(n_estimators=100, random_state=2).fit(X_train, Y_train)

# Accuracy on the rows the model was fitted on
X_train_prediction = rf_model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

# Accuracy on the held-out rows
X_test_prediction = rf_model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

# Report both scores
print('Accuracy score of training data: ', training_data_accuracy)
print('Accuracy score of test data: ', test_data_accuracy)

# Make predictions for test data.
# The training and test frames were one-hot encoded independently, so their
# dummy columns are not guaranteed to match in set or order. Reindexing to the
# columns the model was fitted on (X.columns) puts the features in the same
# order; a dummy column missing from the test frame is added as all zeros, and
# any column the model never saw is dropped.
test_features = (
    test_data
    .drop(columns=['PassengerId'])
    .reindex(columns=X.columns, fill_value=0)
)
test_predictions = rf_model.predict(test_features)

# Create a submission dataframe pairing each PassengerId with its predicted label
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})

# Save the submission to a CSV file (index=False keeps the row index out of the file)
submission.to_csv('titanic_submission2.csv', index=False)


Accuracy score of training data:  0.9859550561797753
Accuracy score of test data:  0.7932960893854749
