# Predict survival on the Titanic

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv("data\\train.csv")
test_data = pd.read_csv("data\\test.csv")

# Selecting features to use for model training
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# Combine train and test data to process missing values and encode categorical features
combined_data = pd.concat([train_data[features], test_data[features]])

# Replace missing values in numeric features with the average value
numerical_imputer = SimpleImputer(strategy='mean')
combined_data[['Age', 'Fare']] = numerical_imputer.fit_transform(combined_data[['Age', 'Fare']])

# Convert to a numerical format. 0 for 'male' and 1 for 'female'
le = LabelEncoder()
combined_data['Sex'] = le.fit_transform(combined_data['Sex'])

X_train = combined_data[:len(train_data)]
X_test = combined_data[len(train_data):]

X = X_train
y = train_data['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluating the accuracy of the model RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy on validation set: {accuracy}')

predictions = model.predict(X_test)

# Result
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output

Accuracy on validation set: 0.8156424581005587


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [2]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_val)
accuracy_logistic = accuracy_score(y_val, y_pred_logistic)
print(f'Accuracy with Logistic Regression: {accuracy_logistic}')

Accuracy with Logistic Regression: 0.8100558659217877


In [3]:
from sklearn.svm import SVC

svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_val)
accuracy_svm = accuracy_score(y_val, y_pred_svm)
print(f'Accuracy with Support Vector Machine (SVM): {accuracy_svm}')

Accuracy with Support Vector Machine (SVM): 0.6536312849162011


In [4]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)
accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
print(f'Accuracy with XGBoost: {accuracy_xgb}')

Accuracy with XGBoost: 0.8044692737430168
