In [11]:
# import packages
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load csv files (relative names, resolved against the current working directory)
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"
GENDER_CSV = "gender_submission.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
gender_df = pd.read_csv(GENDER_CSV)

In [None]:
# ensure csv can be read and take a first look at the training data

# info() prints its report itself, so it needs no display() wrapper
train_df.info()

# A notebook cell only renders its LAST bare expression. Without display(),
# the describe() table below would be computed and silently thrown away.
# display() is provided by the IPython/Jupyter kernel; no import is needed.
display(train_df.describe())

# number of missing values per column
display(train_df.isnull().sum())

In [None]:
# observe graphs

# One entry per figure: (x-axis column, hue column, title).
# hue=None means the bars are not split by a second variable.
count_plots = [
    ('Survived', None, "Survival Count"),
    ('Sex', 'Survived', "Survival by Sex"),
    ('Pclass', 'Survived', "Survival by Passenger Class"),
]

# draw, title and show each count plot as its own figure
for x_col, hue_col, plot_title in count_plots:
    sns.countplot(data=train_df, x=x_col, hue=hue_col)
    plt.title(plot_title)
    plt.show()

# Observation: passengers in higher classes, and women in particular,
# survived at disproportionately higher rates than lower-class passengers and men in general.

In [None]:
# handle missing data

# fill missing "Age" with the median age of the respective set
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# fill missing "Embarked" with the most frequent port of the TRAINING set.
# (The previous version took the mode from test_df, so the training data was
# imputed with a statistic of the test data.)
embarked_mode = train_df['Embarked'].mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
# Same value for the test set, so the label encoder fitted on the training
# ports never meets an unseen NaN; this is a no-op if no test port is missing.
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

# fill missing "Fare" in test set with the test-set median fare
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [19]:
# encode Sex and Embarked into numerical values

from sklearn.preprocessing import LabelEncoder

# A single encoder object is re-fitted for every column, so after the loop
# `label` only holds the mapping learned for the last column ('Embarked').
label = LabelEncoder()

categorical_cols = ('Sex', 'Embarked')
for col in categorical_cols:
    # learn the category -> integer mapping from the training column only ...
    label.fit(train_df[col])
    # ... then apply that same mapping to both sets
    train_df[col] = label.transform(train_df[col])
    test_df[col] = label.transform(test_df[col])

In [None]:
# visual check: the Sex and Embarked columns should now show integer codes
train_df.head(n=5)

In [None]:
# define X (feature matrix) and y (target)

# Columns that must not be fed to the model as features:
#   'Survived'    - the target itself
#   'PassengerId' - a row identifier; the prediction cell removes it from the
#                   test features, so keeping it here made the training and
#                   test feature sets disagree
#   'Name', 'Ticket' - text columns that are not encoded anywhere in this notebook
#   'Cabin'       - NOTE(review): presumably present as a mostly-empty text column
#                   (standard Titanic schema) - confirm against train.csv
non_feature_cols = ['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin']

# errors='ignore' keeps the cell working when a listed column is absent
X = train_df.drop(columns=non_feature_cols, errors='ignore')
y = train_df['Survived']

In [26]:
# split into two sets (training and validation)

from sklearn.model_selection import train_test_split

# hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# logistic regression model training

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# allow the solver up to 1000 iterations; fit() returns the fitted estimator
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# evaluate on the held-out validation set (rows the model was not fitted on)
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy: ", val_accuracy)
print(classification_report(y_val, y_pred))

In [None]:
# predict on the test set and export the submission file

# Take exactly the columns the model was trained on, in the same order,
# straight from test_df. The previous drop + reindex(fill_value=0) silently
# replaced any training feature missing from the test frame with a column of
# zeros. Selecting by label raises a KeyError instead if a feature is absent.
test_features = test_df[list(X.columns)]

preds = model.predict(test_features)

# one row per test passenger: its id and the predicted survival label
output = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': preds
})

# export results (index=False keeps the row index out of the file)
output.to_csv('results.csv', index=False)
print("Submission saved as results.csv")

In [None]:
# Workflow Note
# 1. Load data and packages
# 2. Clean data
# 3. Encode data
# 4. Split data
# 5. Train logistic regression model (a classifier, despite the name)
# 6. Evaluate model
# 7. Predict test set
# 8. Export results