In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Resolve the three input CSVs relative to the directory the kernel was
# started in: filepath1 -> training data, filepath2 -> test data,
# filepath3 -> reference labels for the test rows.
pwd = os.getcwd()
filepath1, filepath2, filepath3 = (
    os.path.join(pwd, relative_path)
    for relative_path in (
        'dataset/titanic.csv',
        'dataset/test.csv',
        'dataset/gender_submission.csv',
    )
)

In [3]:
# Load the inputs in one pass: `df` is the training frame, `test_df` the
# frame to predict on, and `nas_df` the frame whose "Survived" column is
# later compared against the predictions.
df, test_df, nas_df = map(pd.read_csv, (filepath1, filepath2, filepath3))

In [4]:
def data_cleaning(df):
    """Return a cleaned copy of a passenger frame; the input is not modified.

    Steps, in order:

    1. Drop the ``Unnamed: 0``, ``Name``, ``Cabin`` and ``Ticket`` columns.
       Columns that are absent are skipped, so an already-cleaned frame can
       be passed again without raising.
    2. Drop every row whose ``Embarked`` value is missing. The surviving
       rows keep their original index labels, which lets callers map
       results back to the source rows.
    3. Fill missing values in *every* numeric column with that column's
       median, computed on the frame being cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Must contain an ``Embarked`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned data.

    Raises
    ------
    KeyError
        If ``df`` has no ``Embarked`` column.
    """
    # Non-inplace operations plus an explicit copy: the caller's frame stays
    # intact and the column assignment below never writes through to it.
    cleaned = (
        df.drop(columns=['Unnamed: 0', 'Name', 'Cabin', 'Ticket'], errors='ignore')
        .dropna(subset=["Embarked"])
        .copy()
    )

    numeric_cols = cleaned.select_dtypes(include=[np.number]).columns
    # Nothing to impute (and nothing the imputer could be fitted on) when
    # there are no numeric columns or no rows left.
    if len(numeric_cols) == 0 or cleaned.empty:
        return cleaned

    # NOTE(review): the medians come from whichever frame is passed in, so a
    # test frame is imputed with its own statistics rather than the training
    # set's - confirm this is intended.
    imputer = SimpleImputer(strategy="median")
    imputed = imputer.fit_transform(cleaned[numeric_cols])
    cleaned[numeric_cols] = pd.DataFrame(
        imputed, columns=numeric_cols, index=cleaned.index
    )
    return cleaned

In [5]:
def one_hot_encode(df):
    """Return a new frame with ``Sex`` and ``Embarked`` one-hot encoded.

    The two categorical columns are replaced by one indicator column per
    observed category, named ``<column>_<category>`` and appended after the
    remaining columns. Row index labels are preserved. The input frame is
    not modified.

    A fresh encoder is fitted on every call, so the indicator columns
    reflect only the categories present in ``df``: two frames with different
    category sets yield different columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Sex`` and ``Embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``Sex``/``Embarked``, plus the indicator columns.

    Raises
    ------
    KeyError
        If ``Sex`` or ``Embarked`` is missing from ``df``.
    """
    categorical_cols = ["Sex", "Embarked"]

    cat_encoder = OneHotEncoder(sparse_output=False)
    encoded = cat_encoder.fit_transform(df[categorical_cols])
    indicators = pd.DataFrame(
        encoded,
        columns=cat_encoder.get_feature_names_out(categorical_cols),
        index=df.index,  # keep rows aligned with the source frame for concat
    )

    # drop() returns a new frame, so the caller's columns are left in place.
    return pd.concat([df.drop(columns=categorical_cols), indicators], axis=1)

In [6]:
# Encode the training frame, then split it into the label vector and the
# feature matrix (every column except "Survived").
# NOTE: `df` is overwritten with its encoded version, so this cell cannot be
# run twice without reloading the data - the second call would no longer
# find the "Sex"/"Embarked" columns.
# NOTE(review): the training frame is encoded without going through
# data_cleaning, unlike the test frame - presumably titanic.csv is already
# cleaned; confirm.
df = one_hot_encode(df)
y_train = df["Survived"]
X_train = df.drop(columns=["Survived"])

In [7]:
# Run the test frame through cleaning and encoding. Working on a copy keeps
# the raw `test_df` (and its PassengerId column) available for the
# submission step.
test_df_cleaned = test_df.copy().pipe(data_cleaning)
test_df_encoded = test_df_cleaned.pipe(one_hot_encode)

In [8]:
# Standardise features using statistics learned from the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# one_hot_encode fits a separate encoder on each frame, so the test frame is
# not guaranteed to carry the same indicator columns (or column order) as
# X_train. Align it to the training layout before scaling:
#   * an indicator column absent from the test frame means no test row has
#     that category, so it is filled with 0;
#   * columns that exist only in the test frame are dropped;
#   * any other missing column is a real preprocessing mismatch and is
#     reported instead of being silently zero-filled.
missing_in_test = X_train.columns.difference(test_df_encoded.columns)
not_indicators = [
    col for col in missing_in_test
    if not str(col).startswith(("Sex_", "Embarked_"))
]
if not_indicators:
    raise ValueError(
        f"Test features are missing training columns: {not_indicators}"
    )
X_test_aligned = test_df_encoded.reindex(columns=X_train.columns, fill_value=0.0)
X_test_scaled = scaler.transform(X_test_aligned)

In [9]:
# 5-fold cross-validation of a logistic regression on the scaled training
# features; one score per fold plus their mean are printed.
# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training set, so each validation fold has already influenced the scaling
# statistics - consider scaling inside the CV loop if this matters.
cv_scores = cross_val_score(
    estimator=LogisticRegression(max_iter=1000),
    X=X_train_scaled,
    y=y_train,
    cv=5,
)
print("Cross-validation accuracies:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Cross-validation accuracies: [0.78089888 0.78651685 0.78089888 0.76966292 0.8079096 ]
Mean CV accuracy: 0.7851774265219325


In [10]:
# Fit the final model on the full training set and predict the test rows.
# The estimator is configured exactly like the one that was cross-validated
# (max_iter=1000) so the reported CV scores describe the model that is
# actually used for the predictions.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
predictions = log_reg.predict(X_test_scaled)

In [12]:
# Compare the predictions with the reference labels.
# data_cleaning may drop test rows (missing "Embarked"), in which case
# `predictions` is shorter than `nas_df`. Selecting the labels by the index
# of the rows that were actually predicted keeps both sides the same length.
# Assumes nas_df rows are in the same order as test_df - TODO confirm.
# NOTE(review): the source file is named gender_submission.csv, which looks
# like a sample submission rather than ground truth; if so, this number is
# agreement with that baseline, not real test accuracy - confirm.
y_true = nas_df.loc[test_df_encoded.index, "Survived"]
accuracy = accuracy_score(y_true, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.937799043062201


In [17]:
# Build the submission file: one row per predicted passenger.
# The IDs are taken from the raw test frame (original integer values) but
# only for the rows that survived cleaning, so they always line up
# one-to-one with `predictions` even if data_cleaning dropped some rows.
predicted_ids = test_df.loc[test_df_encoded.index, 'PassengerId']
submission_df = pd.DataFrame({
    'PassengerId': predicted_ids,            # original IDs of the predicted rows
    'Survived': predictions.astype(int),     # model output as 0/1 integers
})

submission_df.to_csv('submission.csv', index=False)