# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the training data
train_df = pd.read_csv('data/train.csv')

# Preprocess the data
# Drop identifier / free-text columns that are not used as features
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Handle missing values in Age and Embarked columns.
# The filled column is assigned back rather than calling
# `train_df[col].fillna(..., inplace=True)`: that chained form mutates a
# temporary object, which leaves train_df unchanged under pandas
# Copy-on-Write (and triggers a FutureWarning on pandas 2.2+).
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Convert categorical variables to dummy variables
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'])

# Split the data into a training set and a validation set
# (80% / 20%, fixed seed so the split is reproducible)
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using standard scaler.
# The scaler is fitted on the training split only and then reused for the
# validation split, so validation statistics do not influence the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Choose a logistic regression model and train it on the training set
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the performance of the model on the validation set
accuracy = model.score(X_val, y_val)
print(f'Accuracy on validation set: {accuracy:.4f}')

# Use the model to make predictions on the testing data
test_df = pd.read_csv('data/test.csv')

# Keep the passenger ids that are actually in the file for the submission,
# captured before the column is dropped, instead of assuming the file holds
# exactly the ids 892..1309 in order.
passenger_ids = test_df['PassengerId']

# Preprocess the testing data
# Drop identifier / free-text columns that are not used as features
test_df = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Handle missing values in Age and Fare columns.
# - Impute with the *training* medians so the test features go through the
#   same preprocessing the scaler and model were fitted with, rather than
#   statistics computed from the test set itself.
# - Assign the result back instead of using a chained
#   `test_df[col].fillna(..., inplace=True)`, which does not modify test_df
#   under pandas Copy-on-Write.
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].median())

# Convert categorical variables to dummy variables
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'])

# Align the columns with the training feature matrix: same columns, same
# order. A dummy column for a category absent from the test file is added
# as all zeros; a column unseen during training is dropped.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

# Scale the data using the scaler fitted on the training split
X_test = scaler.transform(test_df)

# Make predictions on the testing data
predictions = model.predict(X_test)

# Save the predictions to a CSV file
submission_df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
submission_df.to_csv('submission.csv', index=False)
