Week 3

In [None]:
# Import required libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

In [None]:
# Read the Titanic passenger data from the CSV file in the working directory
df = pd.read_csv("train.csv")

# Report the (rows, columns) tuple, then preview the first five records
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.head()

In [None]:
# Choose the model inputs and the prediction target

# Input columns used by the model
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Column to predict: 1 = survived, 0 = did not survive
target = "Survived"

# Keep only the rows that actually have a label; a row without a target
# value cannot be used for fitting or scoring.
has_label = df[target].notna()
df = df[has_label]

# X holds the inputs, y holds the labels (same row order and index)
X, y = df[features], df[target]

X.head()

In [None]:
# Group the input columns by type so each group can be preprocessed differently

# Columns holding numbers
num_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Columns holding string categories
cat_features = ["Sex", "Embarked"]

In [None]:
# Assemble the preprocessing steps and the final model

# Numeric columns: a single step that fills missing values with the column median
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill missing values with the most frequent category,
# then expand each category into its own binary indicator column.
# handle_unknown="ignore" encodes a category not seen during fit as all
# zeros instead of raising at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# Full pipeline: preprocessing followed by an ordinary least-squares fit.
# NOTE(review): the target is a 0/1 label, so LinearRegression acts as a
# linear probability model here and its outputs are not bounded to [0, 1].
# Consider a classifier such as LogisticRegression if that matters.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [None]:
# Partition the data into a training part and a held-out test part

# 80% of the rows are used to fit the model; the remaining 20% are kept
# aside to measure performance on data the model has not seen.
# The fixed random_state makes the shuffle (and so the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the (rows, columns) of each input partition
for label, frame in [("Training set size:", X_train), ("Test set size:", X_test)]:
    print(label, frame.shape)


In [None]:
# Train (fit) the model on the training split only
model.fit(X_train, y_train)
print("Model training complete!")

# Predict survival scores on the held-out test data.
# These are raw linear-regression outputs, so they are continuous and are
# not guaranteed to lie inside [0, 1].
y_pred = model.predict(X_test)

# Compute the R² score (coefficient of determination):
#   1.0  = perfect prediction
#   0.0  = no better than always predicting the mean of y_test
#   < 0  = worse than always predicting the mean
r2 = r2_score(y_test, y_pred)
print(f"Model R² score on test set: {r2:.3f}")

# The target is a 0/1 label, so R² alone is hard to interpret. Also report
# the fraction of test passengers classified correctly when a score of at
# least 0.5 is read as "survived".
y_pred_label = (y_pred >= 0.5).astype(int)
accuracy = (y_test == y_pred_label).mean()
print(f"Accuracy at 0.5 threshold on test set: {accuracy:.3f}")

In [None]:
# Make a sample prediction

# A one-row frame whose column names match the feature columns the
# pipeline selects by name.
example_passenger = pd.DataFrame([{
    "Pclass": 3,
    "Sex": "male",
    "Age": 25,
    "SibSp": 0,
    "Parch": 0,
    "Fare": 7.25,
    "Embarked": "S"
}])

# predict() returns one value per input row; take the only one.
raw_prediction = float(model.predict(example_passenger)[0])

# A linear regression output is unbounded, so it can fall below 0 or above 1.
# Clamp it to [0, 1] so the value reported as a likelihood is always valid.
predicted_survival = min(1.0, max(0.0, raw_prediction))

print(f"Predicted likelihood of survival: {predicted_survival:.2f}")