In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# ----------------------------
# 1. Load the Titanic dataset
# ----------------------------
# The CSV must already be present at this path in the runtime
# (upload or extract Titanic-Dataset.csv before running this cell).
csv_path = "/content/Titanic-Dataset.csv"
data = pd.read_csv(csv_path)

# Quick sanity check of what was loaded
preview = data.head()
print("First 5 rows:\n", preview)

First 5 rows:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [3]:
# Separate the label column ("Survived") from the remaining columns.
# drop() returns a new frame, so `data` itself is left untouched.
X = data.drop(columns=["Survived"])
y = data["Survived"]

In [4]:
# ----------------------------
# 2. Identify columns
# ----------------------------
# Columns treated as numbers (imputed, then scaled)
numeric_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Columns treated as categories (imputed, then one-hot encoded)
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
]

In [5]:
# ----------------------------
# 3. Preprocessing steps
# ----------------------------

# Numeric branch: median-impute missing values, then standardize
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),  # replace missing values with the column median
    ("scaler", StandardScaler()),                   # rescale to zero mean, unit variance
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [6]:
# Categorical branch: impute with the most frequent value, then one-hot encode
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),  # replace missing values with the mode
    ("encoder", OneHotEncoder(handle_unknown="ignore")),   # categories unseen during fit encode as all zeros
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [7]:
# Route each column group through its own transformer.
# Each entry is (name, transformer, columns it applies to).
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [8]:
# ----------------------------
# 4. Build pipeline with model
# ----------------------------
# Chain preprocessing and a logistic-regression classifier into a single
# estimator so the same transformations run at both fit and predict time.
log_reg = LogisticRegression(max_iter=500)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", log_reg)])


In [9]:
# ----------------------------
# 5. Train/Test split for evaluation
# ----------------------------
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then predict the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_val)

# Fraction of validation rows classified correctly
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.7988826815642458
