# Titanic Survival Prediction Model Development

## 1. Load Dataset

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

# Load the dataset
# Path of the CSV to read; when it is absent a synthetic stand-in is generated
# so the rest of the notebook can still run end to end.
data_path = 'titanic.csv'

# Selected features (model inputs) and the label column to predict
selected_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

if os.path.exists(data_path):
    print(f"Loading data from {data_path}...")
    df = pd.read_csv(data_path)
else:
    print("titanic.csv not found. Generating synthetic dataset for demonstration...")
    # Generate synthetic data mimicking Titanic dataset.
    # NOTE: the labels are drawn independently of the features, so a model
    # trained on this fallback has nothing real to learn; it only exercises
    # the pipeline.
    np.random.seed(42)  # fixed seed -> the same synthetic frame on every run
    n_samples = 891
    df = pd.DataFrame({
        'Pclass': np.random.choice([1, 2, 3], n_samples, p=[0.24, 0.21, 0.55]),
        'Sex': np.random.choice(['male', 'female'], n_samples),
        'Age': np.random.normal(30, 14, n_samples).astype(int),
        'Fare': np.random.exponential(32, n_samples),
        'Embarked': np.random.choice(['S', 'C', 'Q'], n_samples, p=[0.7, 0.2, 0.1]),
        'Survived': np.random.randint(0, 2, n_samples)
    })
    # The normal draw can produce zero or negative ages; floor them at 1.
    # Vectorized clip replaces a per-element Python lambda.
    df['Age'] = df['Age'].clip(lower=1)

# Fail early, with a readable message, if the loaded file lacks any column
# that the following steps index by name.
missing_columns = [col for col in selected_features + [target] if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {missing_columns}")

print(df[selected_features].head())

## 2. Preprocessing & Feature Engineering

In [None]:
# Separate the model inputs from the label column.
X = df[selected_features]
y = df[target]

# Column groups: continuous columns are scaled, the rest are one-hot encoded
# (Pclass is numeric in the data but handled as a category here).
numerical_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

## 3. Train Model (Logistic Regression)

In [None]:
# Classifier: logistic regression with a fixed seed and a raised iteration cap.
model = LogisticRegression(random_state=42, max_iter=1000)

# Chain preprocessing and the classifier so the imputers, scaler and encoder
# are fitted on the training split only, together with the model.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both splits, so the test metrics are not skewed by
# an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)
print("Model trained.")

## 4. Evaluate Model

In [None]:
# Predict on the held-out split, then print the classification report
# comparing those predictions against the true labels.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## 5. Save Model

In [None]:
# Bind the output filename once so the file written and the message printed
# cannot drift apart.
model_path = 'titanic_survival_model.pkl'

# Persist the whole fitted pipeline (preprocessing + classifier), so it can be
# reloaded and applied to raw feature columns directly.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(clf, model_path)
print(f"Model saved to {model_path}")