### Titanic Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.stats as stats
import seaborn as sns

In [2]:
# Fetch seaborn's "titanic" example dataset; every later cell works from this frame.
titanic = sns.load_dataset("titanic")

In [None]:
# Baseline: share of each `survived` class (0 / 1) in the full dataset.
survival_shares = titanic['survived'].value_counts(normalize=True)
print(survival_shares)

survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [4]:
# Stratified sampling on `survived`: draw one subsample per requested size
# and keep them in `samples`, keyed by size, for the cells that follow.

sample_sizes = [30, 100, 300]
samples = {}

for size in sample_sizes:
    # An integer `test_size` asks for that absolute number of rows in the
    # second ("test") partition; the first partition is not needed here.
    # `stratify` keeps the survived / not-survived ratio of the full data.
    _, sample = train_test_split(
        titanic, test_size=size, stratify=titanic['survived'], random_state=42
    )
    samples[size] = sample

    # Show the class balance actually achieved in this subsample.
    class_shares = sample['survived'].value_counts(normalize=True)
    print(f"\nSample size {size} class distribution:", class_shares, sep="\n")



Sample size 30 class distribution:
survived
0    0.6
1    0.4
Name: proportion, dtype: float64

Sample size 100 class distribution:
survived
0    0.62
1    0.38
Name: proportion, dtype: float64

Sample size 300 class distribution:
survived
0    0.616667
1    0.383333
Name: proportion, dtype: float64


In [5]:
# Confidence Intervals for survival rate in each sample

def proportion_confidence_interval(successes, n, confidence=0.95):
    """Normal-approximation (Wald) confidence interval for a proportion.

    Parameters
    ----------
    successes : int
        Number of observations counted as a success; must satisfy
        ``0 <= successes <= n``.
    n : int
        Total number of observations; must be positive.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(p, lower, upper)`` where ``p = successes / n`` and the bounds are
        ``p -/+ z * sqrt(p * (1 - p) / n)``, clipped to ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``n`` is not positive, ``successes`` is outside ``[0, n]``, or
        ``confidence`` is not strictly between 0 and 1.
    """
    # Fail loudly on inputs that would otherwise divide by zero or yield NaN.
    if n <= 0:
        raise ValueError("n must be a positive number of observations")
    if not 0 <= successes <= n:
        raise ValueError("successes must lie between 0 and n")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    p = successes / n
    # Two-sided critical value, e.g. ~1.96 for confidence=0.95.
    z = stats.norm.ppf(1 - (1 - confidence) / 2)
    margin = z * np.sqrt(p * (1 - p) / n)

    # A proportion cannot leave [0, 1]; the raw Wald bounds can when p is
    # near 0 or 1 and n is small, so clip them.
    lower = max(0.0, p - margin)
    upper = min(1.0, p + margin)
    return p, lower, upper

# For every stratified sample, estimate the survival rate, attach a 95%
# interval, and report how wide that interval is.
for size, sample in samples.items():
    successes = sample['survived'].sum()  # survived is 0/1, so the sum counts survivors
    n = len(sample)
    p, lower, upper = proportion_confidence_interval(successes, n)
    width = upper - lower
    print(
        f"\nSample size: {size}",
        f"Survival proportion: {p:.3f}",
        f"95% Confidence Interval: ({lower:.3f}, {upper:.3f})",
        f"Width of Confidence Interval: {width:.3f}",
        sep="\n",
    )



Sample size: 30
Survival proportion: 0.400
95% Confidence Interval: (0.225, 0.575)
Width of Confidence Interval: 0.351

Sample size: 100
Survival proportion: 0.380
95% Confidence Interval: (0.285, 0.475)
Width of Confidence Interval: 0.190

Sample size: 300
Survival proportion: 0.383
95% Confidence Interval: (0.328, 0.438)
Width of Confidence Interval: 0.110
