Titanic dataset: The mystery of the Titanic

Problem: Predict survival on the Titanic using the training data, and inspect the test data for model evaluation.

Dataset files: train.csv, test.csv


In [None]:
import pandas as pd
import numpy as np
import io
from IPython.display import display

# Locations of the two CSV files named in the dataset metadata
train_path, test_path = 'train.csv', 'test.csv'

# Placeholders so both names exist before any loading is attempted
df = test_df = None


def _read_csv_or_empty(csv_path, ok_msg, fail_msg):
    """Read ``csv_path`` into a DataFrame and report the outcome on stdout.

    On success, prints ``ok_msg`` followed by the frame's shape. If anything
    raises, prints ``fail_msg``, the path and the error, and returns an empty
    DataFrame instead so the cells that follow can still run.
    """
    try:
        frame = pd.read_csv(csv_path)
        print(ok_msg, frame.shape)
    except Exception as err:
        print(fail_msg, csv_path, ':', err)
        frame = pd.DataFrame()
    return frame


# Best-effort loads: a failure is reported, not raised
df = _read_csv_or_empty(
    train_path, 'Training data loaded:', 'Error loading training data from'
)
test_df = _read_csv_or_empty(
    test_path, 'Test data loaded:', 'Error loading test data from'
)


In [None]:
# Preview the top rows of the training frame; report instead when nothing was loaded
if df is None or df.empty:
    print('Training data is empty or not loaded.')
else:
    display(df.head())


In [None]:
# Column-level overview of the training data, as produced by DataFrame.info()
import io

# DataFrame.info() writes into this buffer; its contents are then printed
buf = io.StringIO()
if df is None or df.empty:
    print('No training data to describe.')
else:
    df.info(buf=buf)
    print(buf.getvalue())


In [None]:
# Descriptive statistics over every column (include='all') of the training data
if df is None or df.empty:
    print('No training data to summarize.')
else:
    display(df.describe(include='all'))


In [None]:
# Dataset shape, columns, data types, and missing values
if df is not None and not df.empty:
    shape = df.shape
    n_cols = shape[1]
    cols = df.columns.tolist()
    dtypes = df.dtypes
    missing = df.isnull().sum()  # per-column count of null entries
    print('Dataset shape:', shape)
    print('Number of columns:', n_cols)
    print('Columns:', cols)
    print('Data types:\n', dtypes)
    print('Missing values per column:\n', missing)

    # Target detection: take the first column whose lower-cased name is a
    # known label name. 'survived' is listed because the Titanic label column
    # is 'Survived'; without it no candidate matches and the last-column
    # fallback below would report an ordinary feature column as the target.
    possible_targets = ['survived', 'target', 'label', 'class', 'y']
    target_col = next(
        # str() keeps the comparison safe for non-string column labels
        (c for c in df.columns if str(c).lower() in possible_targets),
        None,
    )
    if target_col is None:
        # Last resort when no known label name is present
        target_col = df.columns[-1]
    print('Suspected target variable:', target_col)
else:
    print('No data to inspect.')


In [None]:
# Pick the compute device: 'cuda' when torch imports and reports CUDA, else 'cpu'
try:
    import torch
    cuda_ok = torch.cuda.is_available()
except Exception:
    # Any failure while importing or probing torch is treated as "no GPU"
    cuda_ok = False

device = 'cuda' if cuda_ok else 'cpu'
print('Detected device:', device)


In [None]:
# Compile dataset summary: gather the facts printed by the earlier cells into
# a single dict (`dataset_summary`) and print it.
import io

shape = df.shape if df is not None else (0, 0)
# NOTE: despite its name, this holds the number of columns, not their names
columns = df.shape[1] if (df is not None) else 0
missing_values = df.isnull().sum().to_dict() if (df is not None) else {}
sample_data = df.head().to_dict(orient='records') if (df is not None) else []
# describe()/info() are only computed when the frame is non-empty
statistical_summary = df.describe(include='all') if (df is not None and not df.empty) else None
info = ''
if df is not None and not df.empty:
    buf2 = io.StringIO()
    df.info(buf=buf2)  # capture the info() text instead of printing it
    info = buf2.getvalue()

# Target variable detection: first column whose lower-cased name is a known
# label name, otherwise the last column. 'survived' is included because the
# Titanic label column is 'Survived'; without it no candidate matches and the
# fallback would record an ordinary feature column as the target.
target_variable = None
if df is not None and not df.empty:
    known_targets = ('survived', 'target', 'label', 'class', 'y')
    target_variable = next(
        # str() keeps the comparison safe for non-string column labels
        (col for col in df.columns if str(col).lower() in known_targets),
        df.columns[-1],
    )

# Device from earlier cell; default to 'cpu' if that cell was never run
try:
    device
except NameError:
    device = 'cpu'

dataset_summary = {
    "shape": shape,
    "columns": columns,
    "missing_values": missing_values,
    "sample_data": sample_data,
    "statistical_summary": statistical_summary,
    "info": info,
    "device": device,
    "target_variable": target_variable,
}
print('Dataset summary:')
print(dataset_summary)
