In [1]:
import pandas as pd
import numpy as np

# Read the training and test splits from their CSV files.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# First look at both frames: dimensions, training column names, the share of
# each Survived class, and the per-column null counts. Each item goes on its
# own line, exactly as separate print() calls would emit them.
print(
    f'Train shape: {train.shape}',
    f'Test shape: {test.shape}',
    f'\nTrain columns: {train.columns.tolist()}',
    f'\nTarget distribution:\n{train["Survived"].value_counts(normalize=True)}',
    f'\nMissing values in train:\n{train.isnull().sum()}',
    f'\nMissing values in test:\n{test.isnull().sum()}',
    sep='\n',
)

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

Missing values in train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values in test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [2]:
# Key feature analysis - mean of Survived within each level of a few
# categorical features. Each entry pairs the heading to print with the
# column to group by.
for heading, column in (
    ("Survival by Sex:", 'Sex'),
    ("\nSurvival by Pclass:", 'Pclass'),
    ("\nSurvival by Embarked:", 'Embarked'),
):
    print(heading)
    print(train.groupby(column)['Survived'].mean())

# Extract titles
import re
def get_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, so
    "Braund, Mr. Owen Harris" yields "Mr".

    Args:
        name: Passenger name. Anything that is not a ``str`` (for example a
            NaN standing in for a missing value, or ``None``) is treated as
            having no title instead of raising inside ``re.search``.

    Returns:
        The matched title without its trailing period, or "" when the name
        contains no such pattern.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: in a plain literal '\.' is an invalid escape sequence,
    # which Python reports as a DeprecationWarning/SyntaxWarning.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Add a Title column derived from each passenger's Name.
train['Title'] = train['Name'].apply(get_title)

# How many passengers carry each title.
title_counts = train['Title'].value_counts()
print("\nTitle distribution:", title_counts, sep='\n')

# Mean of Survived per title, highest first.
title_survival = train.groupby('Title')['Survived'].mean()
print("\nSurvival by Title:", title_survival.sort_values(ascending=False), sep='\n')

Survival by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Survival by Embarked:
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

Survival by Title:
Title
Countess    1.000000
Ms          1.000000
Lady        1.000000
Mme         1.000000
Mlle        1.000000
Sir         1.000000
Mrs         0.792000
Miss        0.697802
Master      0.575000
Major       0.500000
Col         0.500000
Dr          0.428571
Mr          0.156673
Capt        0.000000
Don         0.000000
Jonkheer    0.000000
Rev         0.000

In [3]:
# Family size analysis: SibSp + Parch relatives plus the passenger themselves.
train['FamilySize'] = 1 + train['SibSp'] + train['Parch']
family_summary = train.groupby('FamilySize')['Survived'].agg(['mean', 'count'])
print("Survival by FamilySize:", family_summary, sep='\n')

# Age analysis: summary statistics of the Age column.
print("\nAge statistics:", train['Age'].describe(), sep='\n')

# Correlation of each numeric feature with the target (Survived).
# Sex is mapped to 0/1 first so it can take part in the correlation.
train['Sex_encoded'] = train['Sex'].map({'female': 0, 'male': 1})
numeric_cols = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
print("\nCorrelation with Survived:")
for col in numeric_cols:
    corr = train[col].corr(train['Survived'])
    print(f"{col}: {corr:.3f}")

Survival by FamilySize:
                mean  count
FamilySize                 
1           0.303538    537
2           0.552795    161
3           0.578431    102
4           0.724138     29
5           0.200000     15
6           0.136364     22
7           0.333333     12
8           0.000000      6
11          0.000000      7

Age statistics:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

Correlation with Survived:
Pclass: -0.338
Sex_encoded: -0.543
Age: -0.077
SibSp: -0.035
Parch: 0.082
Fare: 0.257
FamilySize: 0.017


In [4]:
# Additional feature exploration

# Ticket analysis: the first 20 raw ticket strings.
print("Sample tickets:", train['Ticket'].head(20), sep='\n')

# Ticket frequency (group travelers): how many distinct tickets occur more
# than once, and the largest number of rows sharing one ticket.
ticket_counts = train['Ticket'].value_counts()
print(
    f"\nTickets with multiple passengers: {(ticket_counts > 1).sum()}",
    f"Max passengers on same ticket: {ticket_counts.max()}",
    sep='\n',
)

# Cabin analysis: the first 10 Cabin values that are not missing.
has_cabin = train['Cabin'].notna()
print("\nCabin examples (non-null):", train.loc[has_cabin, 'Cabin'].head(10), sep='\n')

# Data types of every column currently in the frame.
print("\nData types:", train.dtypes, sep='\n')

Sample tickets:
0            A/5 21171
1             PC 17599
2     STON/O2. 3101282
3               113803
4               373450
5               330877
6                17463
7               349909
8               347742
9               237736
10             PP 9549
11              113783
12           A/5. 2151
13              347082
14              350406
15              248706
16              382652
17              244373
18              345763
19                2649
Name: Ticket, dtype: object

Tickets with multiple passengers: 134
Max passengers on same ticket: 7

Cabin examples (non-null):
1             C85
3            C123
6             E46
10             G6
11           C103
21            D56
23             A6
27    C23 C25 C27
31            B78
52            D33
Name: Cabin, dtype: object

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch          