In [1]:
import pandas as pd
import numpy as np

# Read the training and test splits from disk into DataFrames.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the (rows, columns) dimensions of both splits.
for label, frame in (('Train shape:', train), ('Test shape:', test)):
    print(label, frame.shape)

# List the training columns, then show the class balance of the target as
# proportions (normalize=True) rather than raw counts.
print('\nTrain columns:', list(train.columns))
print('\nTarget distribution:', train['Survived'].value_counts(normalize=True), sep='\n')

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Per-column count of missing entries in each split, followed by the
# column dtypes of the training frame.
print('Missing values in train:', train.isna().sum(), sep='\n')
print('\nMissing values in test:', test.isna().sum(), sep='\n')
print('\nData types:', train.dtypes, sep='\n')

Missing values in train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values in test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [4]:
# Survival rate per group: the mean of the 0/1 Survived column within each
# level of the grouping feature.
survival_reports = (
    ('Survival by Sex:', 'Sex'),
    ('\nSurvival by Pclass:', 'Pclass'),
    ('\nSurvival by Embarked:', 'Embarked'),
)
for heading, feature in survival_reports:
    print(heading)
    print(train.groupby(feature)['Survived'].mean())

Survival by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Survival by Embarked:
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [5]:
# Peek at the raw Ticket strings and count how many distinct values occur.
tickets = train['Ticket']
print('Sample Tickets:', tickets.head(10), sep='\n')
print('\nTicket unique values:', tickets.nunique())

# Peek at the Cabin values that are actually present (missing entries dropped).
cabins_present = train['Cabin'].dropna()
print('\nSample Cabins:', cabins_present.head(10), sep='\n')

# Check Title extraction
import re
def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``'Braund, Mr. Owen Harris'``.

    Returns
    -------
    str
        The matched title without the trailing period, or ``""`` when the
        name contains no such pattern.
    """
    # Raw string literal: in a plain string '\.' is an invalid escape
    # sequence, which recent Python versions warn about. The compiled
    # pattern is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Derive a Title column by running get_title over every passenger name,
# then tabulate how often each title occurs.
train['Title'] = train['Name'].map(get_title)
print('\nTitle distribution:', train['Title'].value_counts(), sep='\n')

Sample Tickets:
0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
5              330877
6               17463
7              349909
8              347742
9              237736
Name: Ticket, dtype: object

Ticket unique values: 681

Sample Cabins:
1             C85
3            C123
6             E46
10             G6
11           C103
21            D56
23             A6
27    C23 C25 C27
31            B78
52            D33
Name: Cabin, dtype: object

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64
