In [1]:
import pandas as pd
import numpy as np

# Read the train and test CSV files into DataFrames.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the dimensions of both frames, the training columns, and the
# relative frequency of each value of the 'Survived' target.
print('Train shape:', train.shape)
print('Test shape:', test.shape)
column_names = train.columns.tolist()
print('\nColumns:', column_names)
print('\nTarget distribution:')
survived_share = train['Survived'].value_counts(normalize=True)
print(survived_share)

Train shape: (891, 12)
Test shape: (418, 11)

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Show the dtype of every training column, then the per-column count of
# missing values for the train frame followed by the test frame.
print('Data types:')
print(train.dtypes)
missing_reports = (
    ('\nMissing values in train:', train),
    ('\nMissing values in test:', test),
)
for header, frame in missing_reports:
    print(header)
    print(frame.isna().sum())

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Missing values in train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values in test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [3]:
# Mean of 'Survived' within each level of a few categorical features.
# Each entry pairs the printed header with the column to group on.
survival_views = (
    ('Survival by Sex:', 'Sex'),
    ('\nSurvival by Pclass:', 'Pclass'),
    ('\nSurvival by Embarked:', 'Embarked'),
)
for header, column in survival_views:
    print(header)
    print(train.groupby(column)['Survived'].mean())

Survival by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Survival by Embarked:
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [4]:
# Extract Title from Name: capture the run of letters that follows a space
# and is immediately followed by a literal period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot without triggering Python's "invalid escape sequence" warning.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print('Title distribution:')
print(train['Title'].value_counts())
print('\nSurvival by Title:')
# Mean of 'Survived' and group size per title, largest groups first.
print(train.groupby('Title')['Survived'].agg(['mean', 'count']).sort_values('count', ascending=False))

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

Survival by Title:
              mean  count
Title                    
Mr        0.156673    517
Miss      0.697802    182
Mrs       0.792000    125
Master    0.575000     40
Dr        0.428571      7
Rev       0.000000      6
Mlle      1.000000      2
Major     0.500000      2
Col       0.500000      2
Capt      0.000000      1
Lady      1.000000      1
Countess  1.000000      1
Don       0.000000      1
Jonkheer  0.000000      1
Mme       1.000000      1
Ms        1.000000      1
Sir       1.000000      1


In [5]:
# Family size: SibSp plus Parch, plus one.
relatives_aboard = train['SibSp'] + train['Parch']
train['FamilySize'] = relatives_aboard + 1
# IsAlone is 1 when FamilySize equals 1, otherwise 0.
train['IsAlone'] = train['FamilySize'].eq(1).astype(int)

print('Survival by FamilySize:')
survived_by_size = train.groupby('FamilySize')['Survived']
print(survived_by_size.agg(['mean', 'count']))

print('\nSurvival by IsAlone:')
survived_by_alone = train.groupby('IsAlone')['Survived']
print(survived_by_alone.mean())

Survival by FamilySize:
                mean  count
FamilySize                 
1           0.303538    537
2           0.552795    161
3           0.578431    102
4           0.724138     29
5           0.200000     15
6           0.136364     22
7           0.333333     12
8           0.000000      6
11          0.000000      7

Survival by IsAlone:
IsAlone
0    0.505650
1    0.303538
Name: Survived, dtype: float64


In [None]:
# Has_Cabin is 1 where the 'Cabin' value is present and 0 where it is missing.
cabin_present = train['Cabin'].notnull()
train['Has_Cabin'] = cabin_present.astype(int)
print('Survival by Has_Cabin:')
survived_by_cabin = train.groupby('Has_Cabin')['Survived']
print(survived_by_cabin.agg(['mean', 'count']))

# Preview the first ten rows of the raw and derived columns side by side.
preview_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'Title', 'FamilySize', 'Has_Cabin', 'Fare']
print('\nSample data:')
print(train[preview_columns].head(10))