In [1]:
import pandas as pd
import numpy as np

# Load data
# Both CSV splits live in the same directory; keeping that location in one
# constant means the notebook can be pointed at another copy of the data by
# editing a single line instead of every read_csv call.
DATA_DIR = '/home/data'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Quick orientation: frame sizes, column names, and the balance of the target.
# value_counts(normalize=True) reports proportions rather than raw counts.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
print(f'\nColumns: {train.columns.tolist()}')
print(f'\nTarget distribution:\n{train["Survived"].value_counts(normalize=True)}')
# Last expression of the cell, so the notebook renders the first rows.
train.head()

Train shape: (891, 12)
Test shape: (418, 11)

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Per-column null counts for both splits, then the training frame's dtypes.
# Each heading and its table go through one print call, separated by a newline.
print('Missing values in train:', train.isna().sum(), sep='\n')
print('\nMissing values in test:', test.isna().sum(), sep='\n')
print('\nData types:', train.dtypes, sep='\n')

Missing values in train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values in test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [4]:
# Key survival patterns
# Survived is 0/1, so its per-group mean is the survival rate of that group.
# Every (heading, column) pair is reported the same way, one after another.
for header, column in (
    ('Survival by Sex:', 'Sex'),
    ('\nSurvival by Pclass:', 'Pclass'),
    ('\nSurvival by Embarked:', 'Embarked'),
):
    print(header)
    print(train.groupby(column)['Survived'].mean())

Survival by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Survival by Embarked:
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [6]:
# Derive a Title column from Name, then summarise it.
# The pattern captures the run of letters that sits between a space and a
# literal period; expand=False makes extract return a Series, not a frame.
title_pattern = ' ([A-Za-z]+)\\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)

# How often each title occurs.
print('Title distribution:', train['Title'].value_counts(), sep='\n')

# Survival rate and group size per title, largest groups first.
title_stats = train.groupby('Title')['Survived'].agg(['mean', 'count'])
print('\nSurvival by Title:', title_stats.sort_values('count', ascending=False), sep='\n')

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

Survival by Title:
              mean  count
Title                    
Mr        0.156673    517
Miss      0.697802    182
Mrs       0.792000    125
Master    0.575000     40
Dr        0.428571      7
Rev       0.000000      6
Mlle      1.000000      2
Major     0.500000      2
Col       0.500000      2
Capt      0.000000      1
Lady      1.000000      1
Countess  1.000000      1
Don       0.000000      1
Jonkheer  0.000000      1
Mme       1.000000      1
Ms        1.000000      1
Sir       1.000000      1


In [7]:
# Family size = siblings/spouses + parents/children + the passenger themself.
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)

# Survival rate and number of passengers for each family size.
print(
    'Survival by FamilySize:',
    train.groupby('FamilySize')['Survived'].agg(['mean', 'count']),
    sep='\n',
)

Survival by FamilySize:
                mean  count
FamilySize                 
1           0.303538    537
2           0.552795    161
3           0.578431    102
4           0.724138     29
5           0.200000     15
6           0.136364     22
7           0.333333     12
8           0.000000      6
11          0.000000      7


In [8]:
# Check Fare distribution and correlation with survival
print('Fare statistics:')
print(train['Fare'].describe())
print('\nSurvival by Fare quartiles:')
# qcut makes four quantile-based bins of Fare; the new column is categorical.
train['FareQuartile'] = pd.qcut(train['Fare'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
# Grouping by a categorical column without an explicit `observed=` raises a
# pandas FutureWarning because the default is changing. observed=False pins
# the current behaviour (every category is listed, even if it were empty)
# and keeps the cell output free of warning noise.
print(train.groupby('FareQuartile', observed=False)['Survived'].mean())

Fare statistics:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

Survival by Fare quartiles:
FareQuartile
Q1    0.197309
Q2    0.303571
Q3    0.454955
Q4    0.581081
Name: Survived, dtype: float64


  print(train.groupby('FareQuartile')['Survived'].mean())


In [None]:
# Age analysis
print('Age statistics:')
print(train['Age'].describe())
print('\nSurvival by Age groups:')
# Bucket Age into five labelled bands using the bin edges below. Rows with a
# missing Age get a missing AgeGroup and therefore drop out of the groupby.
train['AgeGroup'] = pd.cut(train['Age'], bins=[0, 16, 32, 48, 64, 100], labels=['Child', 'Young', 'Middle', 'Senior', 'Elderly'])
# AgeGroup is categorical, so pass `observed=` explicitly: leaving it out
# raises a pandas FutureWarning. observed=False keeps today's behaviour of
# listing every band, including any that happen to contain no passengers.
print(train.groupby('AgeGroup', observed=False)['Survived'].agg(['mean', 'count']))