In [1]:
import pandas as pd
import numpy as np

# Read both CSV splits into DataFrames
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Overview: dimensions of each split, the training column names, and the
# relative frequency of each value of the Survived target
train_columns = train.columns.tolist()
target_share = train["Survived"].value_counts(normalize=True)

print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
print(f'\nTrain columns: {train_columns}')
print(f'\nTarget distribution:\n{target_share}')

# Last expression of the cell: rich display of the first rows
train.head()

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Per-column null counts, then per-column dtypes
missing_per_column = train.isnull().sum()
column_dtypes = train.dtypes

print("Missing values:")
print(missing_per_column)
print(f"\nData types:\n{column_dtypes}")

# Mean of Survived within each group; Survived takes values 0/1, so the
# group mean is the fraction of that group that survived
survival_by_sex = train.groupby('Sex')['Survived'].mean()
survival_by_pclass = train.groupby('Pclass')['Survived'].mean()

print(f"\n\nSurvival by Sex:")
print(survival_by_sex)
print(f"\nSurvival by Pclass:")
print(survival_by_pclass)

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Survival by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64


In [3]:
# Feature engineering examples
import re

# Title extraction
def get_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in
    "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (for example NaN or None for
        a missing name) are treated as having no title.

    Returns
    -------
    str
        The matched title without its trailing period, or "" when no title
        is found.
    """
    # Guard against missing names: re.search raises TypeError on non-strings.
    if not isinstance(name, str):
        return ""
    # Raw string literal: "\." in a plain string is an invalid escape sequence
    # that recent Python versions warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Derive one title per passenger name and store it as a new column
extracted_titles = train['Name'].apply(get_title)
train['Title'] = extracted_titles

# How often each title occurs
title_counts = train['Title'].value_counts()
print("Title distribution:")
print(title_counts)

# Survival by title, highest rate first
survival_by_title = train.groupby('Title')['Survived'].mean()
print("\nSurvival rate by Title:")
print(survival_by_title.sort_values(ascending=False))

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

Survival rate by Title:
Title
Countess    1.000000
Ms          1.000000
Lady        1.000000
Mme         1.000000
Mlle        1.000000
Sir         1.000000
Mrs         0.792000
Miss        0.697802
Master      0.575000
Major       0.500000
Col         0.500000
Dr          0.428571
Mr          0.156673
Capt        0.000000
Don         0.000000
Jonkheer    0.000000
Rev         0.000000
Name: Survived, dtype: float64


In [4]:
# Family size analysis
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself -- confirm against the dataset description)
family_size = train['SibSp'] + train['Parch'] + 1
train['FamilySize'] = family_size
# IsAlone is 1 when FamilySize equals 1, otherwise 0
train['IsAlone'] = (family_size == 1).astype(int)

# Survival rate and group size for every distinct FamilySize
family_size_summary = train.groupby('FamilySize')['Survived'].agg(['mean', 'count'])
print("Survival by FamilySize:")
print(family_size_summary)

# Survival rate for IsAlone = 0 versus IsAlone = 1
survival_by_alone = train.groupby('IsAlone')['Survived'].mean()
print("\nSurvival by IsAlone:")
print(survival_by_alone)

Survival by FamilySize:
                mean  count
FamilySize                 
1           0.303538    537
2           0.552795    161
3           0.578431    102
4           0.724138     29
5           0.200000     15
6           0.136364     22
7           0.333333     12
8           0.000000      6
11          0.000000      7

Survival by IsAlone:
IsAlone
0    0.505650
1    0.303538
Name: Survived, dtype: float64
