In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the train and test CSV files into DataFrames
train_path = '/home/data/train.csv'
test_path = '/home/data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report the (rows, columns) shape of each frame
train_shape = train.shape
test_shape = test.shape
print("Train shape:", train_shape)
print("Test shape:", test_shape)

# List the training column names, then preview the first rows
train_columns = train.columns.tolist()
print("\nTrain columns:", train_columns)
print("\nFirst few rows:")
train_preview = train.head()
print(train_preview)

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          P

In [2]:
# Basic data info: column dtypes / non-null counts, missing values per
# column, and the class balance of the `Survived` target.
print("=== TRAIN DATA INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
train.info()
print("\n=== MISSING VALUES ===")
# Count of null entries in each column
print(train.isnull().sum())
print("\n=== TARGET DISTRIBUTION ===")
# normalize=True reports each target value as a fraction of all rows
print(train['Survived'].value_counts(normalize=True))
# Mean of the Survived column, formatted to three decimals
print(f"Survival rate: {train['Survived'].mean():.3f}")

=== TRAIN DATA INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

=== MISSING VALUES ===
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
C

In [3]:
# Feature analysis: correlation of each numeric column with the target,
# followed by value counts and per-group survival summaries for Sex and Pclass.
print("=== FEATURE CORRELATIONS WITH TARGET ===")
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
survived = train['Survived']
for feature in numeric_features:
    # Skip any listed feature that is not present in the frame
    if feature not in train.columns:
        continue
    corr = train[feature].corr(survived)
    print(f"{feature}: {corr:.3f}")

# Aggregations reported for every group: group size, sum of Survived, mean of Survived
summary_stats = ['count', 'sum', 'mean']

print("\n=== CATEGORICAL FEATURES ===")
print("Sex distribution:")
sex_counts = train['Sex'].value_counts()
print(sex_counts)
print(f"\nSurvival by Sex:")
survival_by_sex = train.groupby('Sex')['Survived'].agg(summary_stats)
print(survival_by_sex)

print("\nPclass distribution:")
pclass_counts = train['Pclass'].value_counts()
print(pclass_counts)
print(f"\nSurvival by Pclass:")
survival_by_pclass = train.groupby('Pclass')['Survived'].agg(summary_stats)
print(survival_by_pclass)

=== FEATURE CORRELATIONS WITH TARGET ===
Pclass: -0.338
Age: -0.077
SibSp: -0.035
Parch: 0.082
Fare: 0.257

=== CATEGORICAL FEATURES ===
Sex distribution:
Sex
male      577
female    314
Name: count, dtype: int64

Survival by Sex:
        count  sum      mean
Sex                         
female    314  233  0.742038
male      577  109  0.188908

Pclass distribution:
Pclass
3    491
1    216
2    184
Name: count, dtype: int64

Survival by Pclass:
        count  sum      mean
Pclass                      
1         216  136  0.629630
2         184   87  0.472826
3         491  119  0.242363
