In [13]:
import pandas as pd
import numpy as np

# Step 1: read the CSV into a DataFrame and preview both ends of the table.
df = pd.read_csv('titanic.csv')
for banner, preview in (
    ("--- First 5 Records ---", df.head()),
    ("\n--- Last 5 Records ---", df.tail()),
):
    print(banner)
    print(preview)

# Step 2: feature typing is a manual step.
# Inspect the previews above together with df.columns and record each
# column's type (numeric / categorical / text) in the report.

# Step 3: structural overview followed by descriptive statistics.
# df.info() writes its report straight to stdout, so it is not wrapped in
# print(); describe() returns a DataFrame that has to be printed explicitly.
print("\n--- Data Information ---")
df.info()
print("\n--- Statistical Summary ---")
summary_stats = df.describe()
print(summary_stats)

# Step 4: distinct values and frequency of each categorical column.
# For every column listed, show which values occur and how often.
categorical_cols = ['Survived', 'Pclass', 'Sex']
print("\n--- Categorical Data Distribution ---")
for col in categorical_cols:
    column = df[col]
    distinct = column.unique()
    print(f"\nUnique values in {col}: {distinct}")
    frequencies = column.value_counts()
    print(frequencies)

# 5. ML Suitability - Feature Selection
# Identify the target and the input columns that will actually be used.
target = 'Survived'

# The abbreviated names 'SibSp' / 'Parch' are not present in the CSV loaded
# here: its df.head() output shows the spelled-out headers instead. Each
# preferred name is resolved against df.columns so that `features` only lists
# columns that can really be selected; a CSV that does use the short names
# keeps them unchanged.
column_aliases = {
    'SibSp': 'Siblings/Spouses Aboard',
    'Parch': 'Parents/Children Aboard',
}
preferred_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
features = [
    name if name in df.columns else column_aliases.get(name, name)
    for name in preferred_features
]

# Surface any remaining mismatch now instead of as a KeyError further down.
# This is a warning, not an exception, so the exploratory run continues.
unavailable = [name for name in [target, *features] if name not in df.columns]
if unavailable:
    print(f"\nWarning: columns not found in dataset: {unavailable}")

print(f"\nTarget Variable: {target}")
print(f"Input Features: {features}")

# Step 6: dataset size as (rows, columns).
shape = df.shape
print(f"\nDataset Shape: {shape}")
# Observation: the original note assumed 891 rows, which is generally enough
# for basic ML like Random Forest. Confirm the row count against the shape
# printed above, since it depends on which CSV was loaded.

# Step 7: data quality -- missing values per column and class balance.
print("\n--- Missing Values Count ---")
null_counts = df.isnull().sum()
print(null_counts)

# Share of each target class, shown as a percentage, to reveal any imbalance.
print("\n--- Target Variable Balance ---")
class_share = df['Survived'].value_counts(normalize=True)
print(class_share * 100)

--- First 5 Records ---
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  

--- Last 5 Records ---
     Survived  Pclass                        