In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# ## Load Data
# Both CSV files are read from the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print("Training data shape:", train_data.shape)
# A header and its value are emitted by one print call; sep="\n" puts the
# value on its own line, exactly as two consecutive print calls would.
print("\nFirst few rows:", train_data.head(), sep="\n")

# ## Exploratory Data Analysis
# Per-column count of null cells in the training frame.
print("\n=== Missing Values ===", train_data.isnull().sum(), sep="\n")

print("\n=== Survival Statistics ===")
print("Overall survival rate:", train_data['Survived'].mean())
# Mean of the Survived column within each Sex group, then each Pclass group.
print(
    "\nSurvival by Sex:",
    train_data.groupby('Sex')['Survived'].mean(),
    "\nSurvival by Pclass:",
    train_data.groupby('Pclass')['Survived'].mean(),
    sep="\n",
)

# ## Baseline Strategy: Rule-Based Probability Approach
# 
# We use simple rules based on observed survival patterns:
# - Women had much higher survival rates (~74%)
# - Men had much lower survival rates (~19%)
# - First class passengers had higher survival rates
# 
# Our baseline predicts survival from each Sex and Pclass combination.

# Mean of Survived for every (Sex, Pclass) group in the training frame.
# The result is indexed by (Sex, Pclass) pairs and serves as the lookup
# table for predict_baseline.
survival_probs = (
    train_data
    .groupby(['Sex', 'Pclass'])['Survived']
    .mean()
)
print("\n=== Survival Probabilities by Sex and Pclass ===", survival_probs, sep="\n")

# Function to predict based on probability threshold
def predict_baseline(row, probs=None, threshold=0.5, default_prob=None):
    """
    Predict survival for one passenger from sex and passenger class.

    The (Sex, Pclass) pair of ``row`` is looked up in a table of group
    survival rates; the passenger is predicted to survive when that rate
    is strictly greater than ``threshold``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'Sex' and
            'Pclass' entries.
        probs: Series of survival rates indexed by (Sex, Pclass). When
            None, the module-level ``survival_probs`` is used.
        threshold: Cut-off the group rate must exceed to predict survival.
        default_prob: Rate used when the (Sex, Pclass) pair is not in
            ``probs``. When None, the mean of ``train_data['Survived']``
            is used.

    Returns:
        1 if the looked-up rate is greater than ``threshold``, else 0.
    """
    # Fall back to the notebook-level table so that
    # ``df.apply(predict_baseline, axis=1)`` keeps working unchanged.
    if probs is None:
        probs = survival_probs

    group = (row['Sex'], row['Pclass'])

    if group in probs.index:
        prob = probs[group]
    else:
        # Unseen combination: use the supplied default, or the overall
        # training survival rate (computed only when actually needed).
        if default_prob is None:
            default_prob = train_data['Survived'].mean()
        prob = default_prob

    # Strict comparison: a rate exactly equal to the threshold predicts 0.
    return 1 if prob > threshold else 0

# ## Predictions on Training Data
# Apply the rule-based baseline to every training row.
train_data['Predicted'] = train_data.apply(predict_baseline, axis=1)

# ## Performance Evaluation on Training Data
# Each metric is called with (actual, predicted), in the order
# accuracy -> confusion matrix -> F1.
accuracy, conf_matrix, f1 = (
    metric(train_data['Survived'], train_data['Predicted'])
    for metric in (accuracy_score, confusion_matrix, f1_score)
)

print(
    "BASELINE PERFORMANCE ON TRAINING DATA",
    f"Accuracy: {accuracy:.4f}",
    "\nConfusion Matrix:",
    conf_matrix,
    f"\nF1 Score: {f1:.4f}",
    sep="\n",
)

# Breakdown of predictions
print("\nPrediction breakdown:", train_data['Predicted'].value_counts(), sep="\n")
print("\nActual survival breakdown:", train_data['Survived'].value_counts(), sep="\n")

# ## Generate Predictions for Test Data
# The predicted label is stored on the test frame under 'Survived'.
test_data['Survived'] = test_data.apply(predict_baseline, axis=1)

# Create submission file: a two-column copy holding the passenger id and
# the predicted label.
submission = test_data[['PassengerId', 'Survived']].copy()
submission.to_csv('baseline_submission.csv', index=False)

print("\nTest predictions breakdown:", submission['Survived'].value_counts(), sep="\n")

# ## Summary
# The leading "\n" argument plus the separator gives the same two blank
# lines that a standalone print("\n") produces.
print(
    "\n",
    "BASELINE SUMMARY",
    "Method: Probability-based prediction using Sex and Pclass",
    f"Training Accuracy: {accuracy:.4f}",
    f"Training F1 Score: {f1:.4f}",
    "Submission file: baseline_submission.csv",
    sep="\n",
)


Training data shape: (891, 12)

First few rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      