# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# --- 1. Load the Dataset ---
# Read the Titanic training data from 'train.csv' in the current working
# directory into `df`. If the file is missing, explain how to obtain it and
# stop with a non-zero exit status.
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: 'train.csv' not found.")
    print("Please download it from Kaggle (https://www.kaggle.com/c/titanic/data) and place it in the same folder.")
    # `exit()` is an interactive-shell helper injected by the `site` module:
    # it is not guaranteed to exist (e.g. `python -S`) and, called without an
    # argument, reports success. Raising SystemExit works everywhere and
    # signals the failure through the exit status.
    raise SystemExit(1) from None

# --- 2. Preprocess the Data ---

# Categorical columns ('Sex', 'Embarked') have to become numbers and missing
# values ('Age', 'Embarked') have to be filled before the model can be fitted.

# Select relevant features for prediction
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

# Work on a copy so the raw dataframe `df` stays untouched
df_processed = df[features + [target]].copy()

# Split settings. They are shared by the look-ahead split used for imputation
# and by the real split in section 4: with the same number of rows, test size
# and integer seed, both calls select exactly the same rows.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# A. Fill missing values (Imputation)
# The fill values are computed from the training rows only. Deriving them from
# the whole dataset would leak test-set information into training and make
# the reported accuracy optimistic.
row_positions = list(range(len(df_processed)))
train_positions, _ = train_test_split(
    row_positions, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
train_rows = df_processed.iloc[train_positions]

# Average age and most common port among the training passengers
age_fill = train_rows['Age'].mean()
embarked_fill = train_rows['Embarked'].mode()[0]

df_processed['Age'] = df_processed['Age'].fillna(age_fill)
df_processed['Embarked'] = df_processed['Embarked'].fillna(embarked_fill)

# B. Encode categorical features into numbers
# One encoder per column, so each keeps its own category <-> integer mapping
# (a single shared encoder only remembers the column it was fitted on last).
# Fitting on the full column merely fixes that mapping and avoids a failure on
# a category that happens to be absent from the training rows.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
df_processed['Sex'] = sex_encoder.fit_transform(df_processed['Sex'])
df_processed['Embarked'] = embarked_encoder.fit_transform(df_processed['Embarked'])
# Backward-compatible alias: `le` previously ended up fitted on 'Embarked'.
le = embarked_encoder

# --- 3. Define Features (X) and Target (y) ---
X = df_processed[features]
y = df_processed[target]

# --- 4. Split Data into Training and Testing Sets ---
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print("---")

# --- 5. Implement and Train the Naive Bayes Classifier ---
# Gaussian Naive Bayes assumes the features follow a normal (Gaussian)
# distribution. fit() returns the estimator itself, so the classifier is
# created and trained on the training data in a single expression.
model = GaussianNB().fit(X_train, y_train)

# --- 6. Make Predictions and Evaluate ---
# Predict the held-out test rows, which were not used for fitting
y_pred = model.predict(X_test)

# Overall accuracy plus the per-class text report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Did not survive (0)', 'Survived (1)'],
)

# --- 7. Display Results ---
# Each entry is printed on its own line, in order
for line in (
    "Naive Bayes Classifier Results",
    "=" * 30,
    f"Accuracy: {accuracy * 100:.2f}%",
    "\nClassification Report:",
    report,
):
    print(line)

# Show the first ten actual/predicted pairs side by side
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("\nExample Predictions:")
print(results.head(10))