In [1]:
!pip install pandas scikit-learn



In [2]:
# Record the library versions this notebook runs against (one per line)
import pandas as pd
import sklearn

print(pd.__version__, sklearn.__version__, sep="\n")


2.2.2
1.5.1


In [4]:
# Import required libraries
import pandas as pd  # For handling and analyzing data
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.ensemble import RandomForestClassifier  # Machine learning algorithm (Random Forest)
from sklearn.metrics import accuracy_score, classification_report  # For evaluating model performance
import warnings
warnings.filterwarnings('ignore')  # Suppress warning messages for cleaner output


# Load the Titanic dataset (expects 'titanic.csv' in the notebook's working directory)
titanic_data = pd.read_csv('titanic.csv')

# Rows without a 'Survived' label cannot be used for training or evaluation
titanic_data = titanic_data.dropna(subset=['Survived'])


# Features (independent variables) used for prediction
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# Build the feature matrix with .assign() so that X is a new DataFrame.
# Writing columns directly into `titanic_data[[...]]` is an assignment on a
# slice of another frame, which is the pattern pandas' SettingWithCopyWarning
# flags.
# 'Sex' is encoded as female → 0, male → 1 (any other value becomes NaN).
X = titanic_data[FEATURE_COLUMNS].assign(
    Sex=lambda features: features['Sex'].map({'female': 0, 'male': 1})
)

# Target (dependent variable)
y = titanic_data['Survived']


# Split data into training and testing sets BEFORE imputing:
# 80% for training, 20% for testing; random_state=42 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fill missing 'Age' values with the median age of the TRAINING rows only.
# Computing the median on the full dataset would let information from the test
# rows leak into preprocessing and bias the evaluation.
# Note: X itself keeps its missing ages; only the two splits are imputed.
train_age_median = X_train['Age'].median()
X_train = X_train.assign(Age=X_train['Age'].fillna(train_age_median))
X_test = X_test.assign(Age=X_test['Age'].fillna(train_age_median))


# Build a Random Forest of 100 trees (n_estimators=100) with a fixed seed
# (random_state=42) for reproducible results, and train it on the training
# data. fit() returns the estimator itself, so construction and training are
# chained into a single statement.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)


# Predict survival labels for the held-out test rows
y_pred = rf_classifier.predict(X_test)


# Score the test-set predictions against the true labels:
#   - a per-class report (precision, recall, F1-score, support)
#   - the overall fraction of correct predictions
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Show accuracy first (two decimals), then the detailed report
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)


# Take the first passenger of the test set as a one-row DataFrame
# (kept two-dimensional because predict() expects a table of rows)
sample = X_test.head(1)

# Show that passenger's features as a plain {column: value} mapping
sample_dict = X_test.iloc[0].to_dict()
print(f"\nSample Passenger: {sample_dict}")

# Ask the trained model for this passenger's outcome and print it in
# human-readable form (label 1 means survived)
prediction = rf_classifier.predict(sample)
print(f"Predicted Survival: {'Survived' if prediction[0] == 1 else 'Did Not Survive'}")


Accuracy: 0.80

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Sample Passenger: {'Pclass': 3.0, 'Sex': 1.0, 'Age': 28.0, 'SibSp': 1.0, 'Parch': 1.0, 'Fare': 15.2458}
Predicted Survival: Did Not Survive


In [None]:
# MODEL OUTPUT EXPLANATION

# Accuracy: 0.80
# The model correctly predicted 80% of the passengers' survival outcomes in the test set.
# This means 8 out of every 10 predictions were correct — a solid score for this dataset.

# Classification Report
# The report breaks down model performance for each class:
#   Class 0 = Did Not Survive
#   Class 1 = Survived
#
# precision – of those predicted as a class, how many were correct
# recall    – of the true class examples, how many were found
# f1-score  – balances precision and recall
# support   – number of real examples for that class
#
# Summary:
# - Class 0 (Did Not Survive): predicted very accurately (F1 = 0.83)
# - Class 1 (Survived): slightly less accurate (F1 = 0.75)
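# Overall accuracy = 80%. Performance is not perfectly balanced: recall for
# survivors (class 1) is 0.73, so roughly a quarter of actual survivors are
# predicted as "Did Not Survive".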

# Sample Passenger Prediction
# Example passenger:
#   {'Pclass': 3.0, 'Sex': 1.0, 'Age': 28.0, 'SibSp': 1.0, 'Parch': 1.0, 'Fare': 15.2458}
#
# Interpreted as:
#   - 3rd class male
#   - 28 years old
#   - Traveling with 1 sibling/spouse and 1 parent/child
#   - Ticket fare £15.25
#
# Model Prediction → "Did Not Survive"
# This is consistent with the well-known pattern that male passengers in the
# lower classes had among the lowest survival rates.

# Summary
# - Random Forest accuracy: ~80%
# - Strong prediction power on overall survival trends
# - Gender and class are expected to be strong drivers of survival, but this
#   notebook does not measure that; inspect rf_classifier.feature_importances_
#   to confirm.