In [7]:
# 1. Import necessary libraries
import pandas as pd                      # For data manipulation and analysis.
import numpy as np                       # For numerical operations.
from sklearn.tree import DecisionTreeClassifier  # The classification model.
from sklearn.model_selection import train_test_split  # To split the dataset.
from sklearn.metrics import accuracy_score         # To evaluate the model.

# 2. Load the dataset using a raw string to avoid unicode escape issues.
data = pd.read_csv(r"C:\Users\Sahil\Downloads\Titanic-Dataset.csv")

# 3. Data Preprocessing

# (a) Handle missing values:
# Fill missing 'Age' values with the mean age. Notice we assign back to the column to avoid chained assignment.
data['Age'] = data['Age'].fillna(data['Age'].mean())

# For the 'Embarked' column, fill missing values with the most common port (here assumed as 'S')
data['Embarked'] = data['Embarked'].fillna('S')

# (b) Encode categorical variables:
# Convert the 'Sex' column to numeric: male -> 0, female -> 1.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Convert the 'Embarked' column to numeric. Here we map: S -> 0, C -> 1, Q -> 2.
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# (c) Drop columns that are not useful for prediction:
# For this example, we drop columns like 'Name', 'Ticket', and 'Cabin' because they contain non-numeric information.
data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# 4. Define Features and Target Variable:
# We assume 'Survived' is our target variable. All other columns will be features.
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# 5. Split the dataset into training and testing sets:
# We use 80% of the data for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Create and Train the Model:
# Initialize the Decision Tree Classifier.
model = DecisionTreeClassifier()

# Fit the model on the training data.
model.fit(X_train, y_train)

# 7. Make Predictions and Evaluate the Model:
# Use the trained model to predict survival on the test set.
predictions = model.predict(X_test)

# Calculate the accuracy score to evaluate model performance.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.7597765363128491
