In [6]:
import pandas as pd  # For data handling
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.tree import DecisionTreeClassifier  # The machine learning model
from sklearn.metrics import accuracy_score  # For evaluating the model's performance

# Load the dataset from the CSV file.
data = pd.read_csv(r"C:\Users\Sahil\Downloads\Titanic-Dataset.csv")
print(data.head())  # View the first 5 rows to understand the data structure

# Clean the data by removing columns that are not useful.
data = data.drop(['Cabin', 'Name', 'Ticket'], axis=1)

# Fill missing values in 'Age' with the average age.
data['Age'].fillna(data['Age'].mean(), inplace=True)

# Remove any rows with missing values after filling in the 'Age'.
data.dropna(inplace=True)

# Convert the 'Sex' column from text to numbers (male: 0, female: 1).
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Separate the features (X) from the label (y) we want to predict.
X = data.drop('Survived', axis=1)  # All columns except 'Survived'
y = data['Survived']  # The 'Survived' column (0 or 1)

# Spliting the data into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier and train it with the training data.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Use the model to predict survival on the test data.
predictions = model.predict(X_test)
print("Predictions:", predictions)

# Evaluating the predictions by comparing them to the actual outcomes.
accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy:", accuracy)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].mean(), inplace=True)


ValueError: could not convert string to float: 'S'

In [7]:
# 1. Import necessary libraries
import pandas as pd                      # For data manipulation and analysis.
import numpy as np                       # For numerical operations.
from sklearn.tree import DecisionTreeClassifier  # The classification model.
from sklearn.model_selection import train_test_split  # To split the dataset.
from sklearn.metrics import accuracy_score         # To evaluate the model.

# 2. Load the dataset using a raw string to avoid unicode escape issues.
data = pd.read_csv(r"C:\Users\Sahil\Downloads\Titanic-Dataset.csv")

# 3. Data Preprocessing

# (a) Handle missing values:
# Fill missing 'Age' values with the mean age. Notice we assign back to the column to avoid chained assignment.
data['Age'] = data['Age'].fillna(data['Age'].mean())

# For the 'Embarked' column, fill missing values with the most common port (here assumed as 'S')
data['Embarked'] = data['Embarked'].fillna('S')

# (b) Encode categorical variables:
# Convert the 'Sex' column to numeric: male -> 0, female -> 1.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Convert the 'Embarked' column to numeric. Here we map: S -> 0, C -> 1, Q -> 2.
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# (c) Drop columns that are not useful for prediction:
# For this example, we drop columns like 'Name', 'Ticket', and 'Cabin' because they contain non-numeric information.
data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# 4. Define Features and Target Variable:
# We assume 'Survived' is our target variable. All other columns will be features.
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# 5. Split the dataset into training and testing sets:
# We use 80% of the data for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Create and Train the Model:
# Initialize the Decision Tree Classifier.
model = DecisionTreeClassifier()

# Fit the model on the training data.
model.fit(X_train, y_train)

# 7. Make Predictions and Evaluate the Model:
# Use the trained model to predict survival on the test set.
predictions = model.predict(X_test)

# Calculate the accuracy score to evaluate model performance.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.7597765363128491
