In [31]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def preprocess_data(file_path):
    # Load data from CSV
    df = pd.read_csv(file_path)
    
    # Fill missing ages with the median age
    df['Age'].fillna(df['Age'].median(), inplace=True)
    
    # Fill missing Embarked values with the mode
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    # Create a new feature for FamilySize (including the passenger)
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    
    # Categorize FamilySize into groups (Alone, Small, Large)
    df["FamilyGroup"] = pd.cut(
        df["FamilySize"],
        bins=[0, 1, 4, np.inf],
        labels=["Alone", "Small", "Large"]
    )
    
    # Create a binary feature where 1 indicates the passenger is alone
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
    
    # Combine Pclass and FamilySize to capture potential interaction effects
    df["Pclass_FamilySize"] = df["Pclass"] * df["FamilySize"]
    
    # Extract titles from the Name using a regular expression
    df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    
    # Replace some of the less common or similar titles
    df["Title"] = df["Title"].replace(["Rev", "Dr", "Col", "Major"], "Rare")
    df["Title"] = df["Title"].replace(["Mlle", "Ms"], "Miss")
    df["Title"] = df["Title"].replace("Mme", "Mrs")
    
    # Map the titles to numerical values
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df["Title"] = df["Title"].map(title_mapping)
    
    # Convert Sex into a binary variable: 0 for male and 1 for female
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    
    # One-hot encode the Embarked column
    df = pd.get_dummies(df, columns=["Embarked"], prefix="Embarked")
    
    return df

In [32]:
df_train = preprocess_data('train.csv')
df_test = preprocess_data('test.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [33]:
X_train = df_train[['Pclass', 'Sex']].values
y_train = df_train[['Survived']].values

X_test = df_test[['Pclass', 'Sex']].values


In [34]:
# Using several depths of decision tree to predict
depths = [2, 3, 4, 5]
for depth in depths:
    clf = tree.DecisionTreeClassifier(max_depth=depth, criterion='entropy', random_state=42)
    clf.fit(X_train, y_train)

    # Predict on validation set
    y_pred = clf.predict(X_test).reshape(-1)
    
    # Print results
    print(f"\nMax Depth = {depth}")
    print(f"  Precision: {y_pred}")


Max Depth = 2
  Precision: [0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 0 0 0 1 0 0 1 0 0 0]

Max Depth = 3
  Precision: [0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0