# SVM and Decision Tree Classification Exercise

This notebook provides an exercise for training and evaluating:
- Support Vector Machine (SVM) with hyperparameter tuning (GridSearchCV).
- Decision Tree classifier, with tree visualization and feature importance.

We use the Titanic dataset (contains both numerical and categorical features) as a practical example.

In [169]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
import matplotlib.pyplot as plt

In [170]:
# 1. Load dataset
df = sns.load_dataset('titanic')
df.dropna()
# Select features and target, drop missing values
numeric_features = ['age', 'fare', 'sibsp', 'parch']
categorical_features = ['sex', 'embarked']
df = df[numeric_features + categorical_features + ['survived']]

x = df[numeric_features + categorical_features]

y = df['survived']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       714 non-null    float64
 1   fare      891 non-null    float64
 2   sibsp     891 non-null    int64  
 3   parch     891 non-null    int64  
 4   sex       891 non-null    object 
 5   embarked  889 non-null    object 
 6   survived  891 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [171]:
x = df.drop(columns=['survived'])
y = df['survived']

In [172]:
# 2. Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)
print(f"Train shape: {x_train.shape}, Test shape: {x_test.shape}")
print(f"Train target shape: {y_train.shape}, Test target shape: {y_test.shape}")


Train shape: (712, 6), Test shape: (179, 6)
Train target shape: (712,), Test target shape: (179,)


In [173]:
# Encode categorical features
x_train=pd.get_dummies(x_train, columns=categorical_features, drop_first=True)
x_test=pd.get_dummies(x_test, columns=categorical_features, drop_first=True)




In [177]:
# 3. Preprocess features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train[numeric_features])
x_test_scaled = scaler.transform(x_test[numeric_features])



In [178]:
print(x_train.columns)
print(x_test.columns)


Index(['age', 'fare', 'sibsp', 'parch', 'sex_male', 'embarked_Q',
       'embarked_S'],
      dtype='object')
Index(['age', 'fare', 'sibsp', 'parch', 'sex_male', 'embarked_Q',
       'embarked_S'],
      dtype='object')


In [179]:
print(x_train)

      age      fare  sibsp  parch  sex_male  embarked_Q  embarked_S
692   NaN   56.4958      0      0      True       False        True
481   NaN    0.0000      0      0      True       False        True
527   NaN  221.7792      0      0      True       False        True
855  18.0    9.3500      0      1     False       False        True
801  31.0   26.2500      1      1     False       False        True
..    ...       ...    ...    ...       ...         ...         ...
359   NaN    7.8792      0      0     False        True       False
258  35.0  512.3292      0      0     False       False       False
736  48.0   34.3750      1      3     False       False        True
462  47.0   38.5000      0      0      True       False        True
507   NaN   26.5500      0      0      True       False        True

[712 rows x 7 columns]


In [180]:
# 4. Train SVM with GridSearchCV
param_grid = {
    'linear': {'kernel': ['linear'],'C': [0.1, 1, 10, 100]},
    'poly': {'kernel': ['poly'],'C': [0.1, 1, 10, 100],'degree': [2, 3, 4]},
    'sigmoid': {'kernel': ['sigmoid'],'C': [0.1, 1, 10, 100],'gamma': [0.001, 0.01, 0.1, 1]},
    'rbf': {'kernel': ['rbf'],'C': [0.1, 1, 10, 100],'gamma': [0.001, 0.01, 0.1, 1]},
}



In [182]:
from sklearn.svm import SVC
model=SVC(probability=True)
model.fit(x_train_scaled, y_train)

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# 5. Evaluate SVM on test set
y_pred=model.predict(X_test)
y_pred_proba=model.predict_proba(X_train)[:,1]

accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)
roc_auc=roc_auc_score(y_test,y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")  





NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# 6. Train baseline Decision Tree



In [None]:
# 7. Inspect tree rules



NameError: name 'dt' is not defined

In [None]:
# 8. Plot Decision Tree


In [None]:
# 9. Evaluate Decision Tree


In [None]:
# 10. Feature Importances


In [None]:
# 11. Hyperparameter Tuning for Decision Tree


In [None]:
# 12. Evaluate Tuned Decision Tree
y_pred_dt_best = best_dt.predict(X_test_enc)
