# Precision-Recall Tradeoff

In [1]:
import pandas as pd
from sklearn import set_config
# Make scikit-learn transformers return pandas DataFrames (with column names)
# from transform / fit_transform, so intermediate results stay readable.
set_config(transform_output="pandas")

In [2]:
# Load the Titanic passenger data into a DataFrame and preview the first rows.
# NOTE(review): this is a machine-specific absolute path; point it at the local
# copy of the CSV before running the notebook on another machine.
titanic_csv_path = "C:/Users/Vic/Desktop/Data Scienece/Datasets-20231016/Dataset_Titanic.csv"
titanic_df = pd.read_csv(titanic_csv_path)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count the missing values in every column.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# prediction target: the "Survived" column (0 = dead, 1 = survived)
y = titanic_df["Survived"]


In [5]:
# Feature matrix: keep only the columns the model is trained on.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
]
X = titanic_df.loc[:, features]

X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,3,male,22.0,1,0
1,1,female,38.0,1,0
2,3,female,26.0,0,0
3,1,female,35.0,1,0
4,3,male,35.0,0,0


In [6]:
# Hold out 20 % of the rows as a test set. The split is stratified on the target
# and uses a fixed seed so that it is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Preprocessing: one-hot encode 'Sex', fill in missing 'Age' values,
# and pass every other column through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

imp = SimpleImputer()  # default imputation strategy
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

ct = ColumnTransformer(
    transformers=[
        ('ohe', ohe, ['Sex']),
        ('imputer', imp, ['Age']),
    ],
    remainder='passthrough',
)

# Preview the transformed training data (this also fits the transformer).
ct.fit_transform(X_train)

Unnamed: 0,ohe__Sex_female,ohe__Sex_male,imputer__Age,remainder__Pclass,remainder__SibSp,remainder__Parch
692,0.0,1.0,29.807687,3,0,0
481,0.0,1.0,29.807687,2,0,0
527,0.0,1.0,29.807687,1,0,0
855,1.0,0.0,18.000000,3,0,1
801,1.0,0.0,31.000000,2,1,1
...,...,...,...,...,...,...
359,1.0,0.0,29.807687,3,0,0
258,1.0,0.0,35.000000,1,0,0
736,1.0,0.0,48.000000,3,1,3
462,0.0,1.0,47.000000,1,0,0


In [8]:
# Model pipeline: preprocessing followed by a decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# define classifier (= ML-model)
# A fixed random_state makes the fitted tree, and therefore every reported
# metric, repeatable across runs. Without it scikit-learn may pick a different
# split whenever several candidate splits are equally good.
clf = DecisionTreeClassifier(random_state=42)

# Chain the column transformer and the classifier so that a single fit/predict
# call applies the preprocessing and the model together.
pipe = Pipeline([
    ('preprocessor', ct),
    ('classifier', clf),
])

In [9]:
# Fit the preprocessing steps and the classifier on the training split.
pipe.fit(X_train, y_train)

In [10]:
# Hard class predictions (0/1) for the held-out test split.
y_pred = pipe.predict(X_test)

In [11]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the default predictions on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.86       110
           1       0.78      0.74      0.76        69

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



# Adjusting the Classification Threshold

In [12]:
# Source: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# Baseline: hard class predictions from the fitted pipeline, before any threshold is moved.
y_pred_new = pipe.predict(X_test)
y_pred_new

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1], dtype=int64)

In [13]:
# get the predicted class probabilities for the test split
# each row: [probability for class 0, probability for class 1]
pipe.predict_proba(X_test)

array([[1.        , 0.        ],
       [0.5       , 0.5       ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.25      , 0.75      ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.5       , 0.5       ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.9137931 , 0.0862069 ],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.9137931 , 0.0862069 ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.9137931 , 0.0862069 ],
       [1.        , 0.        ],
       [0.8       , 0.2       ],
       [0.        , 1.        ],
       [0.6       , 0.4       ],
       [0.        , 1.        ],
       [0.9137931 , 0.0862069 ],
       [0.        , 1.        ],
       [0.6       , 0.4       ],
       [0.9137931 , 0.0862069 ],
       [1.

In [14]:
# get only probabilities for class 0 ("dead"): this is column 0 of predict_proba.
# (The probabilities for class 1, "survived", are in column 1.)
pipe.predict_proba(X_test)[:,0]

array([1.        , 0.5       , 1.        , 1.        , 0.25      ,
       0.        , 0.        , 0.5       , 1.        , 1.        ,
       0.9137931 , 0.66666667, 0.        , 0.        , 1.        ,
       1.        , 1.        , 0.9137931 , 1.        , 0.        ,
       0.9137931 , 1.        , 0.8       , 0.        , 0.6       ,
       0.        , 0.9137931 , 0.        , 0.6       , 0.9137931 ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.9137931 , 0.        ,
       1.        , 0.        , 1.        , 0.        , 0.        ,
       0.91666667, 0.9137931 , 1.        , 0.        , 0.        ,
       0.        , 0.23529412, 0.23529412, 0.875     , 0.33333333,
       0.5       , 1.        , 0.        , 1.        , 1.        ,
       0.25      , 0.        , 0.66666667, 1.        , 1.        ,
       1.        , 0.        , 1.        , 1.        , 1.        ,
       0.        , 0.9137931 , 0.5       , 0.        , 0.66666

In [15]:
# Move the decision threshold away from the default of 0.5.
#
# A passenger is labelled 0 (dead) as soon as the predicted probability of
# class 0 reaches DEAD_THRESHOLD; only the remaining passengers are labelled
# 1 (survived). With a low threshold for "dead", only passengers whose
# predicted probability of dying is very small are predicted to survive.
DEAD_THRESHOLD = 0.1

# Column 0 of predict_proba holds the probability of class 0 (dead).
proba_dead = pipe.predict_proba(X_test)[:, 0]

# Predict 1 (survived) exactly when the "dead" probability stays below the
# threshold. This is the direct form of inverting `(proba_dead >= threshold)`.
y_pred_new = (proba_dead < DEAD_THRESHOLD).astype(int)
y_pred_new

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1])

In [16]:
# Same report as before, now for the predictions made with the moved threshold.
print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       110
           1       0.80      0.64      0.71        69

    accuracy                           0.80       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.80      0.80      0.79       179



In [17]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the moved-threshold predictions:
# rows are the true classes, columns the predicted classes (order: 0, 1).
cm = confusion_matrix(y_test, y_pred_new)
cm

array([[99, 11],
       [25, 44]], dtype=int64)