Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

Loading Data

In [2]:
file = 'C:\\Users\\R44063\\OneDrive - E.ON\\_VSCode\\Projekt Udacity\\data\\raw\\PurchaseOrders.csv'
df = pd.read_csv(file, sep=';', encoding='utf-8') # Good Polish encoding

Display the DataFrame

In [3]:
df

Unnamed: 0,Purchasing Document,Item,Req Delivery Date,Delivery Date,Delivery Completed,Requisitioner,Created By,Supplier,Material Group,Terms of Payment
0,4500093085,10,06.06.2025,06.06.2025,Yes,A72755,M89400,42513214,C492005,Y016
1,4500045759,10,31.12.2024,29.02.2024,No,D40969,S64912,42471833,C613500,Y009
2,4500091628,10,05.06.2025,,No,A72755,M89400,42513214,C492005,Y016
3,4500045943,10,31.01.2024,23.01.2024,No,J60975,B26557,42473253,C551000,Y016
4,4500090106,10,14.05.2025,,No,M85771,A44167,41019592,C640500,YEM3
...,...,...,...,...,...,...,...,...,...,...
28650,4500054156,30,07.05.2024,25.10.2024,No,A72401,B26557,61001220,C530505,Y016
28651,4500054156,10,07.05.2024,25.10.2024,No,A72401,B26557,61001220,C530505,Y016
28652,4500054156,20,07.05.2024,25.10.2024,No,A72401,B26557,61001220,C530505,Y016
28653,4500089337,10,31.12.2025,,No,M85771,A44167,61001523,C683000,Y004


Creating a target variable - Delayed delivery

In [11]:
# Today's date as a date object
today = pd.to_datetime('today').date()

# Convert date strings to datetime.date
df['Req Delivery Date'] = pd.to_datetime(df['Req Delivery Date'], format='%d.%m.%Y').dt.date
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], format='%d.%m.%Y', errors='coerce').dt.date

# Condition 1: Requested delivery date was more than 2 days ago and delivery has not occurred
condition_1 = (
    (df['Req Delivery Date'] < (today - pd.Timedelta(days=2))) &
    (df['Delivery Date'].isna())
)

# Condition 2: Delivery happened more than 2 days before the requested delivery date
condition_2 = (
    df['Delivery Date'].notna() &
    ((df['Req Delivery Date'] - df['Delivery Date']) > pd.Timedelta(days=2))
)

# Condition 3: Delivery completed = NO
condition_3 = (
    df['Delivery Completed'] == 'NO'
)

# Flag delayed deliveries
df['Delayed delivery'] = np.where(condition_3, 1, np.where(condition_1 | condition_2, 1, 0))
df

Unnamed: 0,Purchasing Document,Item,Req Delivery Date,Delivery Date,Delivery Completed,Requisitioner,Created By,Supplier,Material Group,Terms of Payment,Delayed delivery
0,4500093085,10,2025-06-06,2025-06-06,Yes,A72755,M89400,42513214,C492005,Y016,0
1,4500045759,10,2024-12-31,2024-02-29,No,D40969,S64912,42471833,C613500,Y009,1
2,4500091628,10,2025-06-05,NaT,No,A72755,M89400,42513214,C492005,Y016,1
3,4500045943,10,2024-01-31,2024-01-23,No,J60975,B26557,42473253,C551000,Y016,1
4,4500090106,10,2025-05-14,NaT,No,M85771,A44167,41019592,C640500,YEM3,1
...,...,...,...,...,...,...,...,...,...,...,...
28650,4500054156,30,2024-05-07,2024-10-25,No,A72401,B26557,61001220,C530505,Y016,0
28651,4500054156,10,2024-05-07,2024-10-25,No,A72401,B26557,61001220,C530505,Y016,0
28652,4500054156,20,2024-05-07,2024-10-25,No,A72401,B26557,61001220,C530505,Y016,0
28653,4500089337,10,2025-12-31,NaT,No,M85771,A44167,61001523,C683000,Y004,0
