In [2]:
import os

import joblib # For saving the model
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression # Our first model choice
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Read the processed logistics dataset and convert the Timestamp column to
# datetime values in one expression. Modeling relies on the already-extracted
# date/time features, so the conversion is only kept for convenience.
df = (
    pd.read_csv(r'../data/processed/smart_logistics_processed.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
)

In [2]:
# Display every column of the loaded frame before choosing features and target.
df.columns

Index(['Timestamp', 'Asset_ID', 'Latitude', 'Longitude', 'Inventory_Level',
       'Shipment_Status', 'Temperature', 'Humidity', 'Traffic_Status',
       'Waiting_Time', 'User_Transaction_Amount', 'User_Purchase_Frequency',
       'Logistics_Delay_Reason', 'Asset_Utilization', 'Demand_Forecast',
       'Logistics_Delay', 'Year', 'Month', 'Day_of_Week', 'Hour_of_Day',
       'Day_Name', 'Is_Weekend', 'Time_of_Day'],
      dtype='object')

In [3]:
# Target: the Logistics_Delay column.
y = df['Logistics_Delay']

# Features: every column except the target itself and the columns considered
# irrelevant or target-leaking for prediction.
# NOTE(review): Shipment_Status stays in X and can take the value 'Delayed' —
# confirm it does not leak the target.
X = df.drop(
    columns=[
        'Timestamp',
        'Logistics_Delay',
        'Logistics_Delay_Reason',
        'Asset_ID',
        'Waiting_Time',
    ]
)

In [4]:
# Partition the feature columns by dtype: 64-bit integer/float columns are
# treated as numerical, object/category columns as categorical.
numerical_features = X.select_dtypes(
    include=['int64', 'float64']
).columns
categorical_features = X.select_dtypes(
    include=['object', 'category']
).columns

In [5]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both subsets keep the same class
# proportions, which matters for imbalanced targets (if delays are rare).
# Sanity check: compare y.value_counts(), y_train.value_counts() and
# y_test.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Create preprocessors for numerical and categorical features:
# standardize the numerical columns and one-hot encode the categorical ones.
numerical_transformer = StandardScaler()
# handle_unknown='ignore' for unseen categories in test set
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer.
# This definition must be live code: the model pipeline further down the
# notebook references `preprocessor`, so leaving it commented out makes a
# fresh-kernel "Restart & Run All" fail with a NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

# Display the full frame (pandas truncates the rendering to head/tail rows).
df

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,...,Asset_Utilization,Demand_Forecast,Logistics_Delay,Year,Month,Day_of_Week,Hour_of_Day,Day_Name,Is_Weekend,Time_of_Day
0,2024-03-20 00:11:14,Truck_7,-65.7383,11.2497,390,Delayed,27.0,67.8,Detour,38,...,60.1,285,1,2024,3,2,0,Wednesday,0,Evening
1,2024-10-30 07:53:51,Truck_6,22.2748,-131.7086,491,In Transit,22.5,54.3,Heavy,16,...,80.9,174,1,2024,10,2,7,Wednesday,0,Morning
2,2024-07-29 18:42:48,Truck_10,54.9232,79.5455,190,In Transit,25.2,62.2,Detour,34,...,99.2,260,0,2024,7,0,18,Monday,0,Evening
3,2024-10-28 00:50:54,Truck_9,42.3900,-1.4788,330,Delivered,25.4,52.3,Heavy,37,...,97.4,160,1,2024,10,0,0,Monday,0,Evening
4,2024-09-27 15:52:58,Truck_7,-65.8477,47.9468,480,Delayed,20.5,57.2,Clear,56,...,71.6,270,1,2024,9,4,15,Friday,0,Afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2024-07-22 16:30:00,Truck_6,89.8701,73.6867,264,Delivered,26.9,70.0,Heavy,32,...,79.2,213,1,2024,7,0,16,Monday,0,Evening
996,2024-04-30 04:58:58,Truck_5,-10.4792,-177.1239,479,Delivered,23.7,77.9,Detour,56,...,83.7,272,0,2024,4,1,4,Tuesday,0,Evening
997,2024-10-27 22:09:13,Truck_2,-71.0609,75.3714,347,In Transit,21.0,63.1,Detour,35,...,74.8,275,0,2024,10,6,22,Sunday,1,Evening
998,2024-04-18 23:06:56,Truck_2,-76.7910,18.3631,276,Delivered,18.0,64.3,Heavy,10,...,88.6,242,1,2024,4,3,23,Thursday,0,Evening


In [None]:
# model regression
# TODO: this cell held an unfinished attribute access on `df` (a bare trailing
# dot), which is a SyntaxError and stops "Restart & Run All". It was never
# executed; fill in the intended exploration step or delete the cell.

In [7]:
# Chain the preprocessing step and the classifier into a single estimator so
# both are fitted together on the training data.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # liblinear works well for small datasets
        ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
    ]
)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [9]:
# Persist the fitted pipeline to disk.
# The path is defined once so the dump target and the confirmation message
# cannot drift apart.
model_path = r'../models/best_logistics_delay_model.pkl'
# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_pipeline, model_path)
print(f"Model saved successfully to {model_path}")

Model saved successfully to ../models/best_logistics_delay_model.pkl


In [4]:
# Load the AI4I 2020 CSV (the file behind the 'Machine failure' table shown below).
# The location can be overridden with the AI4I_DATA_PATH environment variable;
# the default is the original machine-specific path, so existing behavior is
# unchanged when the variable is not set.
# NOTE(review): this rebinds `df`, replacing the logistics frame loaded at the
# top of the notebook — confirm this cell belongs here, or give the frame its
# own name.
df = pd.read_csv(
    os.environ.get(
        'AI4I_DATA_PATH',
        r'E:\Machine-Failure-Prediction-using-AI4I-2020-Data\data\ai4i2020.csv',
    )
)
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0
