## STEP 1: Data Preprocessing
- Data is cleaned and preprocessed to handle missing values.
- Categorical variables such as Traffic_Status and Logistics_Delay_Reason are encoded, and features are properly scaled for ML models.
- 

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [18]:
# Load the smart-logistics dataset from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer="smart_logistics_dataset.csv")

# Preview the first five rows (the default for head) as the cell output.
df.head(n=5)

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Logistics_Delay_Reason,Asset_Utilization,Demand_Forecast,Logistics_Delay
0,2024-03-20 00:11:14,Truck_7,-65.7383,11.2497,390,Delayed,27.0,67.8,Detour,38,320,4,,60.1,285,1
1,2024-10-30 07:53:51,Truck_6,22.2748,-131.7086,491,In Transit,22.5,54.3,Heavy,16,439,7,Weather,80.9,174,1
2,2024-07-29 18:42:48,Truck_10,54.9232,79.5455,190,In Transit,25.2,62.2,Detour,34,355,3,,99.2,260,0
3,2024-10-28 00:50:54,Truck_9,42.39,-1.4788,330,Delivered,25.4,52.3,Heavy,37,227,5,Traffic,97.4,160,1
4,2024-09-27 15:52:58,Truck_7,-65.8477,47.9468,480,Delayed,20.5,57.2,Clear,56,197,6,,71.6,270,1


In [19]:
# Print the frame's structure: column names, non-null counts and dtypes.
df.info()
# Summary statistics of the numeric columns, shown as the cell's output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Timestamp                1000 non-null   object 
 1   Asset_ID                 1000 non-null   object 
 2   Latitude                 1000 non-null   float64
 3   Longitude                1000 non-null   float64
 4   Inventory_Level          1000 non-null   int64  
 5   Shipment_Status          1000 non-null   object 
 6   Temperature              1000 non-null   float64
 7   Humidity                 1000 non-null   float64
 8   Traffic_Status           1000 non-null   object 
 9   Waiting_Time             1000 non-null   int64  
 10  User_Transaction_Amount  1000 non-null   int64  
 11  User_Purchase_Frequency  1000 non-null   int64  
 12  Logistics_Delay_Reason   737 non-null    object 
 13  Asset_Utilization        1000 non-null   float64
 14  Demand_Forecast          

Unnamed: 0,Latitude,Longitude,Inventory_Level,Temperature,Humidity,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Asset_Utilization,Demand_Forecast,Logistics_Delay
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-1.360093,0.837049,297.915,23.8939,65.0422,35.062,299.055,5.513,79.5991,199.284,0.566
std,51.997183,104.843618,113.554773,3.322178,8.753765,14.477768,117.787792,2.935379,11.631153,59.920847,0.495873
min,-89.7915,-179.8202,100.0,18.0,50.0,10.0,100.0,1.0,60.0,100.0,0.0
25%,-46.167975,-88.448075,201.0,21.2,57.2,23.0,191.75,3.0,69.475,144.0,0.0
50%,-4.50315,0.6783,299.0,23.8,65.2,35.0,301.5,6.0,79.25,202.0,1.0
75%,44.5028,88.15645,399.0,26.6,72.4,49.0,405.0,8.0,89.425,251.25,1.0
max,89.8701,179.9237,500.0,30.0,80.0,60.0,500.0,10.0,100.0,300.0,1.0


Preparación de la data
- Realizaremos conversión de tipos para algunas columnas que lo requieran.
- Crearemos nuevas columnas de información clave.
- Consideraremos que en la columna "Logistics_Delay_Reason" un valor faltante significa que no hubo alguna razón de demora.


In [20]:
# A missing Logistics_Delay_Reason is treated as "there was no delay reason",
# so the gap is filled with an explicit category instead of dropping rows.
df['Logistics_Delay_Reason'] = df['Logistics_Delay_Reason'].fillna('No Delay Reason')

# Derive time-based features from Timestamp, then drop the raw column.
# The guard keeps this cell idempotent: once Timestamp has been removed,
# re-running the cell is a no-op instead of raising KeyError.
if 'Timestamp' in df.columns:
    timestamp = pd.to_datetime(df['Timestamp'])
    df = df.assign(
        hour=timestamp.dt.hour,
        day_of_week=timestamp.dt.dayofweek,  # Monday=0 ... Sunday=6
    ).drop(columns='Timestamp')

In [16]:
# 3. One-hot encode the categorical variables.
categorical_features = [
    'Asset_ID',
    'Shipment_Status',
    'Traffic_Status',
    'Logistics_Delay_Reason',
]

# pd.get_dummies replaces each listed column with indicator columns;
# drop_first=True omits the first level of every variable.
df = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)
df.head()

Unnamed: 0,Latitude,Longitude,Inventory_Level,Temperature,Humidity,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Asset_Utilization,Demand_Forecast,...,Asset_ID_Truck_6,Asset_ID_Truck_7,Asset_ID_Truck_8,Asset_ID_Truck_9,Shipment_Status_Delivered,Shipment_Status_In Transit,Traffic_Status_Detour,Traffic_Status_Heavy,Logistics_Delay_Reason_Traffic,Logistics_Delay_Reason_Weather
0,-65.7383,11.2497,390,27.0,67.8,38,320,4,60.1,285,...,False,True,False,False,False,False,True,False,False,False
1,22.2748,-131.7086,491,22.5,54.3,16,439,7,80.9,174,...,True,False,False,False,False,True,False,True,False,True
2,54.9232,79.5455,190,25.2,62.2,34,355,3,99.2,260,...,False,False,False,False,False,True,True,False,False,False
3,42.39,-1.4788,330,25.4,52.3,37,227,5,97.4,160,...,False,False,False,True,True,False,False,True,True,False
4,-65.8477,47.9468,480,20.5,57.2,56,197,6,71.6,270,...,False,True,False,False,False,False,False,False,False,False


In [23]:
# Re-inspect column names, non-null counts and dtypes of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Asset_ID                 1000 non-null   object 
 1   Latitude                 1000 non-null   float64
 2   Longitude                1000 non-null   float64
 3   Inventory_Level          1000 non-null   int64  
 4   Shipment_Status          1000 non-null   object 
 5   Temperature              1000 non-null   float64
 6   Humidity                 1000 non-null   float64
 7   Traffic_Status           1000 non-null   object 
 8   Waiting_Time             1000 non-null   int64  
 9   User_Transaction_Amount  1000 non-null   int64  
 10  User_Purchase_Frequency  1000 non-null   int64  
 11  Logistics_Delay_Reason   1000 non-null   object 
 12  Asset_Utilization        1000 non-null   float64
 13  Demand_Forecast          1000 non-null   int64  
 14  Logistics_Delay          

In [None]:
# TODO: normalize columns such as ... (the list of columns is not specified yet)

In [None]:
# Name of the label column. The previous version of this cell used
# 'Concretecompressivestrength', which is not a column of this dataset and
# made the cell fail with KeyError.
# NOTE(review): 'Logistics_Delay' (a 0/1 flag) is assumed to be the intended
# target -- confirm, and change TARGET_COLUMN if another column is meant.
TARGET_COLUMN = 'Logistics_Delay'

# Separate the label (y) from the feature matrix (x).
y = df[TARGET_COLUMN]
x = df.drop(columns=TARGET_COLUMN)

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Train size :", x_train.shape, y_train.shape)
print("Test size  :", x_test.shape, y_test.shape)

Analizaremos los valores de columnas clave para observar si los podemos convertir a números, los cuales son más fácilmente procesables.

Asset_ID
Truck_8     109
Truck_4     107
Truck_2     105
Truck_10    105
Truck_6     103
Truck_7     102
Truck_9      94
Truck_5      93
Truck_3      93
Truck_1      89
Name: count, dtype: int64