# Load the Dataset

In [1]:
# Import necessary libraries.
import pandas as pd

In [2]:
# load the data from the '../data/raw' folder.
file_path = r"../data/raw/smart_logistics_dataset.csv"
try:
    df = pd.read_csv(file_path)
    print("The dataset has been successfully loaded.")
except FileNotFoundError:
    print(f"File not found at {file_path}")

The dataset has been successfully loaded.


The descriptions for the columns are straight from the kaggle dataset description.
Key Features:  
- Timestamp: Date and time when the data was recorded, representing logistics activity.  
- Asset_ID: Unique identifier for the logistical assets (e.g., trucks).  
- Latitude & Longitude: Geographical coordinates of the asset for tracking and monitoring.  
- Inventory_Level: Current level of inventory associated with the asset or shipment.  
- Shipment_Status: Status of the shipment (e.g., In Transit, Delivered, Delayed).  
- Temperature: Temperature recorded at the time of the shipment or transportation.  
- Humidity: Humidity level at the time of recording.  
- Traffic_Status: Current traffic condition (e.g., Clear, Heavy, Detour).  
- Waiting_Time: Time spent waiting during the logistics process (in minutes).  
- User_Transaction_Amount: Monetary amount associated with user transactions.  
- User_Purchase_Frequency: Frequency of purchases made by the user.  
- Logistics_Delay_Reason: Reason for any delays in the logistics process (e.g., Weather, Mechanical Failure).  
- Asset_Utilization: Percentage of asset utilization, indicating how effectively assets are being used.  
- Demand_Forecast: Predicted demand for the logistics services in the coming period.  
- Logistics_Delay (Target): Binary variable indicating whether a logistics delay occurred (1 for delay, 0 for no delay).  

# Initial Understanding of the Data

In [3]:
df.head()

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Logistics_Delay_Reason,Asset_Utilization,Demand_Forecast,Logistics_Delay
0,2024-03-20 00:11:14,Truck_7,-65.7383,11.2497,390,Delayed,27.0,67.8,Detour,38,320,4,,60.1,285,1
1,2024-10-30 07:53:51,Truck_6,22.2748,-131.7086,491,In Transit,22.5,54.3,Heavy,16,439,7,Weather,80.9,174,1
2,2024-07-29 18:42:48,Truck_10,54.9232,79.5455,190,In Transit,25.2,62.2,Detour,34,355,3,,99.2,260,0
3,2024-10-28 00:50:54,Truck_9,42.39,-1.4788,330,Delivered,25.4,52.3,Heavy,37,227,5,Traffic,97.4,160,1
4,2024-09-27 15:52:58,Truck_7,-65.8477,47.9468,480,Delayed,20.5,57.2,Clear,56,197,6,,71.6,270,1


In [4]:
df.tail()

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Logistics_Delay_Reason,Asset_Utilization,Demand_Forecast,Logistics_Delay
995,2024-07-22 16:30:00,Truck_6,89.8701,73.6867,264,Delivered,26.9,70.0,Heavy,32,188,1,Weather,79.2,213,1
996,2024-04-30 04:58:58,Truck_5,-10.4792,-177.1239,479,Delivered,23.7,77.9,Detour,56,276,7,Weather,83.7,272,0
997,2024-10-27 22:09:13,Truck_2,-71.0609,75.3714,347,In Transit,21.0,63.1,Detour,35,382,5,,74.8,275,0
998,2024-04-18 23:06:56,Truck_2,-76.791,18.3631,276,Delivered,18.0,64.3,Heavy,10,361,5,,88.6,242,1
999,2024-09-18 19:39:24,Truck_8,59.8356,-114.4198,157,Delayed,18.7,73.6,Clear,55,323,9,Mechanical Failure,81.3,141,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Timestamp                1000 non-null   object 
 1   Asset_ID                 1000 non-null   object 
 2   Latitude                 1000 non-null   float64
 3   Longitude                1000 non-null   float64
 4   Inventory_Level          1000 non-null   int64  
 5   Shipment_Status          1000 non-null   object 
 6   Temperature              1000 non-null   float64
 7   Humidity                 1000 non-null   float64
 8   Traffic_Status           1000 non-null   object 
 9   Waiting_Time             1000 non-null   int64  
 10  User_Transaction_Amount  1000 non-null   int64  
 11  User_Purchase_Frequency  1000 non-null   int64  
 12  Logistics_Delay_Reason   737 non-null    object 
 13  Asset_Utilization        1000 non-null   float64
 14  Demand_Forecast          

In [6]:
df.dtypes.value_counts()

int64      6
object     5
float64    5
Name: count, dtype: int64

there are 16 columns in the dataset, in that there are 6 int64 datatype, 5 object datatype and 5 float64 datatype. the column Logistic_Delay_Reason has some missing values. 

In [7]:
df.shape

(1000, 16)

the dataset has 100 rows and 16 columns

In [8]:
df.columns

Index(['Timestamp', 'Asset_ID', 'Latitude', 'Longitude', 'Inventory_Level',
       'Shipment_Status', 'Temperature', 'Humidity', 'Traffic_Status',
       'Waiting_Time', 'User_Transaction_Amount', 'User_Purchase_Frequency',
       'Logistics_Delay_Reason', 'Asset_Utilization', 'Demand_Forecast',
       'Logistics_Delay'],
      dtype='object')

In [9]:
df.describe()

Unnamed: 0,Latitude,Longitude,Inventory_Level,Temperature,Humidity,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Asset_Utilization,Demand_Forecast,Logistics_Delay
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-1.360093,0.837049,297.915,23.8939,65.0422,35.062,299.055,5.513,79.5991,199.284,0.566
std,51.997183,104.843618,113.554773,3.322178,8.753765,14.477768,117.787792,2.935379,11.631153,59.920847,0.495873
min,-89.7915,-179.8202,100.0,18.0,50.0,10.0,100.0,1.0,60.0,100.0,0.0
25%,-46.167975,-88.448075,201.0,21.2,57.2,23.0,191.75,3.0,69.475,144.0,0.0
50%,-4.50315,0.6783,299.0,23.8,65.2,35.0,301.5,6.0,79.25,202.0,1.0
75%,44.5028,88.15645,399.0,26.6,72.4,49.0,405.0,8.0,89.425,251.25,1.0
max,89.8701,179.9237,500.0,30.0,80.0,60.0,500.0,10.0,100.0,300.0,1.0


In [10]:
df.isnull().sum()

Timestamp                    0
Asset_ID                     0
Latitude                     0
Longitude                    0
Inventory_Level              0
Shipment_Status              0
Temperature                  0
Humidity                     0
Traffic_Status               0
Waiting_Time                 0
User_Transaction_Amount      0
User_Purchase_Frequency      0
Logistics_Delay_Reason     263
Asset_Utilization            0
Demand_Forecast              0
Logistics_Delay              0
dtype: int64

In [11]:
df.isnull().sum().sum()

np.int64(263)

so there is a total of 263 missing values in the 'Logistic_Delay_Reason' coloumn.

In [12]:
df.duplicated().sum()

np.int64(0)

there are no duplicate values in the dataset.

In [13]:
df['Asset_ID'].value_counts()

Asset_ID
Truck_8     109
Truck_4     107
Truck_2     105
Truck_10    105
Truck_6     103
Truck_7     102
Truck_9      94
Truck_3      93
Truck_5      93
Truck_1      89
Name: count, dtype: int64

In [14]:
df['Asset_ID'].nunique()

10

In [15]:
df['Logistics_Delay_Reason'].value_counts()

Logistics_Delay_Reason
Weather               267
Traffic               236
Mechanical Failure    234
Name: count, dtype: int64

In [16]:
df['Traffic_Status'].values

array(['Detour', 'Heavy', 'Detour', 'Heavy', 'Clear', 'Clear', 'Clear',
       'Detour', 'Clear', 'Clear', 'Clear', 'Detour', 'Detour', 'Detour',
       'Heavy', 'Clear', 'Clear', 'Detour', 'Detour', 'Clear', 'Detour',
       'Heavy', 'Detour', 'Clear', 'Detour', 'Detour', 'Heavy', 'Heavy',
       'Heavy', 'Heavy', 'Clear', 'Heavy', 'Detour', 'Heavy', 'Detour',
       'Detour', 'Heavy', 'Heavy', 'Detour', 'Clear', 'Clear', 'Detour',
       'Clear', 'Clear', 'Clear', 'Heavy', 'Clear', 'Heavy', 'Clear',
       'Heavy', 'Heavy', 'Heavy', 'Clear', 'Heavy', 'Clear', 'Detour',
       'Detour', 'Heavy', 'Clear', 'Clear', 'Heavy', 'Heavy', 'Detour',
       'Heavy', 'Detour', 'Detour', 'Clear', 'Heavy', 'Detour', 'Heavy',
       'Clear', 'Detour', 'Detour', 'Clear', 'Heavy', 'Clear', 'Clear',
       'Clear', 'Heavy', 'Clear', 'Clear', 'Clear', 'Detour', 'Heavy',
       'Clear', 'Heavy', 'Heavy', 'Clear', 'Clear', 'Detour', 'Clear',
       'Heavy', 'Heavy', 'Clear', 'Heavy', 'Detour', 'Detour', 

In [19]:
df['Logistics_Delay_Reason'].value_counts()

Logistics_Delay_Reason
Weather               267
Traffic               236
Mechanical Failure    234
Name: count, dtype: int64