In [64]:
import pandas as pd
import numpy as np

# 1. Load the raw logistics CSV.
# The path is spelled with forward slashes so the string literal contains no
# backslash escape sequences.
# NOTE(review): this is an absolute, machine-specific location; the cell only
# runs where that exact file exists.
df = pd.read_csv(
    "C:/Users/sathr/OneDrive/Desktop/AIML internship/smart_logistics_dataset.csv"
)

# 2. Data cleaning and preprocessing, done as a single column-replacement step:
#    - 'Timestamp' is parsed into a datetime column;
#    - missing 'Logistics_Delay_Reason' entries get an explicit placeholder
#      category instead of staying NaN.
df = df.assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
    Logistics_Delay_Reason=lambda frame: frame['Logistics_Delay_Reason'].fillna('No Delay Reason'),
)

# 3. Feature Engineering
# Calendar features derived from the parsed 'Timestamp' column via the .dt accessor.
df['hour'] = df['Timestamp'].dt.hour
df['day_of_week'] = df['Timestamp'].dt.dayofweek
df['day_of_year'] = df['Timestamp'].dt.dayofyear

# Thermal abuse score: how far each reading lies above the temperature
# threshold, floored at zero so readings at or below the threshold contribute
# nothing. Computed with vectorised column arithmetic rather than a Python
# call per row.
temp_threshold = 5
df['thermal_abuse_score'] = (
    (df['Temperature'] - temp_threshold)
    .clip(lower=0)
    # A missing temperature would otherwise stay NaN after clip(); score it as
    # 0 so the result matches the earlier max(0, ...) formulation, which also
    # yielded 0 for NaN input.
    .fillna(0)
)

# Order rows by asset and then by time so the running total below accumulates
# chronologically within each asset. Reassigning avoids mutating the frame in
# place; the original index labels are kept.
df = df.sort_values(by=['Asset_ID', 'Timestamp'])

# Running total of the abuse score within each Asset_ID group.
df['cumulative_thermal_abuse'] = df.groupby('Asset_ID')['thermal_abuse_score'].cumsum()

# Binary flag: 1 on every row whose running total is strictly above the
# threshold, 0 otherwise.
# NOTE(review): the running total is never reset within an asset, so once an
# asset crosses the threshold every later row for it is flagged too - confirm
# this is the intended definition.
spoilage_threshold = 800
df['spoilage_flag'] = df['cumulative_thermal_abuse'].gt(spoilage_threshold).astype(int)

# 4. Final Feature Selection
# Columns carried into the exported dataset, grouped by where they come from.
columns_to_keep = (
    # identifiers and sensor readings
    ['Timestamp', 'Asset_ID', 'Temperature', 'Humidity']
    # status / categorical fields
    + ['Shipment_Status', 'Traffic_Status', 'Logistics_Delay_Reason']
    # calendar features
    + ['hour', 'day_of_week', 'day_of_year']
    # thermal-abuse features and the resulting label
    + ['thermal_abuse_score', 'cumulative_thermal_abuse', 'spoilage_flag']
)

# Select exactly those columns, in that order; a missing column raises KeyError.
df_final = df.loc[:, columns_to_keep]

# 5. Write the selected columns to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
df_final.to_csv('smart_logistics_updated.csv', index=False)

# Two-line completion message.
print(
    "Data preprocessing and feature engineering is completed.",
    "The final cleaned and prepared dataset has been saved to 'smart_logistics_updated.csv'.",
    sep="\n",
)

Data preprocessing and feature engineering is completed.
The final cleaned and prepared dataset has been saved to 'smart_logistics_updated.csv'.


In [66]:
# Add up the 'spoilage_flag' column to get the figure reported below.
# NOTE(review): the sum runs over every row of df, so if the flag is a per-row
# 0/1 marker this counts flagged rows rather than distinct assets or shipments -
# confirm that "items" in the message is the intended unit.
total_spoiled = df.loc[:, 'spoilage_flag'].sum()

print(f"Total number of spoiled items: {total_spoiled}")

Total number of spoiled items: 581
