In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw delivery log and the component master table from raw_data/.
deliv = pd.read_csv("raw_data/delivery_logs.csv")
components = pd.read_csv("raw_data/components.csv")

In [34]:
# Preview the first rows of the delivery log.
# NOTE(review): the saved output of this cell comes from execution count 34, i.e. it was
# run after `pending_delivery` was added further down, so it shows a column that does not
# exist yet when the notebook is run top to bottom. Re-run with Restart & Run All.
deliv.head()

Unnamed: 0,delivery_id,supplier_id,component_id,order_date,expected_delivery_date,actual_delivery_date,quantity_ordered,quantity_received,delivery_status,pending_delivery
0,D0008669,S001,C0212,2024-02-19,2024-03-03,2024-03-04,15347,15347,Delayed,False
1,D0002176,S015,C0052,2024-02-19,2024-03-21,2024-03-21,19149,19149,On-Time,False
2,D0000450,S005,C0018,2024-02-19,2024-03-25,2024-03-25,5525,5525,On-Time,False
3,D0000499,S008,C0019,2024-02-19,2024-04-08,2024-04-09,8919,8919,Delayed,False
4,D0007348,S016,C0170,2024-02-19,2024-03-11,2024-03-12,16632,16632,Delayed,False


In [4]:
# Summarise column dtypes and non-null counts for the delivery log.
deliv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9650 entries, 0 to 9649
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   delivery_id             9650 non-null   object
 1   supplier_id             9650 non-null   object
 2   component_id            9650 non-null   object
 3   order_date              9650 non-null   object
 4   expected_delivery_date  9650 non-null   object
 5   actual_delivery_date    9470 non-null   object
 6   quantity_ordered        9650 non-null   int64 
 7   quantity_received       9650 non-null   int64 
 8   delivery_status         9650 non-null   object
dtypes: int64(2), object(7)
memory usage: 678.6+ KB


In [5]:
# Count missing values in each column of the delivery log.
deliv.isnull().sum()

delivery_id                 0
supplier_id                 0
component_id                0
order_date                  0
expected_delivery_date      0
actual_delivery_date      180
quantity_ordered            0
quantity_received           0
delivery_status             0
dtype: int64

In [6]:
# Count fully duplicated rows (rows that repeat an earlier row in every column).
deliv.duplicated().sum()

np.int64(0)

In [None]:
# Parse the three date columns to datetime. errors='coerce' turns any value that cannot
# be parsed into NaT, so compare null counts before and after and report values that
# were coerced: `pending_delivery` is later derived from NaT in `actual_delivery_date`,
# so a malformed date would otherwise be indistinguishable from an open delivery.
for date_col in ('order_date', 'expected_delivery_date', 'actual_delivery_date'):
    nulls_before = deliv[date_col].isna().sum()
    deliv[date_col] = pd.to_datetime(deliv[date_col], errors='coerce')
    n_coerced = deliv[date_col].isna().sum() - nulls_before
    if n_coerced:
        print(f"Warning: {n_coerced} unparseable value(s) in '{date_col}' were coerced to NaT")

In [8]:
# Earliest and latest order dates covered by the delivery log.
order_dates = deliv['order_date']
min_date, max_date = order_dates.min(), order_dates.max()

print(f"Date range: {min_date} to {max_date}")

Date range: 2024-02-19 00:00:00 to 2025-08-15 00:00:00


In [9]:
# Earliest and latest expected delivery dates.
expected_dates = deliv['expected_delivery_date']
min_date, max_date = expected_dates.min(), expected_dates.max()

print(f"Date range: {min_date} to {max_date}")

Date range: 2024-03-03 00:00:00 to 2025-11-09 00:00:00


In [10]:
# Earliest and latest actual delivery dates (missing dates are skipped by min()/max()).
actual_dates = deliv['actual_delivery_date']
min_date, max_date = actual_dates.min(), actual_dates.max()

print(f"Date range: {min_date} to {max_date}")

Date range: 2024-03-04 00:00:00 to 2025-11-09 00:00:00


In [11]:
# Smallest and largest quantity_ordered, as a quick sanity check on scale.
ordered_qty = deliv['quantity_ordered']
min_quantity, max_quantity = ordered_qty.min(), ordered_qty.max()

print(f"Quantity range: {min_quantity} to {max_quantity}")

Quantity range: 2 to 105550


In [12]:
# Smallest and largest quantity_received, as a quick sanity check on scale.
received_qty = deliv['quantity_received']
min_quantity, max_quantity = received_qty.min(), received_qty.max()

print(f"Quantity range: {min_quantity} to {max_quantity}")

Quantity range: 2 to 105550


In [13]:
# List the distinct delivery_status labels present in the log.
deliv['delivery_status'].unique()

array(['Delayed', 'On-Time', 'Unknown', 'Partial'], dtype=object)

In [14]:
# Add a pending_delivery flag: True where actual_delivery_date is missing.
# NOTE(review): this column is dropped again before the cleaned file is saved,
# so it only serves exploration inside this notebook.
deliv["pending_delivery"] = deliv["actual_delivery_date"].isna()

In [15]:
# Referential-integrity check: component IDs referenced by deliveries that are
# absent from the components table.
known_component_ids = set(components["component_id"])
invalid_ids = {cid for cid in set(deliv["component_id"]) if cid not in known_component_ids}

print(f"Found {len(invalid_ids)} mismatched component IDs:", invalid_ids)


Found 0 mismatched component IDs: set()


In [16]:
# Normalise component names: trim, collapse whitespace, Title Case, then restore the
# casing of unit symbols that Title Case capitalised.
components["component_name"] = (
    components["component_name"]
    .str.strip()                            # remove leading/trailing spaces
    .str.replace(r"\s+", " ", regex=True)   # collapse runs of whitespace to one space
    .str.title()                            # Title Case for readability
    .str.replace("Mm", "mm", regex=False)   # "10Mm" -> "10mm"
    .str.replace("µM", "µm", regex=False)   # kept from the original; harmless if it never matches
    # str.title() upper-cases the micro sign / Greek small mu to GREEK CAPITAL MU (U+039C)
    # and lower-cases the following letter, so a micrometre unit comes out as "\u039cm",
    # which the "µM" pattern above does not match. Map that form back as well.
    .str.replace("\u039cm", "µm", regex=False)
)

# Drop rows whose cleaned name repeats an earlier one and report how many were removed.
# NOTE(review): de-duplicating on the name alone would also drop a row that carries a
# different component_id; confirm that is intended before relying on it.
before = len(components)
components = components.drop_duplicates(subset=["component_name"])
after = len(components)

print(f"Cleaned component_name; removed {before - after} duplicates")


Cleaned component_name; removed 0 duplicates


In [17]:
# Check extreme quantity_ordered values and link them to component categories.
#
# This works on the frames already in memory. Re-reading components.csv into
# `components` here would overwrite the cleaned component table built earlier in the
# notebook, and the raw table would then be what gets written to cleaned_data/.

# Bring in category info; the left join keeps every delivery row.
deliveries = deliv.merge(
    components[["component_id", "category"]],
    on="component_id",
    how="left",
)

report_cols = ["delivery_id", "component_id", "category", "quantity_ordered"]

# Top 10 largest quantity_ordered rows
top_quantities = deliveries.nlargest(10, "quantity_ordered")[report_cols]

print("Top 10 orders by quantity:\n", top_quantities)

# Statistical threshold: rows strictly above the 99th percentile, largest first
q99 = deliveries["quantity_ordered"].quantile(0.99)
possible_outliers = deliveries.loc[
    deliveries["quantity_ordered"] > q99, report_cols
].sort_values(by="quantity_ordered", ascending=False)

print(f"\n99th percentile threshold: {q99}")
print("Orders above 99th percentile:\n", possible_outliers)


Top 10 orders by quantity:
      delivery_id component_id              category  quantity_ordered
8781    D0006679        C0157  Solder Balls/Spheres            105550
2199    D0007416        C0171  Solder Balls/Spheres             80346
4388    D0006812        C0160  Solder Balls/Spheres             79209
4391    D0007431        C0171  Solder Balls/Spheres             77479
3382    D0006756        C0159  Solder Balls/Spheres             77347
4918    D0006986        C0163  Solder Balls/Spheres             73118
9601    D0006634        C0156  Solder Balls/Spheres             72041
8860    D0007458        C0171  Solder Balls/Spheres             71793
9165    D0006631        C0156  Solder Balls/Spheres             71454
2987    D0006653        C0157  Solder Balls/Spheres             71271

99th percentile threshold: 43978.97000000001
Orders above 99th percentile:
      delivery_id component_id              category  quantity_ordered
8781    D0006679        C0157  Solder Balls/Spheres    

In [18]:
# Load the raw forecast table (month, component_id, forecast_units).
forecasts = pd.read_csv("raw_data/forecasts.csv")

In [19]:
# Preview the first forecast rows.
forecasts.head()

Unnamed: 0,month,component_id,forecast_units
0,2024-02,C0001,66
1,2024-02,C0002,91
2,2024-02,C0003,107
3,2024-02,C0004,63
4,2024-02,C0005,123


In [20]:
# Summarise column dtypes and non-null counts for the forecast table.
forecasts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5280 entries, 0 to 5279
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   month           5280 non-null   object
 1   component_id    5280 non-null   object
 2   forecast_units  5280 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 123.9+ KB


In [21]:
# Check the span of forecast months. `month` is still a 'YYYY-MM' string here, so
# min()/max() compare lexicographically; for zero-padded year-month values that is
# the same as chronological order.
min_date = forecasts['month'].min()
max_date = forecasts['month'].max()
# The label previously said "Quantity range" although the values are months.
print(f"Month range: {min_date} to {max_date}")

Quantity range: 2024-02 to 2025-11


In [22]:
# Load the raw daily inventory levels.
invent = pd.read_csv("raw_data/inventory_levels.csv")

In [23]:
# Preview the first inventory rows.
invent.head()

Unnamed: 0,date,component_id,opening_stock,stock_in,stock_out,closing_stock
0,2024-02-19,C0001,162,0,10,152
1,2024-02-20,C0001,152,0,2,150
2,2024-02-21,C0001,150,0,4,146
3,2024-02-22,C0001,146,0,11,135
4,2024-02-23,C0001,135,0,5,130


In [24]:
# Summarise column dtypes and non-null counts for the inventory table.
invent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   date           151200 non-null  object
 1   component_id   151200 non-null  object
 2   opening_stock  151200 non-null  int64 
 3   stock_in       151200 non-null  int64 
 4   stock_out      151200 non-null  int64 
 5   closing_stock  151200 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 6.9+ MB


In [25]:
# Parse the inventory date column to datetime. errors='coerce' turns unparseable values
# into NaT, so report how many were coerced rather than letting them pass silently.
invent_date_nulls_before = invent['date'].isna().sum()
invent['date'] = pd.to_datetime(invent['date'], errors='coerce')
invent_date_coerced = invent['date'].isna().sum() - invent_date_nulls_before
if invent_date_coerced:
    print(f"Warning: {invent_date_coerced} unparseable value(s) in inventory 'date' were coerced to NaT")


In [26]:
# Collect every inventory row whose opening or closing stock is below zero.
stock_cols = ["opening_stock", "closing_stock"]
has_negative = invent[stock_cols].lt(0).any(axis=1)
neg_stock = invent.loc[has_negative].copy()

print(f"Found {len(neg_stock):,} rows with negative stock values")
print(neg_stock.head(10))

Found 139,898 rows with negative stock values
         date component_id  opening_stock  stock_in  stock_out  closing_stock
24 2024-03-14        C0001              0         0          6             -6
25 2024-03-15        C0001             -6         0          6            -12
26 2024-03-16        C0001            -12         0          7            -19
27 2024-03-17        C0001            -19         0          3            -22
28 2024-03-18        C0001            -22         0          5            -27
29 2024-03-19        C0001            -27         0         12            -39
30 2024-03-20        C0001            -39         0          8            -47
31 2024-03-21        C0001            -47         0         13            -60
32 2024-03-22        C0001            -60         0          8            -68
33 2024-03-23        C0001            -68         0          7            -75


In [27]:
# Floor opening_stock and closing_stock at zero, so negative balances become 0.
# NOTE(review): stock_in and stock_out are left unchanged, so on clipped rows
# opening_stock + stock_in - stock_out no longer equals closing_stock. The scan above
# found negative values in most rows of the table; confirm that flattening them is
# the intended treatment.
for stock_col in ("opening_stock", "closing_stock"):
    invent[stock_col] = invent[stock_col].clip(lower=0)

print("Negative stock values replaced with 0")


Negative stock values replaced with 0


In [28]:
# Load the raw production orders.
production = pd.read_csv("raw_data/production_orders.csv")

In [29]:
# Summarise column dtypes and non-null counts for the production orders.
production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   prod_order_id   151200 non-null  object
 1   date            151200 non-null  object
 2   component_id    151200 non-null  object
 3   units_required  151200 non-null  int64 
 4   units_issued    151200 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 5.8+ MB


In [30]:
# Parse the production date column to datetime. errors='coerce' turns unparseable values
# into NaT, so report how many were coerced rather than letting them pass silently.
production_date_nulls_before = production['date'].isna().sum()
production['date'] = pd.to_datetime(production['date'], errors='coerce')
production_date_coerced = production['date'].isna().sum() - production_date_nulls_before
if production_date_coerced:
    print(f"Warning: {production_date_coerced} unparseable value(s) in production 'date' were coerced to NaT")


In [31]:
# Load the raw supplier table.
suppliers = pd.read_csv("raw_data/suppliers.csv")

# Trim surrounding whitespace, then put supplier names in Title Case.
suppliers["supplier_name"] = suppliers["supplier_name"].str.strip().str.title()

# Drop rows whose cleaned name repeats an earlier one (first occurrence is kept)
# and report how many rows were removed.
before = len(suppliers)
suppliers = suppliers.drop_duplicates(subset=["supplier_name"], keep="first")
after = len(suppliers)

print(f"Cleaned supplier_name; removed {before - after} duplicates")


Cleaned supplier_name; removed 0 duplicates


In [36]:
# Remove the helper pending_delivery flag before saving. errors="ignore" makes this a
# no-op when the column is absent, so the cell can be re-run safely.
deliv = deliv.drop(columns=["pending_delivery"], errors="ignore")

In [37]:
# Save cleaned data, one CSV per table, without the pandas index column.
# DataFrame.to_csv does not create missing folders and raises if the target directory
# is absent, so make sure cleaned_data/ exists first.
Path("cleaned_data").mkdir(parents=True, exist_ok=True)

components.to_csv("cleaned_data/components.csv", index=False)
invent.to_csv("cleaned_data/inventory_levels.csv", index=False)
deliv.to_csv("cleaned_data/delivery_logs.csv", index=False)
production.to_csv("cleaned_data/production_orders.csv", index=False)
forecasts.to_csv("cleaned_data/forecasts.csv", index=False)
suppliers.to_csv("cleaned_data/suppliers.csv", index=False)