# Week 2 – Data Collection & Preprocessing in Python

# Step 1: Uploading the 3 CSV files

In [None]:
# Colab-specific helper module; `files` is used again in the export step for files.download().
from google.colab import files
# Upload the source CSVs into the runtime's working directory (the cell output lists the
# three saved files). The return value is kept in `uploaded_files` but is not referenced
# again in the cells shown in this notebook; later cells read the CSVs by filename.
uploaded_files = files.upload()


Saving inventory_table.csv to inventory_table.csv
Saving orders_table.csv to orders_table.csv
Saving suppliers_table.csv to suppliers_table.csv


# Step 2: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np


# Step 3: Load the CSV Files into DataFrames

In [None]:
# Read the three uploaded tables in one pass; the tuple order matches the
# assignment targets on the left (orders, suppliers, inventory).
orders, suppliers, inventory = map(
    pd.read_csv,
    ('orders_table.csv', 'suppliers_table.csv', 'inventory_table.csv'),
)


# Step 4: Explore and Clean the Data

In [None]:
# View the column names exactly as they were read from the CSV files.
print("Orders:", orders.columns.tolist())
print("Suppliers:", suppliers.columns.tolist())
print("Inventory:", inventory.columns.tolist())

# Normalize column names (trim surrounding whitespace and lowercase) so the
# lookups and rename maps below do not depend on header formatting.
orders.columns = orders.columns.str.strip().str.lower()
suppliers.columns = suppliers.columns.str.strip().str.lower()
inventory.columns = inventory.columns.str.strip().str.lower()

# Drop every row that contains a null in any column.
orders.dropna(inplace=True)
suppliers.dropna(inplace=True)
inventory.dropna(inplace=True)

# Date conversion: unparseable values become NaT instead of raising.
orders['order_date'] = pd.to_datetime(orders['order_date'], errors='coerce')
orders['delivery_date'] = pd.to_datetime(orders['delivery_date'], errors='coerce')

# Remove rows whose dates could not be parsed.
orders.dropna(subset=['order_date', 'delivery_date'], inplace=True)


def _rename_known_columns(frame, aliases):
    """Rename, in place, the columns of ``frame`` that appear in ``aliases``.

    ``aliases`` maps a source column name to its display name. Several source
    names may share one display name (e.g. both ``quantity_in_stock`` and
    ``quantity`` map to ``StockQty``); only the first one present in ``frame``
    is renamed, so the result never contains duplicate column names. Source
    names that are absent are skipped.
    """
    taken = set(frame.columns)
    selected = {}
    for source_name, target_name in aliases.items():
        if source_name in frame.columns and target_name not in taken:
            selected[source_name] = target_name
            taken.add(target_name)
    frame.rename(columns=selected, inplace=True)


# Rename for clarity.
# The maps list both the originally expected names (product_id, contact_info,
# quantity_in_stock) and the names actually present in the uploaded files
# (inventory_id, email/phone, quantity). With only the former, the renames
# silently did nothing and columns such as 'StockQty' were never created.
# NOTE(review): 'inventory_id' is treated as the product key in both tables,
# matching the original intent of renaming 'product_id' -> 'ProductID'; confirm.
_rename_known_columns(orders, {
    'order_id': 'OrderID',
    'product_id': 'ProductID',
    'inventory_id': 'ProductID',
    'supplier_id': 'SupplierID',
    'quantity': 'Quantity',
    'order_date': 'OrderDate',
    'delivery_date': 'DeliveryDate',
    'status': 'Status'
})

_rename_known_columns(suppliers, {
    'supplier_id': 'SupplierID',
    'name': 'SupplierName',
    'contact_info': 'Email',
    'email': 'Email',
    'phone': 'Phone',
    'location': 'Region'
})

_rename_known_columns(inventory, {
    'product_id': 'ProductID',
    'inventory_id': 'ProductID',
    'product_name': 'ItemName',
    'quantity_in_stock': 'StockQty',
    'quantity': 'StockQty',
    'reorder_level': 'ReorderThreshold',
    'supplier_id': 'SupplierID'
})


Orders: ['order_id', 'supplier_id', 'inventory_id', 'order_date', 'delivery_date', 'status', 'quantity']
Suppliers: ['supplier_id', 'name', 'email', 'phone', 'location']
Inventory: ['inventory_id', 'product_name', 'quantity', 'reorder_level', 'supplier_id']


# Step 5: Perform Data Calculations with NumPy

In [None]:
# Order delays & flags
# An order counts as delayed when delivery took more than this many days.
DELAY_THRESHOLD_DAYS = 1

# Days between order and delivery; negative spans (delivery before order) are floored at 0.
orders['DaysDelayed'] = (orders['DeliveryDate'] - orders['OrderDate']).dt.days
orders['DaysDelayed'] = np.where(orders['DaysDelayed'] < 0, 0, orders['DaysDelayed'])
orders['Delayed'] = np.where(orders['DaysDelayed'] > DELAY_THRESHOLD_DAYS, 1, 0)

# Delay summary
total_orders = len(orders)
delayed = int(np.sum(orders['Delayed']))
# Guard the empty case: cleaning may drop every row, and 0/0 would give NaN.
delay_percent = (delayed / total_orders) * 100 if total_orders else 0.0

print(f"\nTotal Orders: {total_orders}")
print(f"Delayed Orders: {delayed}")
print(f"Delay Rate: {delay_percent:.1f}%")

# Inventory stock statistics
# Accept the renamed column as well as the raw names, so the analysis is not
# silently skipped when the rename step did not produce 'StockQty'.
stock_column = next(
    (name for name in ('StockQty', 'quantity_in_stock', 'quantity')
     if name in inventory.columns),
    None,
)

if stock_column is None:
    print("\nInventory Stock Analysis skipped: no stock quantity column found.")
elif inventory.empty:
    # np.min / np.max raise on an empty array, so report instead of crashing.
    print("\nInventory Stock Analysis skipped: inventory table has no rows.")
else:
    stock_values = inventory[stock_column].astype(int).to_numpy()

    avg_stock = np.mean(stock_values)
    low_stock = np.min(stock_values)
    high_stock = np.max(stock_values)
    # np.std uses ddof=0 by default, i.e. the population standard deviation.
    std_dev = np.std(stock_values)

    print("\nInventory Stock Analysis:")
    print(f" - Average: {avg_stock:.2f}")
    print(f" - Minimum: {low_stock}")
    print(f" - Maximum: {high_stock}")
    print(f" - Std Dev: {std_dev:.2f}")





Total Orders: 3
Delayed Orders: 3
Delay Rate: 100.0%


# Step 6: Export Cleaned Files and Download

In [None]:
# Output filename -> cleaned frame, in the order the files are written and downloaded.
cleaned_exports = {
    "cleaned_orders.csv": orders,
    "cleaned_inventory.csv": inventory,
    "cleaned_suppliers.csv": suppliers,
}

# Write all three CSVs first (without the index column) ...
for export_name, export_frame in cleaned_exports.items():
    export_frame.to_csv(export_name, index=False)

# ... then trigger a browser download for each written file.
for export_name in cleaned_exports:
    files.download(export_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 7:Display Cleaned Output

In [None]:
# Reload the exported CSVs to confirm what was actually written to disk.
df_orders, df_inventory, df_suppliers = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_orders.csv", "cleaned_inventory.csv", "cleaned_suppliers.csv")
)

# Preview the first rows of each cleaned table under its heading.
previews = (
    ("\nOrders Data:", df_orders),
    ("\nInventory Data:", df_inventory),
    ("\nSuppliers Data:", df_suppliers),
)
for heading, preview_frame in previews:
    print(heading)
    display(preview_frame.head())



Orders Data:


Unnamed: 0,OrderID,SupplierID,inventory_id,OrderDate,DeliveryDate,Status,Quantity,DaysDelayed,Delayed
0,1,1,1,2025-07-01,2025-07-03,Delivered,30,2,1
1,2,2,2,2025-07-05,2025-07-23,Delivered,10,18,1
2,3,2,3,2025-07-06,2025-07-09,Delivered,20,3,1



Inventory Data:


Unnamed: 0,inventory_id,ItemName,quantity,ReorderThreshold,supplier_id
0,1,Laptop Battery,50,20,1
1,2,Keyboard,15,10,2
2,3,HDMI Cable,5,10,2
3,4,Monitor,25,15,3
4,5,Wireless Mouse,8,10,1



Suppliers Data:


Unnamed: 0,SupplierID,SupplierName,email,phone,Region
0,1,Alpha Supplies,alpha@supplies.com,9876543210,Chennai
1,2,Beta Traders,beta@traders.com,9123456780,Bangalore
2,3,Gamma Distributors,gamma@distributors.com,9000001111,Delhi
