In [None]:
# Install required libraries (Colab)
!pip install pandas numpy pyspark requests pymongo mysql-connector-python -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.9/33.9 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import requests
from datetime import datetime

In [None]:
#  Import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
# Creating Spark Session
spark = SparkSession.builder.appName("DevOps").getOrCreate()
spark

In [None]:
# Uploading required files
from google.colab import files
uploaded = files.upload()

Saving inventory.csv to inventory.csv
Saving orders.csv to orders.csv
Saving suppliers.csv to suppliers.csv


In [None]:
# Step 2: Load Uploaded Data
orders_df = pd.read_csv("orders.csv")
inventory_df = pd.read_csv("inventory.csv")
suppliers_df = pd.read_csv("suppliers.csv")

print("Data loaded successfully!\n")
print("Orders sample:\n", orders_df.head())
print("\nInventory sample:\n", inventory_df.head())
print("\nSuppliers sample:\n", suppliers_df.head())

Data loaded successfully!

Orders sample:
    order_id  product_id  supplier_id  quantity  order_Date delivery_date  \
0         1           3            1        40  2025-07-03    2025-07-07   
1         2           7            4        15  2025-07-06    2025-07-11   
2         3           2            2        25  2025-07-01    2025-07-06   
3         4          10            5        10  2025-07-10    2025-07-14   
4         5           1            3        50  2025-07-02    2025-07-07   

      status  
0    Shipped  
1  Delivered  
2    Pending  
3  Cancelled  
4  Delivered  

Inventory sample:
    product_id product_name  quantity_in_stock  reorder_level
0           1       Laptop                 50             10
1           2        Mouse                200             30
2           3     Keyboard                150             25
3           4      Monitor                 80             15
4           5      Printer                 60             10

Suppliers sample:
    s

In [9]:
# Step 3: Data Cleaning
# Convert dates to datetime
orders_df['order_Date'] = pd.to_datetime(orders_df['order_Date'])
orders_df['delivery_date'] = pd.to_datetime(orders_df['delivery_date'])

# Drop missing values (if any)
orders_df.dropna(inplace=True)
inventory_df.dropna(inplace=True)
suppliers_df.dropna(inplace=True)

In [14]:
# Step 4: Calculations / Analysis
# Calculate delay in days
orders_df['delay_days'] = (pd.Timestamp.today() - orders_df['delivery_date']).dt.days
orders_df['delayed'] = np.where(orders_df['delay_days'] > 0, 1, 0)

# Top delayed customers
top_delayed_orders = orders_df.groupby('order_id')['delayed'].sum().sort_values(ascending=False)
print("Top Delayed Orders:\n", top_delayed_orders)

# Rename columns to match notebook
inventory_df.rename(columns={
    'quantity_in_stock': 'stock_qty'
}, inplace=True)

# Inventory stock check
low_stock = inventory_df[inventory_df['stock_qty'] < 10]  # Example threshold
print("\nLow Stock Items:\n", low_stock)

Top Delayed Orders:
 order_id
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
Name: delayed, dtype: int64

Low Stock Items:
 Empty DataFrame
Columns: [product_id, product_name, stock_qty, reorder_level]
Index: []


In [15]:
# Step 5: Aggregations / Grouping
# Delays by supplier
supplier_delays = orders_df.groupby('supplier_id')['delayed'].sum()
print("\nDelays by Supplier:\n", supplier_delays)

# Save to CSV
orders_df.to_csv("processed_orders.csv", index=False)
low_stock.to_csv("low_stock.csv", index=False)



Delays by Supplier:
 supplier_id
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
Name: delayed, dtype: int64


In [17]:
# Step 6: Logging (Week 5 Simulation)
log_text = f"Top Delayed Orders:\n{top_delayed_orders}\n\nLow Stock Items:\n{low_stock}\n\nDelays by Supplier:\n{supplier_delays}"

with open("pipeline_log.txt", "w") as f:
    f.write(log_text)

print("\nPipeline executed successfully. Check 'pipeline_log.txt' for details!")



Pipeline executed successfully. Check 'pipeline_log.txt' for details!


In [18]:
# Step 7: Download Output Files
files.download("pipeline_log.txt")
files.download("processed_orders.csv")
files.download("low_stock.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>