In [1]:
# Install required libraries (Colab)
!pip install pandas numpy pyspark requests pymongo mysql-connector-python -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.9/33.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import requests
from datetime import datetime

In [3]:
#  Import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [4]:
# Creating Spark Session
spark = SparkSession.builder.appName("DevOps_Pipeline").getOrCreate()
spark

In [5]:
# Uploading required files
from google.colab import files
uploaded = files.upload()

Saving customers.csv to customers.csv
Saving delivery_status.csv to delivery_status.csv
Saving orders.csv to orders.csv


In [6]:
# Load Uploaded Data
customers_df = pd.read_csv("customers.csv")
delivery_status_df = pd.read_csv("delivery_status.csv")
orders_df = pd.read_csv("orders.csv")

print("Data loaded successfully!\n")
print("Customers sample:\n", customers_df.head())
print("\nDelivery Status sample:\n", delivery_status_df.head())
print("\nOrders sample:\n", orders_df.head())

Data loaded successfully!

Customers sample:
    customer_id          name        contact_info           address
0            1  Anjali Mehta  anjali@example.com     Mumbai, India
1            2  Rohit Sharma   rohit@example.com      Delhi, India
2            3   Priya Reddy   priya@example.com  Hyderabad, India
3            4   Karan Patel   karan@example.com  Ahmedabad, India
4            5    Neha Singh    neha@example.com       Pune, India

Delivery Status sample:
    status_id  order_id current_status           updated_at
0          1         1      Delivered  2025-07-04 18:00:00
1          2         2      Delivered  2025-07-05 17:30:00
2          3         3        Shipped  2025-07-06 12:45:00
3          4         4     Processing  2025-07-07 10:15:00
4          5         5      Cancelled  2025-07-08 09:00:00

Orders sample:
    order_id  customer_id  product_id  order_date delivery_date      status
0         1            1         101  2025-07-01    2025-07-04   Delivered
1    

In [7]:
# Step 2: Data Cleaning
orders_df["order_date"] = pd.to_datetime(orders_df["order_date"])
orders_df["delivery_date"] = pd.to_datetime(orders_df["delivery_date"])
orders_df.dropna(inplace=True)

In [8]:
# Step 3: Delay Calculation
orders_df["delay_days"] = (pd.Timestamp.today() - orders_df["delivery_date"]).dt.days
orders_df["delayed"] = np.where(orders_df["delay_days"] > 0, 1, 0)

In [11]:
# Step 4: Insights
# Top delayed customers
top_delayed_customers = orders_df.groupby("customer_id")["delayed"].sum().sort_values(ascending=False)

# Join with customer region
orders_with_customer = orders_df.merge(customers_df, on="customer_id", how="left")
delays_by_region = orders_with_customer.groupby("address")["delayed"].sum()
print(delays_by_region)

address
Ahmedabad, India    1
Bangalore, India    1
Chennai, India      1
Delhi, India        1
Hyderabad, India    1
Jaipur, India       1
Kochi, India        1
Kolkata, India      1
Mumbai, India       1
Pune, India         1
Name: delayed, dtype: int64


In [12]:
# Step 5: Save Outputs
# Step 6: Logging
orders_df.to_csv("processed_orders.csv", index=False)

log_text = f"""
=== Delay Summary Log ===
Top Delayed Customers:
{top_delayed_customers.head(10)}

Delays by Region:
{delays_by_region}

Pipeline executed at {datetime.now()}
"""

# Step 7: Save & Outputs
with open("pipeline_log.txt", "w") as f:
    f.write(log_text)

print("Log file created: pipeline_log.txt")
print("Pipeline execution completed successfully!")

Log file created: pipeline_log.txt
Pipeline execution completed successfully!


In [13]:
# Step 8: Download output files
from google.colab import files
files.download("pipeline_log.txt")
files.download("processed_orders.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>