# Week 2- Data Processing with Python

# Step 1: Uploading the 3 CSV files

In [None]:
from google.colab import files
uploaded = files.upload()


Saving customer_table.csv to customer_table.csv
Saving delivery_status_table.csv to delivery_status_table.csv
Saving orders_table.csv to orders_table (1).csv


# Step 2: Import Libraries and Load the Data

In [None]:
import pandas as pd
import numpy as np

# load all CSVs
orders = pd.read_csv("orders_table.csv")
customers = pd.read_csv("customer_table.csv")
delivery_status = pd.read_csv("delivery_status_table.csv")

# preview
print("Orders:")
print(orders.head())
print("\nCustomers:")
print(customers.head())
print("\nDelivery Status:")
print(delivery_status.head())


Orders:
   order_id  customer_id  order_date delivery_date     status
0         1            1  2024-07-01    2024-07-03  delivered
1         2            2  2024-07-05    2024-07-08  delivered
2         3            3  2024-07-10    2024-07-12    delayed
3         4            4  2024-07-12    2024-07-15    delayed
4         5            1  2024-07-18    2024-07-20    delayed

Customers:
   customer_id          name               email       phone   region
0            1   rahul kumar   rahul@example.com  9876543210    north
1            2  anita sharma   anita@example.com  9123456780    south
2            3     vijay rao   vijay@example.com  9988776655     east
3            4  swathi menon  swathi@example.com  9090909090     west
4            5     arjun das   arjun@example.com  9012345678  central

Delivery Status:
   delivery_id  order_id    current_status         last_updated
0            1         1         delivered  2024-07-03 10:00:00
1            2         2         delivered

# Step 3: Check for Missing Values

In [None]:
# check for missing values
print("\nMissing values in orders:\n", orders.isnull().sum())
print("\nMissing values in customers:\n", customers.isnull().sum())
print("\nMissing values in delivery_status:\n", delivery_status.isnull().sum())



Missing values in orders:
 order_id         0
customer_id      0
order_date       0
delivery_date    0
status           0
dtype: int64

Missing values in customers:
 customer_id    0
name           0
email          0
phone          0
region         0
dtype: int64

Missing values in delivery_status:
 delivery_id       0
order_id          0
current_status    0
last_updated      0
dtype: int64


# Step 4: Clean Missing Values


In [None]:
# Drop rows with any missing values
orders.dropna(inplace=True)
customers.dropna(inplace=True)
delivery_status.dropna(inplace=True)




# Step 5: Convert Timestamps to Datetime Format

In [None]:
orders['order_date'] = pd.to_datetime(orders['order_date'])
orders['delivery_date'] = pd.to_datetime(orders['delivery_date'])
delivery_status['last_updated'] = pd.to_datetime(delivery_status['last_updated'])


# Step 6: Calculate Delay in Days

In [None]:
orders['delay_days'] = (pd.Timestamp.today() - orders['delivery_date']).dt.days
print(orders[['order_id', 'delivery_date', 'delay_days']].head())



   order_id delivery_date  delay_days
0         1    2024-07-03         392
1         2    2024-07-08         387
2         3    2024-07-12         383
3         4    2024-07-15         380
4         5    2024-07-20         375


# Step 7: Add a Flag Column (delayed)

In [None]:
orders['delayed'] = np.where(orders['delay_days'] > 0, 1, 0)
print(orders[['order_id', 'delay_days', 'delayed']].head())



   order_id  delay_days  delayed
0         1         392        1
1         2         387        1
2         3         383        1
3         4         380        1
4         5         375        1


# Step 8: Delay Summary by Customer


In [None]:
delay_summary = orders.groupby('customer_id')['delayed'].sum().sort_values(ascending=False)

print("Delayed Orders per Customer:")
display(delay_summary)


Delayed Orders per Customer:


Unnamed: 0_level_0,delayed
customer_id,Unnamed: 1_level_1
1,2
2,1
3,1
4,1


# Step 9: Display and save the cleaned data (Orders)

In [None]:
# Display the cleaned orders data
print("\nCleaned Orders Data (after removing missing values):")
display(orders)

# Save the cleaned data to a new CSV
orders.to_csv("cleaned_orders.csv", index=False)




Cleaned Orders Data (after removing missing values):


Unnamed: 0,order_id,customer_id,order_date,delivery_date,status,delay_days,delayed
0,1,1,2024-07-01,2024-07-03,delivered,392,1
1,2,2,2024-07-05,2024-07-08,delivered,387,1
2,3,3,2024-07-10,2024-07-12,delayed,383,1
3,4,4,2024-07-12,2024-07-15,delayed,380,1
4,5,1,2024-07-18,2024-07-20,delayed,375,1


In [None]:
# Download the file in Colab
from google.colab import files
files.download("cleaned_orders.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>