# RC Pakistan Cargo & Logistics - Data Ingestion and Quality Assessment

This notebook covers:
1. Data ingestion from CSV files
2. Data quality assessment
3. Data profiling
4. Data validation and cleaning
5. Saving the cleaned data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while the
# CSV files are read and converted below — confirm that is intended.
warnings.filterwarnings('ignore')

# Display options: no cap on the number of columns shown, at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Title banner followed by a 60-character rule
print("RC Pakistan Cargo & Logistics - Data Engineering Pipeline", "=" * 60, sep="\n")

## 1. Data Ingestion

In [None]:
# Read the four source tables from ../data, one frame per CSV file.
# The files are read in the order listed: customers, bookings, shipments, payments.
customers_df, bookings_df, shipments_df, payments_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/customers.csv',
        '../data/bookings.csv',
        '../data/shipments.csv',
        '../data/payments.csv',
    )
)

# Report the (rows, columns) shape of each table that was loaded
print(
    "Dataset Shapes:",
    f"Customers: {customers_df.shape}",
    f"Bookings: {bookings_df.shape}",
    f"Shipments: {shipments_df.shape}",
    f"Payments: {payments_df.shape}",
    sep="\n",
)

## 2. Data Quality Assessment

In [None]:
def assess_data_quality(df, name):
    """Print a data-quality report for ``df`` and return the same frame.

    The report is written to standard output and contains, in order: a
    heading with ``name``, the frame's shape, its deep memory usage in MB,
    the per-column missing-value counts (only for columns that have any),
    the number of duplicated rows, and the column dtypes.

    Args:
        df: The pandas DataFrame to inspect. It is only read, never modified.
        name: Label shown in the report heading.

    Returns:
        The ``df`` object that was passed in.
    """
    # Heading and a 40-character rule
    print(f"\n{name} Data Quality Assessment:")
    print("-" * 40)

    # Dimensions and deep memory footprint (bytes converted to MB)
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Missing values: list only the columns that actually contain nulls
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    if columns_with_nulls.empty:
        print("\nNo missing values found")
    else:
        print("\nMissing Values:")
        print(columns_with_nulls)

    # Rows that repeat an earlier row across all columns
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate rows: {duplicates}")

    # Column dtypes as inferred by pandas
    print("\nData Types:")
    print(df.dtypes)

    return df

# Run the quality report on each table in turn (customers, bookings, shipments,
# payments); each name is rebound to whatever assess_data_quality returns.
customers_df, bookings_df, shipments_df, payments_df = (
    assess_data_quality(frame, label)
    for frame, label in (
        (customers_df, "Customers"),
        (bookings_df, "Bookings"),
        (shipments_df, "Shipments"),
        (payments_df, "Payments"),
    )
)

## 3. Data Profiling

In [None]:
# Customer distribution by city: one bar per distinct City value
fig, ax = plt.subplots(figsize=(12, 6))
customers_df['City'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Customer Distribution by City')
ax.set_xlabel('City')
ax.set_ylabel('Number of Customers')
plt.xticks(rotation=45)  # acts on the current axes, i.e. the one just created
fig.tight_layout()
plt.show()

# Booking status distribution: pie chart with each slice labelled by its percentage
fig, ax = plt.subplots(figsize=(10, 6))
bookings_df['Status'].value_counts().plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Booking Status Distribution')
ax.set_ylabel('')  # blank out the y-axis label on the pie
plt.show()

# Transport mode analysis: number of bookings per Mode value
fig, ax = plt.subplots(figsize=(8, 6))
bookings_df['Mode'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Transport Mode Distribution')
ax.set_xlabel('Transport Mode')
ax.set_ylabel('Number of Bookings')
plt.show()

## 4. Data Validation and Cleaning

In [None]:
# Convert date columns
# Columns to parse as datetimes, keyed by the name of the frame that holds them
date_columns = {
    'customers_df': ['CreatedDate'],
    'bookings_df': ['BookingDate'],
    'shipments_df': ['ShipmentDate', 'ExpectedDelivery'],
    'payments_df': ['Date']
}

# Explicit name -> frame lookup. Fetching the frames through locals() only
# works while this code runs at module (cell) scope; the same lines moved into
# a function would raise KeyError because the frames are not function locals.
frames_by_name = {
    'customers_df': customers_df,
    'bookings_df': bookings_df,
    'shipments_df': shipments_df,
    'payments_df': payments_df,
}

for df_name, cols in date_columns.items():
    df = frames_by_name[df_name]
    for col in cols:
        # Parsing is strict: pd.to_datetime raises if a value cannot be parsed
        df[col] = pd.to_datetime(df[col])
    print(f"Converted date columns in {df_name}: {cols}")

# Validate business rules
print("\nBusiness Rule Validations:")
print("-" * 30)

# Check if shipment date is after booking date.
# The merge uses pandas' default inner join on BookingID, so only bookings that
# have a matching shipment row are compared; rows whose ShipmentDate is strictly
# earlier than BookingDate are counted as invalid.
merged_dates = bookings_df.merge(shipments_df, on='BookingID')
invalid_dates = merged_dates[merged_dates['ShipmentDate'] < merged_dates['BookingDate']]
print(f"Invalid shipment dates (before booking): {len(invalid_dates)}")

# Check weight consistency: summary statistics only, nothing is filtered here
weight_stats = bookings_df['WeightKG'].describe()
print(f"\nWeight Statistics:")
print(weight_stats)

# Check for negative amounts: count payments with Amount below zero
negative_amounts = payments_df[payments_df['Amount'] < 0]
print(f"\nNegative payment amounts: {len(negative_amounts)}")

## 5. Save Cleaned Data

In [None]:
import os

# Make sure the output folder exists; exist_ok avoids an error when it already does
os.makedirs('../processed_data', exist_ok=True)

# Write each frame to its own CSV file, leaving the index out of the output
for frame, csv_path in (
    (customers_df, '../processed_data/customers_clean.csv'),
    (bookings_df, '../processed_data/bookings_clean.csv'),
    (shipments_df, '../processed_data/shipments_clean.csv'),
    (payments_df, '../processed_data/payments_clean.csv'),
):
    frame.to_csv(csv_path, index=False)

print(
    "Cleaned data saved to processed_data/ directory",
    "Data ingestion and quality assessment completed!",
    sep="\n",
)
print("Data ingestion and quality assessment completed!")