# RC Pakistan Cargo & Logistics - Exploratory Data Analysis

This notebook covers:
1. Business metrics and KPIs
2. Route analysis (UAE to Pakistan/Kashmir)
3. Transport mode performance
4. Revenue analysis (by transport mode, month, and customer)
5. Monthly shipment and weight trends
6. Customer behavior insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence every library warning so the rendered cell output stays uncluttered.
warnings.filterwarnings('ignore')

# Plot theme: the seaborn-flavoured matplotlib style sheet plus the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook banner: the title line followed by a 60-character rule.
print(
    "RC Pakistan Cargo & Logistics - Exploratory Data Analysis",
    "=" * 60,
    sep="\n",
)

## 1. Load Star Schema Data

In [None]:
# Read the five dimension tables and the two fact tables of the star schema.
# Paths are relative to the directory the notebook is executed from; the
# tables are read in the order the names are listed.
(
    dim_date,
    dim_customer,
    dim_city,
    dim_transport,
    dim_status,
    fact_shipment,
    fact_revenue,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../star_schema/DimDate.csv',
        '../star_schema/DimCustomer.csv',
        '../star_schema/DimCity.csv',
        '../star_schema/DimTransportMode.csv',
        '../star_schema/DimStatus.csv',
        '../star_schema/FactShipment.csv',
        '../star_schema/FactRevenue.csv',
    )
)

# FullDate arrives as text from the CSV; parse it so min()/max() below
# compare real timestamps rather than strings.
dim_date['FullDate'] = pd.to_datetime(dim_date['FullDate'])

# Quick sanity report on what was loaded.
print(
    "Star schema data loaded successfully",
    f"Total shipments: {len(fact_shipment):,}",
    f"Total revenue records: {len(fact_revenue):,}",
    f"Date range: {dim_date['FullDate'].min()} to {dim_date['FullDate'].max()}",
    sep="\n",
)

## 2. Business KPIs Dashboard

In [None]:
# Headline business metrics. Each is a single scalar computed from the fact
# tables; the names are kept at notebook scope so later cells can reuse them.
total_shipments = len(fact_shipment)
total_revenue = fact_revenue['Amount'].sum()
# Guard the ratio: with an empty shipment table the division would yield
# inf/NaN instead of a displayable number.
avg_revenue_per_shipment = total_revenue / total_shipments if total_shipments else 0.0
total_weight = fact_shipment['WeightKG'].sum()
avg_transit_days = fact_shipment['TransitDays'].mean()
unique_customers = fact_shipment['CustomerKey'].nunique()

# One entry per KPI card: (title, value, optional number prefix).
# Cards are laid out left-to-right, top-to-bottom in this order.
KPI_GRID_COLUMNS = 3
kpi_cards = [
    ("Total Shipments", total_shipments, None),
    ("Total Revenue (AED)", total_revenue, 'AED '),
    ("Avg Revenue/Shipment", avg_revenue_per_shipment, 'AED '),
    ("Total Weight (KG)", total_weight, None),
    ("Avg Transit Days", avg_transit_days, None),
    ("Active Customers", unique_customers, None),
]
kpi_grid_rows = -(-len(kpi_cards) // KPI_GRID_COLUMNS)  # ceiling division

# Every grid cell hosts an Indicator trace. No subplot_titles are passed:
# each Indicator carries its own title, so passing both would render every
# label twice.
fig = make_subplots(
    rows=kpi_grid_rows, cols=KPI_GRID_COLUMNS,
    specs=[[{"type": "indicator"} for _ in range(KPI_GRID_COLUMNS)]
           for _ in range(kpi_grid_rows)]
)

for kpi_slot, (kpi_label, kpi_value, kpi_prefix) in enumerate(kpi_cards):
    kpi_number_style = {'font': {'size': 40}}
    if kpi_prefix:
        kpi_number_style['prefix'] = kpi_prefix
    fig.add_trace(
        go.Indicator(
            mode="number",
            value=kpi_value,
            title={"text": kpi_label},
            number=kpi_number_style,
        ),
        # Plotly grid positions are 1-based.
        row=kpi_slot // KPI_GRID_COLUMNS + 1,
        col=kpi_slot % KPI_GRID_COLUMNS + 1,
    )

fig.update_layout(height=400, title_text="RC Pakistan Cargo & Logistics - Key Performance Indicators")
fig.show()

## 3. Route Analysis

In [None]:
# Attach city attributes to every shipment by joining DimCity twice, once in
# the origin role and once in the destination role. The renames give each role
# its own column names so the two joins do not collide.
# merge() defaults to an inner join, so shipments whose city key has no
# matching DimCity row are left out of the route analysis.
origin_cities = dim_city.rename(
    columns={'CityKey': 'OriginCityKey', 'CityName': 'OriginCity', 'Country': 'OriginCountry'}
)
destination_cities = dim_city.rename(
    columns={'CityKey': 'DestinationCityKey', 'CityName': 'DestinationCity', 'Country': 'DestinationCountry'}
)
route_analysis = (
    fact_shipment
    .merge(origin_cities, on='OriginCityKey')
    .merge(destination_cities, on='DestinationCityKey')
)

# Human-readable route label, e.g. "Dubai → Lahore". The separator is a real
# right-arrow character (it had been stored as mis-decoded bytes).
route_analysis['Route'] = route_analysis['OriginCity'] + ' → ' + route_analysis['DestinationCity']

# Per-route volume, total weight and mean transit time; keep the ten busiest
# routes, ordered from most to fewest shipments.
top_routes = route_analysis.groupby('Route').agg({
    'ShipmentID': 'count',
    'WeightKG': 'sum',
    'TransitDays': 'mean'
}).rename(columns={'ShipmentID': 'Shipments'}).round(2)

top_routes = top_routes.sort_values('Shipments', ascending=False).head(10)

# Visualize top routes side by side: volume on the left, transit time on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Shipment volume by route
top_routes['Shipments'].plot(kind='barh', ax=ax1, color='skyblue')
ax1.set_title('Top 10 Routes by Shipment Volume')
ax1.set_xlabel('Number of Shipments')

# Average transit days by route
top_routes['TransitDays'].plot(kind='barh', ax=ax2, color='lightcoral')
ax2.set_title('Average Transit Days by Route')
ax2.set_xlabel('Days')

# Horizontal bars are drawn bottom-up, which would put the busiest route at
# the bottom; flip the axis so both panels read top-down in rank order.
ax1.invert_yaxis()
ax2.invert_yaxis()

plt.tight_layout()
plt.show()

print("Top 10 Routes Analysis:")
print(top_routes)

## 4. Transport Mode Performance

In [None]:
# Operational metrics per transport mode: shipment count, total and mean
# weight, and mean transit days.
transport_analysis = fact_shipment.merge(dim_transport, left_on='TransportModeKey', right_on='ModeKey')

transport_metrics = transport_analysis.groupby('ModeName').agg({
    'ShipmentID': 'count',
    'WeightKG': ['sum', 'mean'],
    'TransitDays': 'mean'
}).round(2)

# Flatten the two-level column index produced by the multi-aggregation.
transport_metrics.columns = ['Shipments', 'Total_Weight', 'Avg_Weight', 'Avg_Transit_Days']

# Revenue per transport mode. The mode is looked up through the shipment fact
# by BookingID. The lookup is de-duplicated first: if a booking appears on
# several shipment rows with the same mode, joining the raw fact table would
# repeat each revenue row once per shipment and inflate the totals.
# NOTE(review): a booking that spans several different modes would still have
# its revenue counted under each of them - confirm whether the schema allows that.
booking_modes = fact_shipment[['BookingID', 'TransportModeKey']].drop_duplicates()
revenue_by_transport = fact_revenue.merge(
    booking_modes, on='BookingID'
).merge(dim_transport, left_on='TransportModeKey', right_on='ModeKey')

transport_revenue = revenue_by_transport.groupby('ModeName').agg({
    'Amount': ['sum', 'mean'],
    'RevenuePerKG': 'mean'
}).round(2)

transport_revenue.columns = ['Total_Revenue', 'Avg_Revenue', 'Revenue_Per_KG']

# Combine metrics: both frames are indexed by ModeName, so a column-wise
# concat lines the operational and revenue figures up per mode.
transport_summary = pd.concat([transport_metrics, transport_revenue], axis=1)

print("Transport Mode Performance Analysis:")
print(transport_summary)

# Visualize transport mode comparison on a 2x2 grid:
# shares (pies) on the top row, per-mode averages (bars) on the bottom row.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Shipment distribution
transport_summary['Shipments'].plot(kind='pie', ax=ax1, autopct='%1.1f%%')
ax1.set_title('Shipment Distribution by Transport Mode')
ax1.set_ylabel('')  # drop the column name pandas would otherwise print beside the pie

# Revenue distribution
transport_summary['Total_Revenue'].plot(kind='pie', ax=ax2, autopct='%1.1f%%')
ax2.set_title('Revenue Distribution by Transport Mode')
ax2.set_ylabel('')

# Transit days comparison
transport_summary['Avg_Transit_Days'].plot(kind='bar', ax=ax3, color=['orange', 'green'])
ax3.set_title('Average Transit Days by Transport Mode')
ax3.set_ylabel('Days')
ax3.tick_params(axis='x', rotation=0)

# Revenue per KG comparison
transport_summary['Revenue_Per_KG'].plot(kind='bar', ax=ax4, color=['purple', 'brown'])
ax4.set_title('Revenue per KG by Transport Mode')
ax4.set_ylabel('AED per KG')
ax4.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 5. Temporal Analysis

In [None]:
# Monthly shipment volume and weight, bucketed by the shipment's booking date.
monthly_trends = fact_shipment.merge(
    dim_date.rename(columns={'DateKey': 'BookingDateKey'}),
    on='BookingDateKey'
)

monthly_summary = monthly_trends.groupby(['Year', 'Month', 'MonthName']).agg({
    'ShipmentID': 'count',
    'WeightKG': 'sum'
}).rename(columns={'ShipmentID': 'Shipments'}).reset_index()

# Monthly revenue, bucketed by payment date (not booking date).
monthly_revenue = fact_revenue.merge(
    dim_date.rename(columns={'DateKey': 'PaymentDateKey'}),
    on='PaymentDateKey'
).groupby(['Year', 'Month', 'MonthName'])['Amount'].sum().reset_index()

# Outer join so a month that has shipments but no payments (or the reverse)
# stays on the trend line with a zero instead of silently disappearing.
# Sorting by Year then Month makes the row order chronological.
monthly_analysis = (
    monthly_summary
    .merge(monthly_revenue, on=['Year', 'Month', 'MonthName'], how='outer')
    .sort_values(['Year', 'Month'])
    .reset_index(drop=True)
)
trend_columns = ['Shipments', 'WeightKG', 'Amount']
monthly_analysis[trend_columns] = monthly_analysis[trend_columns].fillna(0)
monthly_analysis['Shipments'] = monthly_analysis['Shipments'].astype(int)
monthly_analysis['Month_Year'] = monthly_analysis['MonthName'] + ' ' + monthly_analysis['Year'].astype(str)

# Plot monthly trends, one panel per measure.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 12))

# Plot against chronological row position and label ticks with "Month Year".
# Using the bare month number as x would stack the same month of different
# years on top of each other and make the line double back.
month_positions = list(range(len(monthly_analysis)))
trend_panels = [
    (ax1, 'Shipments', 'Monthly Shipment Volume Trend', 'Number of Shipments', {'marker': 'o'}),
    (ax2, 'Amount', 'Monthly Revenue Trend', 'Revenue (AED)', {'marker': 's', 'color': 'green'}),
    (ax3, 'WeightKG', 'Monthly Weight Volume Trend', 'Total Weight (KG)', {'marker': '^', 'color': 'red'}),
]
for panel_ax, panel_column, panel_title, panel_ylabel, panel_line_style in trend_panels:
    panel_ax.plot(month_positions, monthly_analysis[panel_column],
                  linewidth=2, markersize=8, **panel_line_style)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Month')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_xticks(month_positions)
    panel_ax.set_xticklabels(monthly_analysis['Month_Year'], rotation=45, ha='right')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Show the year-qualified label so rows from different years are distinguishable.
print("Monthly Business Trends:")
print(monthly_analysis[['Month_Year', 'Shipments', 'WeightKG', 'Amount']].round(2))

## 6. Customer Analysis

In [None]:
# Shipment-level and revenue-level fact rows enriched with customer attributes.
# Both merges are inner joins, so fact rows whose CustomerKey is missing from
# DimCustomer are excluded.
customer_analysis = fact_shipment.merge(dim_customer, on='CustomerKey')
customer_revenue = fact_revenue.merge(dim_customer, on='CustomerKey')

# Per-customer shipment count, total weight and mean transit days.
customer_metrics = customer_analysis.groupby(['CustomerKey', 'CustomerName', 'City']).agg({
    'ShipmentID': 'count',
    'WeightKG': 'sum',
    'TransitDays': 'mean'
}).rename(columns={'ShipmentID': 'Total_Shipments'}).reset_index()

# Per-customer total revenue.
customer_revenue_metrics = customer_revenue.groupby('CustomerKey').agg({
    'Amount': 'sum'
}).rename(columns={'Amount': 'Total_Revenue'}).reset_index()

# Inner join: only customers that have both shipments and revenue are summarised.
customer_summary = customer_metrics.merge(customer_revenue_metrics, on='CustomerKey')
customer_summary['Revenue_Per_Shipment'] = customer_summary['Total_Revenue'] / customer_summary['Total_Shipments']

# Top customers by revenue
top_customers = customer_summary.nlargest(10, 'Total_Revenue')

print("Top 10 Customers by Revenue:")
print(top_customers[['CustomerName', 'City', 'Total_Shipments', 'Total_Revenue', 'Revenue_Per_Shipment']].round(2))

# Customers per city. customer_analysis has one row per shipment, so counting
# its City column directly would count shipments; keep one row per customer
# first so the chart matches its "Number of Customers" axis.
city_distribution = (
    customer_analysis
    .drop_duplicates(subset='CustomerKey')['City']
    .value_counts()
)

# 2x2 overview: customers per city, shipment and revenue distributions, and
# the shipments-vs-revenue relationship.
customer_fig, customer_axes = plt.subplots(2, 2, figsize=(12, 8))
city_ax, shipments_ax, revenue_ax, scatter_ax = customer_axes.ravel()

city_distribution.plot(kind='bar', color='lightblue', ax=city_ax)
city_ax.set_title('Customer Distribution by City')
city_ax.set_xlabel('City')
city_ax.set_ylabel('Number of Customers')
city_ax.tick_params(axis='x', rotation=45)

customer_summary['Total_Shipments'].hist(bins=20, color='lightgreen', alpha=0.7, ax=shipments_ax)
shipments_ax.set_title('Distribution of Shipments per Customer')
shipments_ax.set_xlabel('Number of Shipments')
shipments_ax.set_ylabel('Number of Customers')

customer_summary['Total_Revenue'].hist(bins=20, color='salmon', alpha=0.7, ax=revenue_ax)
revenue_ax.set_title('Distribution of Revenue per Customer')
revenue_ax.set_xlabel('Total Revenue (AED)')
revenue_ax.set_ylabel('Number of Customers')

scatter_ax.scatter(customer_summary['Total_Shipments'], customer_summary['Total_Revenue'], alpha=0.6)
scatter_ax.set_title('Shipments vs Revenue per Customer')
scatter_ax.set_xlabel('Total Shipments')
scatter_ax.set_ylabel('Total Revenue (AED)')

plt.tight_layout()
plt.show()

## 7. Business Insights Summary

In [None]:
# Text summary of the analysis. Reads the scalars and summary tables produced
# by the KPI, route, transport-mode and customer cells; it computes nothing
# new beyond simple shares and averages.
print("RC PAKISTAN CARGO & LOGISTICS - BUSINESS INSIGHTS")
print("=" * 60)

print("\n📊 KEY PERFORMANCE METRICS:")
print(f"• Total Shipments: {total_shipments:,}")
print(f"• Total Revenue: AED {total_revenue:,.2f}")
print(f"• Average Revenue per Shipment: AED {avg_revenue_per_shipment:.2f}")
print(f"• Total Weight Handled: {total_weight:,.2f} KG")
print(f"• Average Transit Time: {avg_transit_days:.1f} days")
print(f"• Active Customer Base: {unique_customers:,} customers")

print("\n🚚 TRANSPORT MODE INSIGHTS:")
# Report every mode present in the data instead of looking up the literal
# labels 'Air' and 'Sea', which raises KeyError as soon as DimTransportMode
# uses different names or adds a mode.
for mode_name, mode_shipments in transport_summary['Shipments'].items():
    print(f"• {mode_name} Transport: {mode_shipments} shipments ({mode_shipments/total_shipments*100:.1f}%)")
for mode_name, mode_revenue in transport_summary['Total_Revenue'].items():
    print(f"• {mode_name} generates {mode_revenue/total_revenue*100:.1f}% of total revenue")

# Kept for any later cell that still reads these names; 0 when the mode is absent.
air_shipments = transport_summary['Shipments'].get('Air', 0)
sea_shipments = transport_summary['Shipments'].get('Sea', 0)
air_revenue = transport_summary['Total_Revenue'].get('Air', 0)
sea_revenue = transport_summary['Total_Revenue'].get('Sea', 0)

print("\n🗺️ ROUTE INSIGHTS:")
if not top_routes.empty:
    most_popular_route = top_routes.index[0]
    # Read the count from its own column: pulling it out of a whole row would
    # upcast it to float and print e.g. "123.0 shipments".
    most_popular_count = int(top_routes['Shipments'].iloc[0])
    print(f"• Most Popular Route: {most_popular_route} ({most_popular_count} shipments)")
print(f"• Top 3 Destinations: {route_analysis['DestinationCity'].value_counts().head(3).index.tolist()}")

print("\n👥 CUSTOMER INSIGHTS:")
avg_shipments_per_customer = customer_summary['Total_Shipments'].mean()
avg_revenue_per_customer = customer_summary['Total_Revenue'].mean()
print(f"• Average Shipments per Customer: {avg_shipments_per_customer:.1f}")
print(f"• Average Revenue per Customer: AED {avg_revenue_per_customer:.2f}")
if not top_customers.empty:
    print(f"• Top Customer Revenue: AED {top_customers.iloc[0]['Total_Revenue']:.2f}")

print("\n📈 BUSINESS RECOMMENDATIONS:")
# Name the mode that actually has the highest revenue per KG in the data
# rather than asserting a fixed mode.
revenue_per_kg_by_mode = transport_summary['Revenue_Per_KG'].dropna()
if not revenue_per_kg_by_mode.empty:
    best_rate_mode = revenue_per_kg_by_mode.idxmax()
    print(f"• Focus on {best_rate_mode} transport optimization - higher revenue per KG")
print("• Develop customer retention programs for high-value customers")
print("• Expand capacity on popular routes")
print("• Implement dynamic pricing based on route demand")
print("• Consider seasonal promotions during low-volume months")

print("\n✅ Exploratory Data Analysis Completed!")