In [1]:
# NHSRC PHC SUPPLY CHAIN - EDA & NHSRC ALIGNMENT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("üìä NHSRC PHC EDA & ALIGNMENT ANALYSIS")
print("=" * 60)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

üìä NHSRC PHC EDA & ALIGNMENT ANALYSIS


In [3]:
# 1Ô∏è‚É£ DATA OVERVIEW
print("üìà 1. DATA OVERVIEW")
print("-" * 40)

# Load cleaned data
df = pd.read_csv('data/cleaned_inventory.csv')

üìà 1. DATA OVERVIEW
----------------------------------------


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6480 entries, 0 to 6479
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   6480 non-null   object 
 1   facility_id            6480 non-null   object 
 2   facility_name          6480 non-null   object 
 3   ward_id                6480 non-null   object 
 4   sku_id                 6480 non-null   object 
 5   sku_name               6480 non-null   object 
 6   units_used             6480 non-null   int64  
 7   on_hand                6480 non-null   int64  
 8   stock_in_transit       6480 non-null   int64  
 9   lead_time_days         6480 non-null   int64  
 10  batch_id               6480 non-null   object 
 11  batch_expiry_date      6480 non-null   object 
 12  price_per_unit         6480 non-null   float64
 13  is_holiday             6480 non-null   int64  
 14  patient_admissions     6480 non-null   int64  
 15  ved_

In [15]:
print(df['date'].dtype)

datetime64[ns]


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6480 entries, 0 to 6479
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2592 non-null   datetime64[ns]
 1   facility_id            6480 non-null   object        
 2   facility_name          6480 non-null   object        
 3   ward_id                6480 non-null   object        
 4   sku_id                 6480 non-null   object        
 5   sku_name               6480 non-null   object        
 6   units_used             6480 non-null   int64         
 7   on_hand                6480 non-null   int64         
 8   stock_in_transit       6480 non-null   int64         
 9   lead_time_days         6480 non-null   int64         
 10  batch_id               6480 non-null   object        
 11  batch_expiry_date      6480 non-null   datetime64[ns]
 12  price_per_unit         6480 non-null   float64       
 13  is_

In [17]:
# Basic statistics
overview = {
    "total_records": len(df),
    "date_range": f"{df['date'].min().date()} to {df['date'].max().date()}",
    "total_days": (df['date'].max() - df['date'].min()).days,
    "unique_facilities": df['facility_id'].nunique(),
    "unique_skus": df['sku_id'].nunique(),
    "total_consumption": df['units_used'].sum(),
    "avg_daily_consumption": df['units_used'].mean(),
    "std_daily_consumption": df['units_used'].std(),
    "missing_values": df.isna().sum().sum(),
    "outliers_detected": df['is_outlier'].sum()
}

print("üìä BASIC STATISTICS:")
for key, value in overview.items():
    print(f"  {key.replace('_', ' ').title()}: {value}")

# Consumption stability
stability_by_sku = df.groupby('sku_id')['units_used'].std() / df.groupby('sku_id')['units_used'].mean()
print(f"\nüìà CONSUMPTION STABILITY (CV by SKU):")
print(f"  Mean CV: {stability_by_sku.mean():.2%}")
print(f"  Max CV: {stability_by_sku.max():.2%}")
print(f"  Min CV: {stability_by_sku.min():.2%}")

üìä BASIC STATISTICS:
  Total Records: 6480
  Date Range: 2024-01-01 to 2024-12-06
  Total Days: 340
  Unique Facilities: 3
  Unique Skus: 12
  Total Consumption: 76647
  Avg Daily Consumption: 11.82824074074074
  Std Daily Consumption: 11.322148777140047
  Missing Values: 3888
  Outliers Detected: 145

üìà CONSUMPTION STABILITY (CV by SKU):
  Mean CV: 38.81%
  Max CV: 47.11%
  Min CV: 19.66%


In [21]:
# 2. IDENTIFY MISSING COLUMNS
print("\nüîç STEP 2: Identifying missing columns...")
missing_summary = df.isna().sum()
missing_columns = missing_summary[missing_summary > 0]

print("Columns with missing values:")
for col, count in missing_columns.items():
    percentage = count / len(df) * 100
    print(f"  {col}: {count:,} missing ({percentage:.1f}%)")


üîç STEP 2: Identifying missing columns...
Columns with missing values:
  date: 3,888 missing (60.0%)


In [None]:
# 2Ô∏è‚É£ VED ANALYSIS
print("\nüè• 2. VED ANALYSIS")
print("-" * 40)

# Compute VED metrics
ved_metrics = df.groupby('ved_category').agg({
    'units_used': ['sum', 'mean', 'std'],
    'on_hand': 'sum',
    'lead_time_days': 'mean',
    'days_cover': 'mean',
    'expiry_days_remaining': lambda x: (x < 90).sum() / len(x) * 100  # % at risk
}).round(2)

ved_metrics.columns = ['total_consumption', 'avg_daily_usage', 'std_daily_usage',
                      'total_stock', 'avg_lead_time', 'avg_coverage_days', 'pct_high_risk']

print("üìä VED METRICS:")
print(ved_metrics)

# Export VED summary
ved_summary = ved_metrics.reset_index()
ved_summary.to_csv('reports/ved_summary.csv', index=False)
print("\nüíæ Exported: reports/ved_summary.csv")

# VED Consumption Plot
plt.figure(figsize=(10, 6))
ved_consumption = df.groupby('ved_category')['units_used'].sum()
ved_bars = ved_consumption.plot(kind='bar', color=['#e74c3c', '#f39c12', '#2ecc71'])
plt.title('Total Consumption by VED Category', fontsize=14, fontweight='bold')
plt.ylabel('Total Units Consumed')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('reports/ved_consumption.png', dpi=300, bbox_inches='tight')

# VED √ó Expiry Risk Heatmap
plt.figure(figsize=(10, 6))
ved_expiry = pd.crosstab(df['ved_category'], df['expiry_risk_bucket'])
sns.heatmap(ved_expiry, annot=True, fmt='d', cmap='YlOrRd', linewidths=0.5)
plt.title('VED Category √ó Expiry Risk Distribution', fontsize=14, fontweight='bold')
plt.ylabel('VED Category')
plt.xlabel('Expiry Risk')
plt.tight_layout()
plt.savefig('reports/ved_expiry_heatmap.png', dpi=300, bbox_inches='tight')

print("üìà Plots saved: ved_consumption.png, ved_expiry_heatmap.png")

In [None]:
# 3Ô∏è‚É£ FSN ANALYSIS
print("\nüì¶ 3. FSN ANALYSIS")
print("-" * 40)

# Compute FSN metrics
fsn_metrics = df.groupby('fsn_category').agg({
    'units_used': ['sum', 'mean', 'count'],
    'on_hand': 'mean',
    'days_cover': 'mean',
    'expiry_days_remaining': lambda x: (x < 90).sum() / len(x) * 100
}).round(2)

fsn_metrics.columns = ['total_consumption', 'avg_daily_usage', 'record_count',
                      'avg_stock', 'avg_coverage_days', 'pct_high_risk']

# Calculate turnover ratio (simplified)
fsn_metrics['turnover_ratio'] = (fsn_metrics['total_consumption'] / 
                                (fsn_metrics['avg_stock'] * fsn_metrics['record_count'])).round(3)

print("üìä FSN METRICS:")
print(fsn_metrics)

# Export FSN summary
fsn_summary = fsn_metrics.reset_index()
fsn_summary.to_csv('reports/fsn_summary.csv', index=False)
print("\nüíæ Exported: reports/fsn_summary.csv")

# FSN-VED Matrix
plt.figure(figsize=(10, 6))
fsn_ved_matrix = pd.crosstab(df['fsn_category'], df['ved_category'])
sns.heatmap(fsn_ved_matrix, annot=True, fmt='d', cmap='Blues', linewidths=0.5)
plt.title('FSN √ó VED Matrix: Movement vs Criticality', fontsize=14, fontweight='bold')
plt.ylabel('FSN Category')
plt.xlabel('VED Category')
plt.tight_layout()
plt.savefig('reports/fsn_ved_matrix.png', dpi=300, bbox_inches='tight')

# Consumption Velocity Distribution
plt.figure(figsize=(12, 6))
sku_velocity = df.groupby(['sku_id', 'fsn_category'])['units_used'].mean().reset_index()
sns.boxplot(data=sku_velocity, x='fsn_category', y='units_used')
plt.title('Daily Consumption Distribution by FSN Category', fontsize=14, fontweight='bold')
plt.ylabel('Average Daily Units')
plt.xlabel('FSN Category')
plt.tight_layout()
plt.savefig('reports/fsn_velocity_distribution.png', dpi=300, bbox_inches='tight')

print("üìà Plots saved: fsn_ved_matrix.png, fsn_velocity_distribution.png")

In [None]:
# 4Ô∏è‚É£ FEFO EXPIRY RISK ANALYSIS
print("\nüìÖ 4. FEFO EXPIRY RISK ANALYSIS")
print("-" * 40)

# Compute expiry metrics
expiry_metrics = df.groupby('expiry_risk_bucket').agg({
    'on_hand': 'sum',
    'price_per_unit': 'mean',
    'sku_id': 'nunique',
    'batch_id': 'nunique'
}).round(2)

# Calculate financial risk
expiry_metrics['financial_risk'] = expiry_metrics['on_hand'] * expiry_metrics['price_per_unit']
expiry_metrics['financial_risk_pct'] = (expiry_metrics['financial_risk'] / 
                                      expiry_metrics['financial_risk'].sum() * 100).round(1)

print("üìä EXPIRY RISK METRICS:")
print(expiry_metrics[['on_hand', 'sku_id', 'batch_id', 'financial_risk', 'financial_risk_pct']])

# Export expiry summary
expiry_summary = expiry_metrics.reset_index()
expiry_summary.to_csv('reports/expiry_summary.csv', index=False)
print("\nüíæ Exported: reports/expiry_summary.csv")

# Monthly expiry projection
df['expiry_month'] = df['batch_expiry_date'].dt.to_period('M')
monthly_expiry = df[df['expiry_days_remaining'] <= 180].groupby('expiry_month').size()

plt.figure(figsize=(12, 6))
monthly_expiry.plot(kind='line', marker='o', color='#e74c3c')
plt.title('Monthly Expiry Projection (Next 6 Months)', fontsize=14, fontweight='bold')
plt.ylabel('Number of Batches Expiring')
plt.xlabel('Expiry Month')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('reports/monthly_expiry_projection.png', dpi=300, bbox_inches='tight')

# Critical SKU list (high risk + high consumption)
critical_skus = df[(df['expiry_risk_bucket'].isin(['CRITICAL (<30d)', 'HIGH (30‚Äì90d)'])) & 
                   (df['units_used'] > df['units_used'].quantile(0.75))]
critical_sku_list = critical_skus.groupby('sku_id').agg({
    'sku_name': 'first',
    'ved_category': 'first',
    'expiry_days_remaining': 'min',
    'units_used': 'sum'
}).sort_values('expiry_days_remaining').head(10)

print("\nüö® TOP 10 CRITICAL SKUs (High Risk + High Usage):")
print(critical_sku_list[['sku_name', 'ved_category', 'expiry_days_remaining', 'units_used']])

print("üìà Plot saved: monthly_expiry_projection.png")

In [None]:
# 5Ô∏è‚É£ STOCK HEALTH MATRIX (CORE NHSRC METRIC)
print("\nüõ°Ô∏è 5. STOCK HEALTH MATRIX")
print("-" * 40)

# Calculate ADC (Average Daily Consumption) per SKU per facility
adc_data = df.groupby(['facility_id', 'sku_id']).agg({
    'units_used': 'mean',
    'on_hand': 'last',  # Latest stock level
    'lead_time_days': 'mean',
    'ved_category': 'first',
    'fsn_category': 'first',
    'days_cover': 'last'
}).reset_index()

adc_data.rename(columns={'units_used': 'ADC', 'on_hand': 'current_stock'}, inplace=True)

# Calculate ROL (Reorder Level) - Simple NHSRC formula
adc_data['ROL'] = (adc_data['ADC'] * adc_data['lead_time_days'] * 1.5).round(0)  # Buffer included

# Calculate coverage gap
adc_data['coverage_gap'] = adc_data['days_cover'] - adc_data['lead_time_days']

# Action severity score (0-100, higher = more urgent)
def calculate_severity(row):
    score = 0
    
    # Days cover component (0-40 points)
    if row['days_cover'] < 7:
        score += 40
    elif row['days_cover'] < 14:
        score += 30
    elif row['days_cover'] < 30:
        score += 20
    elif row['days_cover'] < 60:
        score += 10
    
    # VED component (0-30 points)
    if row['ved_category'] == 'Vital':
        score += 30
    elif row['ved_category'] == 'Essential':
        score += 20
    else:
        score += 10
    
    # Coverage gap component (0-30 points)
    if row['coverage_gap'] < 0:
        score += 30
    elif row['coverage_gap'] < 7:
        score += 20
    elif row['coverage_gap'] < 14:
        score += 10
    
    return min(100, score)

adc_data['action_severity_score'] = adc_data.apply(calculate_severity, axis=1)

# Risk categorization
def categorize_risk(score):
    if score >= 70:
        return 'Critical'
    elif score >= 50:
        return 'Warning'
    elif score >= 30:
        return 'Caution'
    else:
        return 'Safe'

adc_data['risk_category'] = adc_data['action_severity_score'].apply(categorize_risk)

# Export stock health matrix
stock_health_matrix = adc_data[['facility_id', 'sku_id', 'ved_category', 'fsn_category',
                               'ADC', 'current_stock', 'lead_time_days', 'ROL', 
                               'days_cover', 'coverage_gap', 'action_severity_score', 
                               'risk_category']].sort_values('action_severity_score', ascending=False)

stock_health_matrix.to_csv('reports/stock_health_matrix.csv', index=False)
print("üíæ Exported: reports/stock_health_matrix.csv")
print(f"üìä Total SKU-Facility combinations: {len(stock_health_matrix)}")

# Risk category distribution
risk_distribution = stock_health_matrix['risk_category'].value_counts()
print("\nüìà RISK CATEGORY DISTRIBUTION:")
for category, count in risk_distribution.items():
    percentage = count / len(stock_health_matrix) * 100
    print(f"  {category}: {count} SKUs ({percentage:.1f}%)")

# Display first 10 rows
print("\nüîπ FIRST 10 ROWS OF STOCK_HEALTH_MATRIX.CSV:")
print(stock_health_matrix.head(10).to_string())

In [None]:
# FINAL OUTPUTS
print("\n" + "="*60)
print("üéØ TRAINER OUTPUTS")
print("="*60)

print("\n1. üîπ FIRST 10 ROWS OF STOCK_HEALTH_MATRIX.CSV:")
print("-" * 60)
print(stock_health_matrix.head(10).to_string())

print("\n2. üîπ COUNT OF SKUs IN EACH RISK CATEGORY:")
print("-" * 60)
for category, count in risk_distribution.items():
    print(f"  {category}: {count} SKUs")

print("\n3. üîπ UPDATED GIT LS-FILES:")
print("-" * 60)
import subprocess
result = subprocess.run(['git', 'ls-files'], capture_output=True, text=True)
print(result.stdout)

print("\n" + "="*60)
print("‚úÖ DAY 3 EDA & NHSRC ALIGNMENT COMPLETE")
print("="*60)