# Week 5 of learning python with TDI (The Data Immersed)


In [None]:
#Numpy Fundamentals
#NumPy is Python's numerical computing library. It's built on arrays, not lists. Arrays are faster, more efficient, 
# and designed specifically for mathematical operations. Everything you learned about lists translates here, 
# but numpy does it better. 

In [5]:
#Task 1: Load your cleaned supply chain dataset. Extract numerical columns (costs, quantities, 
#lead times) into NumPy arrays. 
import pandas as pd
import numpy as np

# Load the general cleaned dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Convert numerical columns into NumPy arrays
total_costs = df["total_cost"].to_numpy()
quantities = df["quantity"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# Verify arrays
print("Total Costs:", total_costs[:5], "Shape:", total_costs.shape, "Dtype:", total_costs.dtype)
print("Quantities:", quantities[:5], "Shape:", quantities.shape, "Dtype:", quantities.dtype)
print("Lead Time (days):", lead_time_days[:5], "Shape:", lead_time_days.shape, "Dtype:", lead_time_days.dtype)


Total Costs: [ 6062.06  1226.08 53976.5  25563.12 27188.33] Shape: (3000,) Dtype: float64
Quantities: [94 16 25 33 13] Shape: (3000,) Dtype: int64
Lead Time (days): [32 17 17 36 15] Shape: (3000,) Dtype: int64


In [3]:
#Task2:Perform Basic Array Operations
import numpy as np
import pandas as pd

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract arrays
total_costs = df["total_cost"].to_numpy()
unit_price = df["unit_price"].to_numpy()
quantities = df["quantity"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# 1. Calculate total revenue (total_cost * quantity for each order, then sum)
total_revenue = np.sum(unit_price * quantities)

# 2. Calculate average order value (mean of total_costs)
average_order_value = np.mean(total_costs)

# 3. Find minimum and maximum costs
min_cost = np.min(total_costs)
max_cost = np.max(total_costs)

# 4. Calculate the range of lead times (max - min)
lead_time_range = np.max(lead_time_days) - np.min(lead_time_days)

# 5. Multiply all costs by a shipping factor (e.g., 1.05 for 5% markup)
shipping_factor = 1.05
adjusted_costs = total_costs * shipping_factor

# Print results
print("Total Revenue:", total_revenue)
print("Average Order Value:", average_order_value)
print("Minimum Cost:", min_cost)
print("Maximum Cost:", max_cost)
print("Lead Time Range (days):", lead_time_range)
print("Adjusted Costs (first 5):", adjusted_costs[:5])


Total Revenue: 170043301.73
Average Order Value: 56679.366758932134
Minimum Cost: -207.32463
Maximum Cost: 1720595.4
Lead Time Range (days): 48
Adjusted Costs (first 5): [ 6365.163   1287.384  56675.325  26841.276  28547.7465]


In [6]:
#Task 3: Calculate Statistical Metrics
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract arrays
total_costs = df["total_cost"].to_numpy()
quantities = df["quantity"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# --- Cost Statistics ---
cost_mean = np.mean(total_costs) #mean
cost_median = np.percentile(total_costs, 50)   # median
cost_std = np.std(total_costs) #Standard Deviation
cost_min = np.min(total_costs) #Minimum
cost_max = np.max(total_costs) #Maximum
cost_25th = np.percentile(total_costs, 25) #25th Percentile
cost_75th = np.percentile(total_costs, 75) #75th Percentile

# --- Quantity Statistics ---
quantity_mean = np.mean(quantities)
quantity_std = np.std(quantities)
quantity_min = np.min(quantities)
quantity_max = np.max(quantities)

# --- Lead Time Statistics ---
lead_mean = np.mean(lead_time_days)
lead_std = np.std(lead_time_days)
lead_min = np.min(lead_time_days)
lead_max = np.max(lead_time_days)

# --- Identify Above-Average Costs ---
above_avg_cost_orders = df[total_costs > cost_mean]

# --- Identify Below-Average Lead Times ---
below_avg_lead_orders = df[lead_time_days < lead_mean]

# Print results
print("=== Cost Statistics ===")
print("Mean:", cost_mean)
print("Median:", cost_median)
print("Std Dev:", cost_std)
print("Min:", cost_min)
print("Max:", cost_max)
print("25th Percentile:", cost_25th)
print("75th Percentile:", cost_75th)

print("\n=== Quantity Statistics ===")
print("Mean:", quantity_mean)
print("Std Dev:", quantity_std)
print("Min:", quantity_min)
print("Max:", quantity_max)

print("\n=== Lead Time Statistics ===")
print("Mean:", lead_mean)
print("Std Dev:", lead_std)
print("Min:", lead_min)
print("Max:", lead_max)

print("\n=== Orders with Above-Average Costs ===")
print(above_avg_cost_orders[["order_id", "product_category", "total_cost"]].head())

print("\n=== Orders with Below-Average Lead Times ===")
print(below_avg_lead_orders[["order_id", "supplier_name", "lead_time_days"]].head())


=== Cost Statistics ===
Mean: 56679.366758932134
Median: 30911.190000000002
Std Dev: 83551.31469749018
Min: -207.32463
Max: 1720595.4
25th Percentile: 11236.387499999999
75th Percentile: 70688.5125

=== Quantity Statistics ===
Mean: 32.626666666666665
Std Dev: 31.364905370316183
Min: 5
Max: 489

=== Lead Time Statistics ===
Mean: 20.424666666666667
Std Dev: 10.216766851058553
Min: 5
Max: 53

=== Orders with Above-Average Costs ===
    order_id product_category  total_cost
9   Ord-0010           Motors   178357.92
11  Ord-0012            Pumps    67580.94
17  Ord-0018  Control Systems    98938.80
20  Ord-0021          Sensors    82061.98
21  Ord-0022  Control Systems    92695.86

=== Orders with Below-Average Lead Times ===
   order_id supplier_name  lead_time_days
1  Ord-0002    Supplier E              17
2  Ord-0003    Supplier A              17
4  Ord-0005     Supplierb              15
5  Ord-0006    Supplier A              10
6  Ord-0007     Supplierb              20


In [7]:
#Task 4 : Compare Supplier Performance Using Arrays 
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Get unique suppliers
suppliers = df["supplier_name"].unique()

# Dictionaries to store results
supplier_costs = {}
supplier_lead_times = {}
avg_costs = {}
avg_lead_times = {}

# Loop through each supplier
for supplier in suppliers:
    # Extract arrays for this supplier
    costs_array = df[df["supplier_name"] == supplier]["total_cost"].to_numpy()
    lead_array = df[df["supplier_name"] == supplier]["lead_time_days"].to_numpy()
    
    # Store arrays
    supplier_costs[supplier] = costs_array
    supplier_lead_times[supplier] = lead_array
    
    # Calculate averages
    avg_costs[supplier] = np.mean(costs_array)
    avg_lead_times[supplier] = np.mean(lead_array)

# Identify best supplier by delivery time (lowest average lead time)
best_delivery_supplier = max(avg_lead_times, key=avg_lead_times.get)

# Identify lowest cost supplier (lowest average cost)
lowest_cost_supplier = min(avg_costs, key=avg_costs.get)

# Print results
print("=== Average Cost per Supplier ===")
for supplier, avg in avg_costs.items():
    print(f"{supplier}: {avg:.2f}")

print("\n=== Average Lead Time per Supplier (days) ===")
for supplier, avg in avg_lead_times.items():
    print(f"{supplier}: {avg:.2f}")

print("\nSupplier with Best Average Delivery Time:", best_delivery_supplier)
print("Supplier with Lowest Average Cost:", lowest_cost_supplier)


=== Average Cost per Supplier ===
Suppliere: 7291.31
Supplier E: 7558.52
Supplier A: 87989.24
Supplierc: 42643.77
Supplierb: 61853.38
Supplierd: 60047.12
Supplier C: 56222.77
Suppliera: 93866.45
Supplier D: 75544.59
Supplier B: 72932.68

=== Average Lead Time per Supplier (days) ===
Suppliere: 23.21
Supplier E: 23.54
Supplier A: 9.94
Supplierc: 34.64
Supplierb: 14.57
Supplierd: 27.44
Supplier C: 34.39
Suppliera: 9.93
Supplier D: 27.77
Supplier B: 14.64

Supplier with Best Average Delivery Time: Supplierc
Supplier with Lowest Average Cost: Suppliere


In [4]:
#Task 5 : Detect Outliers Using Statistical Thresholds 
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract arrays
total_costs = df["total_cost"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# --- Cost Statistics ---
cost_mean = np.mean(total_costs)
cost_std = np.std(total_costs)

# High cost outliers (> mean + 2*std)
high_cost_outliers = df[total_costs > cost_mean + 2 * cost_std]

# Low cost outliers (< mean - 2*std)
low_cost_outliers = df[total_costs < cost_mean - 2 * cost_std]

# --- Lead Time Statistics ---
lead_mean = np.mean(lead_time_days)
lead_std = np.std(lead_time_days)

# High lead time outliers (> mean + 2*std)
high_lead_outliers = df[lead_time_days > lead_mean + 2 * lead_std]

# Low lead time outliers (< mean - 2*std)
low_lead_outliers = df[lead_time_days < lead_mean - 2 * lead_std]

# Print results
print("=== Cost Outliers ===")
print("High Cost Outliers (above mean + 2*std):")
print(high_cost_outliers[["order_id", "product_category", "total_cost"]])

print("\nLow Cost Outliers (below mean - 2*std):")
print(low_cost_outliers[["order_id", "product_category", "total_cost"]])

print("\n=== Lead Time Outliers ===")
print("High Lead Time Outliers (above mean + 2*std):")
print(high_lead_outliers[["order_id", "supplier_name", "lead_time_days"]])

print("\nLow Lead Time Outliers (below mean - 2*std):")
print(low_lead_outliers[["order_id", "supplier_name", "lead_time_days"]])


=== Cost Outliers ===
High Cost Outliers (above mean + 2*std):
      order_id product_category  total_cost
23    Ord-0024  Control Systems   264606.08
38    Ord-0039           Motors   377606.00
47    Ord-0048            Pumps   398828.92
50    Ord-0051           Motors   281957.60
70    Ord-0071  Control Systems   308307.87
...        ...              ...         ...
2880  Ord-2881           Motors   246296.02
2893  Ord-2894           Motors   235205.50
2923  Ord-2924           Motors   316447.35
2965  Ord-2966           Motors   233450.80
2980  Ord-2981  Control Systems   229896.00

[113 rows x 3 columns]

Low Cost Outliers (below mean - 2*std):
Empty DataFrame
Columns: [order_id, product_category, total_cost]
Index: []

=== Lead Time Outliers ===
High Lead Time Outliers (above mean + 2*std):
      order_id supplier_name  lead_time_days
15    Ord-0016    Supplier C              44
150   Ord-0151     Supplierc              42
168   Ord-0169     Supplierc              41
177   Ord-0178

In [5]:
#Challenge 1 : Create a "profit score" array by calculating (cost * quantity * profit_margin). Use 
#broadcasting to apply a different profit margin (0.20, 0.25, 0.30) and see how total profit changes. 

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract arrays
total_costs = df["total_cost"].to_numpy()
quantities = df["quantity"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# 1. Calculate total revenue (cost * quantity for each order, then sum)
total_revenue = np.sum(total_costs * quantities)

# 2. Calculate average order value
average_order_value = np.mean(total_costs)

# 3. Find minimum and maximum costs
min_cost = np.min(total_costs)
max_cost = np.max(total_costs)

# 4. Calculate the range of lead times (max - min)
lead_time_range = np.max(lead_time_days) - np.min(lead_time_days)

# 5. Multiply all costs by a shipping factor (e.g., 1.05 for 5% markup)
shipping_factor = 1.05
adjusted_costs = total_costs * shipping_factor

# Print results
print("Total Revenue:", total_revenue)
print("Average Order Value:", average_order_value)
print("Minimum Cost:", min_cost)
print("Maximum Cost:", max_cost)
print("Lead Time Range (days):", lead_time_range)
print("Adjusted Costs (first 5):", adjusted_costs[:5])


Total Revenue: 11052716104.374344
Average Order Value: 56679.366758932134
Minimum Cost: -207.32463
Maximum Cost: 1720595.4
Lead Time Range (days): 48
Adjusted Costs (first 5): [ 6365.163   1287.384  56675.325  26841.276  28547.7465]


In [6]:
#Challenge 2 : Challenge 2: 
#Use NumPy to find correlations. Extract two variables (e.g., quantity and cost, or lead_time 
#and on_time_delivery). Calculate the correlation between them. What does the result tell you about your supply chain?

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract arrays
quantities = df["quantity"].to_numpy()
total_costs = df["total_cost"].to_numpy()
lead_time_days = df["lead_time_days"].to_numpy()

# Convert 'on_time' column to numeric (Yes=1, No=0)
on_time_delivery = df["on_time"].apply(lambda x: 1 if str(x).lower() == "yes" else 0).to_numpy()

# --- Correlation Calculations ---
# Correlation between quantity and total_cost
corr_quantity_cost = np.corrcoef(quantities, total_costs)[0, 1]

# Correlation between lead_time_days and on_time_delivery
corr_lead_on_time = np.corrcoef(lead_time_days, on_time_delivery)[0, 1]

# Print results
print("Correlation between Quantity and Total Cost:", corr_quantity_cost)
print("Correlation between Lead Time and On-Time Delivery:", corr_lead_on_time)


Correlation between Quantity and Total Cost: 0.7002192416190304
Correlation between Lead Time and On-Time Delivery: -0.1423042948869898


In [7]:
#Challenge 3: Create a distribution analysis. Divide costs into bins (e.g., under $1000, $1000-$5000, 
#$5000+). Count how many orders fall into each bin. What does this tell you about your 
#order distribution?

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("supply_chain_cleaned.csv")

# Extract cost array
total_costs = df["total_cost"].to_numpy()

# Define bins: under $1000, $1000-$5000, $5000+
bins = [0, 1000, 5000, np.max(total_costs)]

# Use NumPy histogram to count orders in each bin
counts, bin_edges = np.histogram(total_costs, bins=bins)

# Print results
print("=== Cost Distribution ===")
for i in range(len(counts)):
    print(f"${bin_edges[i]} - ${bin_edges[i+1]}: {counts[i]} orders")


=== Cost Distribution ===
$0.0 - $1000.0: 37 orders
$1000.0 - $5000.0: 350 orders
$5000.0 - $1720595.4: 2612 orders


In [None]:
#Such an exciting week diving into NumPy fundamentals!
#Converting cleaned supply chain data into arrays, performing fast calculations, 
#and uncovering insights made analysis feel powerful and efficient.
#Looking forward to Week 6 and exploring Pandas to build unified views of the data!