## Package Imports and Dataset Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer


In [None]:
# Location of the shipment pricing CSV (path under /content/drive, presumably
# a mounted Google Drive in Colab -- adjust when running elsewhere).
DATA_PATH = '/content/drive/MyDrive/Colab_Notebooks/Martin Marietta Pricing Case Study/data/seasonal_imbalanced_shipment_data.csv'

# Read the raw data and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,shipment_month,region,product_category,product_id,customer_id,transfer_flag,pickup_or_delivery,Job_distance,Shipment_qty,avg_price_per_ton
0,Mar,Region B,Electronics,P8462,C981783,External Customer,Pickup,10-15,8.31,972.4
1,May,Region D,Chemicals,P2695,C433440,External Customer,Pickup,15-20,16.53,957.58
2,Mar,Region C,Electronics,P7811,C808007,External Customer,Pickup,10-15,12.6,767.09
3,Apr,Region C,Electronics,P1569,C408920,Plant-to-Plant,Delivery,5-10,12.98,951.51
4,Jun,Region A,Chemicals,P1359,C793217,External Customer,Pickup,0-5,38.62,857.31


## Brief Exploratory Summary: Key Drivers of Price

In [None]:
# Quick exploratory model: one-hot encode the categorical drivers, fit an
# ordinary least-squares regression on price, and rank features by the
# absolute size of their coefficients.
features = ['shipment_month', 'region', 'product_category', 'transfer_flag',
            'pickup_or_delivery', 'Job_distance', 'Shipment_qty']

X = df[features]
y = df['avg_price_per_ton']

# Declare the numeric columns once and treat every other feature as
# categorical. Deriving the split from this list (rather than from inferred
# dtypes) keeps it explicit and independent of how pandas types the string
# columns on load.
numeric = ['Shipment_qty']
categorical = [col for col in features if col not in numeric]

# One-hot encode the categoricals. drop='first' removes one level per column,
# so each coefficient is measured relative to that dropped baseline level.
# The numeric column(s) are passed through unchanged via `remainder`.
preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first'), categorical),
    remainder='passthrough'
)

X_encoded = preprocessor.fit_transform(X)

model = LinearRegression()
model.fit(X_encoded, y)

# Summary table: coefficients sorted by absolute magnitude.
# NOTE: Shipment_qty is not scaled, so its coefficient is per unit of quantity
# and is not directly comparable in size to the 0/1 indicator coefficients.
feature_names = preprocessor.get_feature_names_out()
coefs = pd.Series(model.coef_, index=feature_names, name='Coefficient').sort_values(key=abs, ascending=False)

coefs.head(10)


Unnamed: 0,Coefficient
onehotencoder__product_category_Food,-324.08045
onehotencoder__product_category_Textiles,-215.812957
onehotencoder__region_Region D,110.426549
onehotencoder__product_category_Furniture,-107.893221
onehotencoder__product_category_Electronics,107.841228
onehotencoder__shipment_month_Dec,93.242612
onehotencoder__shipment_month_Feb,-87.152278
onehotencoder__shipment_month_Jan,-86.3647
onehotencoder__region_Region B,72.825614
onehotencoder__shipment_month_Nov,-71.999021


## Define Pricing Bands with Tree-Based Segmentation (CHAID-style logic)

Use a shallow `DecisionTreeRegressor` (binary CART splits) to approximate a CHAID-style segmentation of price:

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_text

# Shallow regression tree on the encoded features: at most three levels of
# splits, and every leaf must hold at least 50 shipments.
tree = DecisionTreeRegressor(
    max_depth=3,
    min_samples_leaf=50,
    random_state=42,
).fit(X_encoded, y)

# Render the fitted splits as readable text rules and show them
tree_rules = export_text(tree, feature_names=feature_names.tolist())
print(tree_rules)


|--- onehotencoder__product_category_Food <= 0.50
|   |--- onehotencoder__product_category_Textiles <= 0.50
|   |   |--- onehotencoder__product_category_Furniture <= 0.50
|   |   |   |--- value: [917.38]
|   |   |--- onehotencoder__product_category_Furniture >  0.50
|   |   |   |--- value: [755.93]
|   |--- onehotencoder__product_category_Textiles >  0.50
|   |   |--- onehotencoder__region_Region C <= 0.50
|   |   |   |--- value: [679.42]
|   |   |--- onehotencoder__region_Region C >  0.50
|   |   |   |--- value: [600.11]
|--- onehotencoder__product_category_Food >  0.50
|   |--- onehotencoder__region_Region C <= 0.50
|   |   |--- onehotencoder__shipment_month_Dec <= 0.50
|   |   |   |--- value: [559.11]
|   |   |--- onehotencoder__shipment_month_Dec >  0.50
|   |   |   |--- value: [644.06]
|   |--- onehotencoder__region_Region C >  0.50
|   |   |--- onehotencoder__shipment_month_Dec <= 0.50
|   |   |   |--- value: [493.70]
|   |   |--- onehotencoder__shipment_month_Dec >  0.50
|   |  

## Generate Pricing Bands for Selected Segments

In [None]:
# Segments of interest, keyed by a readable label
is_chemicals = df['product_category'].eq('Chemicals')
is_q1 = df['shipment_month'].isin(['Jan', 'Feb', 'Mar'])

segments = {
    'Electronics_Dec': df.query("product_category == 'Electronics' and shipment_month == 'Dec'"),
    'Furniture_July': df.query("product_category == 'Furniture' and shipment_month == 'Jul'"),
    'Chemicals_Q1': df[is_chemicals & is_q1],
}


def _price_band(prices):
    """Summarise a price series as a floor / middle / ceiling band.

    The floor and ceiling are the 10th and 90th percentiles of the series;
    the middle is its median.
    """
    return {
        'floor': prices.quantile(0.10),
        'middle': prices.median(),
        'ceiling': prices.quantile(0.90),
    }


# One band per segment, computed from that segment's observed prices
pricing_bands = {
    label: _price_band(subset['avg_price_per_ton'])
    for label, subset in segments.items()
}

# Segments as rows, band levels as columns
pd.DataFrame(pricing_bands).transpose()


Unnamed: 0,floor,middle,ceiling
Electronics_Dec,980.92,1096.67,1245.41
Furniture_July,721.51,821.53,938.214
Chemicals_Q1,692.622,799.93,927.16
