In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_excel('/content/supplier data.xlsx', sheet_name='Supplier Quality')

# Create reliability label (1 = Reliable, 0 = Unreliable)
threshold = data['Total Defect Qty'].median()
data['Reliability'] = (data['Total Defect Qty'] < threshold).astype(int)

# Encode categorical features
categorical_cols = ['Plant Location', 'Category', 'Material Type']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Features and target
X = data[categorical_cols]
y = data['Reliability']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Model Evaluation:\n")
print(classification_report(y_test, y_pred))

# Display valid options for cold-start prediction
print("\nValid categories for cold-start prediction:")
for col in categorical_cols:
    unique_labels = label_encoders[col].classes_
    print(f"- {col}: {list(unique_labels)}")

# Example: Predict new supplier (using a valid existing sample)
sample = data.iloc[0]
new_supplier = pd.DataFrame({
    'Plant Location': [sample['Plant Location']],
    'Category': [sample['Category']],
    'Material Type': [sample['Material Type']],
})

predicted_label = model.predict(new_supplier)[0]
result = 'Reliable' if predicted_label == 1 else 'Unreliable'
print(f"\nNew Supplier Predicted Reliability: {result}")


Model Evaluation:

              precision    recall  f1-score   support

           0       0.48      0.49      0.49       514
           1       0.50      0.49      0.49       532

    accuracy                           0.49      1046
   macro avg       0.49      0.49      0.49      1046
weighted avg       0.49      0.49      0.49      1046


Valid categories for cold-start prediction:
- Plant Location: ['Barling', 'Bloomingdale', 'Bruce Crossing', 'Charles City', 'Charlevoix', 'Chatham', 'Chesaning', 'Clarksville', 'Clay', 'Climax', 'Cottonwood', 'De Ruyter', 'Florence', 'Frazer', 'Garwood', 'Henning', 'Hingham', 'Jordan Valley', 'June Lake', 'Middletown', 'New Britain', 'Prescott', 'Reading', 'Ripton', 'Riverside', 'Savannah', 'Twin Rocks', 'Waldoboro', 'Weaverville', 'Westside']
- Category: ['Electrical', 'Goods & Services', 'Logistics', 'Materials & Components', 'Mechanicals', 'Packaging']
- Material Type: ['Batteries', 'Carton', 'Composites', 'Controllers', 'Corrugate', 'Crates'

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_excel('supplier data.xlsx', sheet_name='Supplier Quality')

# Extract date-based features
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['DayOfWeek'] = data['Date'].dt.dayofweek

# Create reliability label based solely on historical Total Defect Qty
threshold = data['Total Defect Qty'].median()
data['Reliability'] = (data['Total Defect Qty'] < threshold).astype(int)

# Encode categorical features
categorical_cols = ['Plant Location', 'Category', 'Material Type']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Use only intrinsic and time-based features (no defect info)
feature_cols = ['Plant Location', 'Category', 'Material Type', 'Year', 'Month', 'DayOfWeek']

X = data[feature_cols]
y = data['Reliability']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Cold-Start Model Evaluation:\n")
print(classification_report(y_test, y_pred))

# Example: Predict new cold-start supplier
sample = data.iloc[0]
new_supplier = pd.DataFrame({
    'Plant Location': [sample['Plant Location']],
    'Category': [sample['Category']],
    'Material Type': [sample['Material Type']],
    'Year': [sample['Year']],
    'Month': [sample['Month']],
    'DayOfWeek': [sample['DayOfWeek']],
})

predicted_label = model.predict(new_supplier)[0]
result = 'Reliable' if predicted_label == 1 else 'Unreliable'
print(f"\nNew Supplier Predicted Reliability (cold-start): {result}")


Cold-Start Model Evaluation:

              precision    recall  f1-score   support

           0       0.50      0.52      0.51       514
           1       0.52      0.50      0.51       532

    accuracy                           0.51      1046
   macro avg       0.51      0.51      0.51      1046
weighted avg       0.51      0.51      0.51      1046


New Supplier Predicted Reliability (cold-start): Unreliable


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load data
data = pd.read_excel('supplier data.xlsx', sheet_name='Supplier Quality')

# Extract date-based features
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['DayOfWeek'] = data['Date'].dt.dayofweek

# Create reliability label
threshold = data['Total Defect Qty'].median()
data['Reliability'] = (data['Total Defect Qty'] < threshold).astype(int)

# Encode categorical features
categorical_cols = ['Plant Location', 'Category', 'Material Type']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Features
feature_cols = ['Plant Location', 'Category', 'Material Type', 'Year', 'Month', 'DayOfWeek']
X = data[feature_cols]
y = data['Reliability']

# Step 1: Apply KMeans clustering on features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=5, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# Add cluster as a feature
X['Cluster'] = data['Cluster']

# Step 2: Handle imbalance using SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 3: Train model
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train_res, y_train_res)

# Evaluate
y_pred = model.predict(X_test)
print("Improved Cold-Start Model Evaluation:\n")
print(classification_report(y_test, y_pred))

# Example cold-start prediction (with cluster assigned)
sample = data.iloc[0]
sample_features = scaler.transform([[sample['Plant Location'], sample['Category'], sample['Material Type'],
                                     sample['Year'], sample['Month'], sample['DayOfWeek']]])
sample_cluster = kmeans.predict(sample_features)[0]

new_supplier = pd.DataFrame({
    'Plant Location': [sample['Plant Location']],
    'Category': [sample['Category']],
    'Material Type': [sample['Material Type']],
    'Year': [sample['Year']],
    'Month': [sample['Month']],
    'DayOfWeek': [sample['DayOfWeek']],
    'Cluster': [sample_cluster],
})

predicted_label = model.predict(new_supplier)[0]
result = 'Reliable' if predicted_label == 1 else 'Unreliable'
print(f"\nNew Supplier Predicted Reliability (improved): {result}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Cluster'] = data['Cluster']


Improved Cold-Start Model Evaluation:

              precision    recall  f1-score   support

           0       0.50      0.49      0.50       514
           1       0.52      0.53      0.52       532

    accuracy                           0.51      1046
   macro avg       0.51      0.51      0.51      1046
weighted avg       0.51      0.51      0.51      1046


New Supplier Predicted Reliability (improved): Unreliable




In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load data
data = pd.read_excel('supplier data.xlsx', sheet_name='Supplier Quality')

# Extract date-based features
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['DayOfWeek'] = data['Date'].dt.dayofweek

# Encode categorical features
categorical_cols = ['Plant Location', 'Category', 'Material Type']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Features for clustering
feature_cols = ['Plant Location', 'Category', 'Material Type', 'Year', 'Month', 'DayOfWeek']
X_features = data[feature_cols]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# Add cluster as a new feature
X_features['Cluster'] = data['Cluster']

# Define target
threshold = data['Total Defect Qty'].median()
data['Reliability'] = (data['Total Defect Qty'] < threshold).astype(int)
y = data['Reliability']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

# Balance dataset with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train_res, y_train_res)

# Evaluate model
y_pred = model.predict(X_test)
print("Cluster-based Cold-Start Model Evaluation:\n")
print(classification_report(y_test, y_pred))

# Example cold-start prediction
sample = data.iloc[0]
sample_features = scaler.transform([[sample['Plant Location'], sample['Category'],
                                     sample['Material Type'], sample['Year'],
                                     sample['Month'], sample['DayOfWeek']]])
sample_cluster = kmeans.predict(sample_features)[0]

new_supplier = pd.DataFrame({
    'Plant Location': [sample['Plant Location']],
    'Category': [sample['Category']],
    'Material Type': [sample['Material Type']],
    'Year': [sample['Year']],
    'Month': [sample['Month']],
    'DayOfWeek': [sample['DayOfWeek']],
    'Cluster': [sample_cluster],
})

predicted_label = model.predict(new_supplier)[0]
result = 'Reliable' if predicted_label == 1 else 'Unreliable'
print(f"\nNew Supplier Predicted Reliability (cluster-based): {result}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_features['Cluster'] = data['Cluster']


Cluster-based Cold-Start Model Evaluation:

              precision    recall  f1-score   support

           0       0.50      0.49      0.50       514
           1       0.52      0.53      0.52       532

    accuracy                           0.51      1046
   macro avg       0.51      0.51      0.51      1046
weighted avg       0.51      0.51      0.51      1046


New Supplier Predicted Reliability (cluster-based): Unreliable




In [11]:
# cluster_based_coldstart_procurement.py
# Run: python cluster_based_coldstart_procurement.py
# Requires: pandas, scikit-learn, imbalanced-learn, numpy

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# ---------- 1) Load dataset ----------
FILE = "/content/Procurement KPI Analysis Dataset.csv"
df = pd.read_csv(FILE)

# Quick check (uncomment to print)
# print(df.columns)
# print(df.head())

# ---------- 2) Clean / parse columns ----------
# Ensure date columns are datetime
df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Fill or drop rows with critical missing fields:
# We will drop rows missing Item_Category or Unit_Price or Quantity as they are essential.
df = df.dropna(subset=['Item_Category', 'Quantity', 'Unit_Price'])

# ---------- 3) Create cold-start friendly features ----------
# Date-derived features (onboarding/time info that can be known)
df['Order_Year'] = df['Order_Date'].dt.year.fillna(0).astype(int)
df['Order_Month'] = df['Order_Date'].dt.month.fillna(0).astype(int)
df['Order_DayOfWeek'] = df['Order_Date'].dt.dayofweek.fillna(0).astype(int)

# Price engineering
df['Unit_Price'] = pd.to_numeric(df['Unit_Price'], errors='coerce').fillna(df['Unit_Price'].median())
df['Negotiated_Price'] = pd.to_numeric(df['Negotiated_Price'], errors='coerce')
# If negotiated price is missing, use Unit_Price (safe fallback)
df['Negotiated_Price'] = df['Negotiated_Price'].fillna(df['Unit_Price'])
df['Price_Diff'] = df['Unit_Price'] - df['Negotiated_Price']
df['Price_Ratio'] = df['Negotiated_Price'] / (df['Unit_Price'] + 1e-9)

# Quantity
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(df['Quantity'].median())

# Drop any remaining rows with NaN in engineered numeric features
df = df.dropna(subset=['Unit_Price', 'Negotiated_Price', 'Quantity'])

# ---------- 4) Target: Compliance (suitable for reliability) ----------
# Map Compliance Yes -> 1 (reliable), No -> 0 (unreliable)
# If Compliance column contains other values or NaN, drop or fill with mode
if 'Compliance' not in df.columns:
    raise KeyError("Dataset must contain 'Compliance' column for this pipeline.")

df['Compliance'] = df['Compliance'].astype(str).str.strip().str.lower()
# Keep only rows with yes/no; others drop
df = df[df['Compliance'].isin(['yes', 'no'])]
df['Target'] = (df['Compliance'] == 'yes').astype(int)

# ---------- 5) Encode categorical features (cold-start-safe) ----------
# We will not use 'Supplier' string directly for cold-start predictions because a new supplier name will be unseen.
# Instead we use item-level and price/quantity/date features and add a Cluster feature.

cat_col = 'Item_Category'
le_cat = LabelEncoder()
df[cat_col + '_enc'] = le_cat.fit_transform(df[cat_col].astype(str))

# Save list of valid categories for later prediction
valid_categories = list(le_cat.classes_)
print("Valid Item_Category values:", valid_categories)

# ---------- 6) Prepare features for clustering ----------
feat_for_cluster = [
    cat_col + '_enc',
    'Quantity',
    'Unit_Price',
    'Negotiated_Price',
    'Price_Diff',
    'Price_Ratio',
    'Order_Year',
    'Order_Month',
    'Order_DayOfWeek'
]
X_cluster = df[feat_for_cluster].copy()

# Standardize for clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# ---------- 7) KMeans clustering to create supplier profiles ----------
# Choose n_clusters reasonably (try 4-8). We'll pick 6 here but you can tune it.
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Add Cluster to feature set (int)
df['Cluster'] = df['Cluster'].astype(int)

# ---------- 8) Final feature set for classifier (NO historical defect fields) ----------
feature_cols = [
    cat_col + '_enc',
    'Quantity',
    'Unit_Price',
    'Negotiated_Price',
    'Price_Diff',
    'Price_Ratio',
    'Order_Year',
    'Order_Month',
    'Order_DayOfWeek',
    'Cluster'
]
X = df[feature_cols]
y = df['Target']

# ---------- 9) Train/test split ----------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# ---------- 10) Balance training set with SMOTE (optional but helpful) ----------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# ---------- 11) Train classifier ----------
clf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42)
clf.fit(X_train_res, y_train_res)

# ---------- 12) Evaluate ----------
y_pred = clf.predict(X_test)
print("\nClassification Report (Compliance as target):\n")
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importances
importances = pd.Series(clf.feature_importances_, index=feature_cols).sort_values(ascending=False)
print("\nFeature importances:\n", importances)

# ---------- 13) Example: cold-start prediction for a NEW supplier ----------
# For cold-start we only need item category, quantity, unit_price, negotiated_price, order_date.
# Provide a dict for new supplier onboarding attributes:
new_supplier_info = {
    'Item_Category': 'Office Supplies',   # must be one of valid_categories (or we map unknown -> mode)
    'Quantity': 500,
    'Unit_Price': 25.0,
    'Negotiated_Price': 23.0,
    'Order_Date': '2024-07-10'
}

# Safe mapping for category: if unseen, map to most frequent category
cat_val = new_supplier_info['Item_Category']
if cat_val not in valid_categories:
    print(f"Warning: category '{cat_val}' not in training categories. Mapping to mode: {valid_categories[0]}")
    cat_val = valid_categories[0]

# Build feature row (same transforms)
order_date = pd.to_datetime(new_supplier_info['Order_Date'], errors='coerce')
oy = order_date.year if pd.notnull(order_date) else 0
om = order_date.month if pd.notnull(order_date) else 0
od = order_date.dayofweek if pd.notnull(order_date) else 0

unit_price = float(new_supplier_info['Unit_Price'])
neg_price = float(new_supplier_info.get('Negotiated_Price', unit_price))
qty = float(new_supplier_info['Quantity'])
price_diff = unit_price - neg_price
price_ratio = neg_price / (unit_price + 1e-9)

row = [
    int(le_cat.transform([cat_val])[0]),
    qty,
    unit_price,
    neg_price,
    price_diff,
    price_ratio,
    int(oy),
    int(om),
    int(od)
]

# scale and predict cluster, then assemble final row including cluster
row_scaled = scaler.transform([row])                # scaler expects same order used earlier
pred_cluster = int(kmeans.predict(row_scaled)[0])

# create final feature vector including cluster
final_row = np.array(row + [pred_cluster]).reshape(1, -1)
pred_label = clf.predict(final_row)[0]
prob = clf.predict_proba(final_row)[0]

print("\nCold-start prediction for NEW supplier attributes:")
print(new_supplier_info)
print("Assigned cluster:", pred_cluster)
print("Predicted Compliance (1 = reliable / yes):", int(pred_label))
print("Prediction probabilities (class 0 = No, class 1 = Yes):", prob)

# ---------- 14) Save models/artifacts (optional) ----------
# You may want to persist label encoder, scaler, kmeans, and classifier for production use.
# e.g. with joblib:
# import joblib
# joblib.dump(le_cat, 'le_item_category.joblib')
# joblib.dump(scaler, 'scaler.joblib')
# joblib.dump(kmeans, 'kmeans.joblib')
# joblib.dump(clf, 'rf_compliance.joblib')


Valid Item_Category values: ['Electronics', 'MRO', 'Office Supplies', 'Packaging', 'Raw Materials']

Classification Report (Compliance as target):

              precision    recall  f1-score   support

           0       0.24      0.32      0.27        28
           1       0.84      0.77      0.80       128

    accuracy                           0.69       156
   macro avg       0.54      0.55      0.54       156
weighted avg       0.73      0.69      0.71       156

Confusion matrix:
 [[ 9 19]
 [29 99]]

Feature importances:
 Quantity             0.149184
Unit_Price           0.126261
Price_Diff           0.117688
Negotiated_Price     0.110932
Cluster              0.106815
Order_Month          0.105682
Price_Ratio          0.096784
Order_DayOfWeek      0.090732
Item_Category_enc    0.052278
Order_Year           0.043645
dtype: float64

Cold-start prediction for NEW supplier attributes:
{'Item_Category': 'Office Supplies', 'Quantity': 500, 'Unit_Price': 25.0, 'Negotiated_Price': 23.



In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# Reload data
df = pd.read_csv("Procurement KPI Analysis Dataset.csv")

# Dates
df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Filter valid Compliance values
df['Compliance'] = df['Compliance'].astype(str).str.strip().str.lower()
df = df[df['Compliance'].isin(['yes','no'])]
df['Target'] = (df['Compliance'] == 'yes').astype(int)

# Features
df['Order_Year'] = df['Order_Date'].dt.year.fillna(0).astype(int)
df['Order_Month'] = df['Order_Date'].dt.month.fillna(0).astype(int)
df['Order_DayOfWeek'] = df['Order_Date'].dt.dayofweek.fillna(0).astype(int)

# Delivery delay (new feature)
df['Delivery_Delay'] = (df['Delivery_Date'] - df['Order_Date']).dt.days.fillna(0)

# Price engineering
df['Unit_Price'] = pd.to_numeric(df['Unit_Price'], errors='coerce')
df['Negotiated_Price'] = pd.to_numeric(df['Negotiated_Price'], errors='coerce').fillna(df['Unit_Price'])
df['Price_Diff'] = df['Unit_Price'] - df['Negotiated_Price']
df['Price_Discount'] = df['Price_Diff'] / (df['Unit_Price'] + 1e-9)

# Encode category
le_cat = LabelEncoder()
df['Item_Category_enc'] = le_cat.fit_transform(df['Item_Category'].astype(str))

# Drop missing critical rows
df = df.dropna(subset=['Unit_Price','Quantity'])

# Clustering
feat_for_cluster = ['Item_Category_enc','Quantity','Unit_Price','Negotiated_Price',
                    'Order_Year','Order_Month','Order_DayOfWeek']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[feat_for_cluster])
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Final features
feature_cols = ['Item_Category_enc','Quantity','Unit_Price','Negotiated_Price',
                'Price_Diff','Price_Discount','Order_Year','Order_Month',
                'Order_DayOfWeek','Delivery_Delay','Cluster']

X = df[feature_cols]
y = df['Target']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

# XGBoost Classifier with class imbalance handling
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(len(y_train[y_train==0]) / len(y_train[y_train==1])),
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Improved Model with XGBoost:\n")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importance
importances = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print("\nFeature importances:\n", importances)

# Example cold-start prediction
new_supplier = {
    'Item_Category': 'Office Supplies',
    'Quantity': 500,
    'Unit_Price': 25.0,
    'Negotiated_Price': 23.0,
    'Order_Date': '2024-07-10',
    'Delivery_Date': '2024-07-20'
}

order_date = pd.to_datetime(new_supplier['Order_Date'])
delivery_date = pd.to_datetime(new_supplier['Delivery_Date'])
features = {
    'Item_Category_enc': le_cat.transform([new_supplier['Item_Category']])[0],
    'Quantity': new_supplier['Quantity'],
    'Unit_Price': new_supplier['Unit_Price'],
    'Negotiated_Price': new_supplier['Negotiated_Price'],
    'Price_Diff': new_supplier['Unit_Price'] - new_supplier['Negotiated_Price'],
    'Price_Discount': (new_supplier['Unit_Price'] - new_supplier['Negotiated_Price'])/new_supplier['Unit_Price'],
    'Order_Year': order_date.year,
    'Order_Month': order_date.month,
    'Order_DayOfWeek': order_date.dayofweek,
    'Delivery_Delay': (delivery_date - order_date).days
}

# Add cluster
row_for_cluster = scaler.transform([[features['Item_Category_enc'], features['Quantity'],
                                     features['Unit_Price'], features['Negotiated_Price'],
                                     features['Order_Year'], features['Order_Month'], features['Order_DayOfWeek']]])
features['Cluster'] = kmeans.predict(row_for_cluster)[0]

# Predict
row = pd.DataFrame([features])
pred = model.predict(row)[0]
prob = model.predict_proba(row)[0]
print("\nNew Supplier Prediction (XGBoost):", "Reliable" if pred==1 else "Unreliable")
print("Probabilities:", prob)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Improved Model with XGBoost:

              precision    recall  f1-score   support

           0       0.27      0.32      0.30        28
           1       0.85      0.81      0.83       128

    accuracy                           0.72       156
   macro avg       0.56      0.57      0.56       156
weighted avg       0.74      0.72      0.73       156

Confusion Matrix:
 [[  9  19]
 [ 24 104]]

Feature importances:
 Order_Month          0.111764
Order_Year           0.107259
Cluster              0.103360
Quantity             0.093580
Delivery_Delay       0.093160
Price_Diff           0.090039
Price_Discount       0.082555
Item_Category_enc    0.082196
Unit_Price           0.079643
Order_DayOfWeek      0.079172
Negotiated_Price     0.077272
dtype: float32

New Supplier Prediction (XGBoost): Unreliable
Probabilities: [0.5433933 0.4566067]




In [13]:
from sklearn.metrics import precision_recall_curve

# Predict probabilities for class 1 (Reliable)
y_proba = model.predict_proba(X_test)[:,1]

# Default threshold is 0.5 → let's check PR curve to find a better cutoff
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

# Choose threshold where recall for class 0 improves (say ~0.4)
optimal_threshold = 0.4

y_pred_adjusted = (y_proba > optimal_threshold).astype(int)

print(f"\nClassification Report with Adjusted Threshold ({optimal_threshold}):")
print(classification_report(y_test, y_pred_adjusted))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))



Classification Report with Adjusted Threshold (0.4):
              precision    recall  f1-score   support

           0       0.30      0.25      0.27        28
           1       0.84      0.88      0.86       128

    accuracy                           0.76       156
   macro avg       0.57      0.56      0.57       156
weighted avg       0.75      0.76      0.75       156

Confusion Matrix:
 [[  7  21]
 [ 16 112]]


In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter grid
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 5),
    'min_child_weight': randint(1, 6)
}

# Randomized search
xgb_tuned = XGBClassifier(
    scale_pos_weight=(len(y_train[y_train==0]) / len(y_train[y_train==1])),
    random_state=42,
    eval_metric='logloss'
)

random_search = RandomizedSearchCV(
    xgb_tuned,
    param_distributions=param_dist,
    n_iter=30,  # number of random combinations to try
    scoring='f1_macro',  # to balance both classes
    cv=3,
    verbose=2,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("\nBest Parameters from RandomizedSearchCV:")
print(random_search.best_params_)

best_model = random_search.best_estimator_

# Evaluate tuned model
y_pred_best = best_model.predict(X_test)
print("\nClassification Report (Tuned Model):")
print(classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Fitting 3 folds for each of 30 candidates, totalling 90 fits

Best Parameters from RandomizedSearchCV:
{'colsample_bytree': np.float64(0.846833395335222), 'gamma': np.float64(0.2689151214665181), 'learning_rate': np.float64(0.1966409426915387), 'max_depth': 9, 'min_child_weight': 2, 'n_estimators': 366, 'subsample': np.float64(0.8706901871157914)}

Classification Report (Tuned Model):
              precision    recall  f1-score   support

           0       0.21      0.29      0.24        28
           1       0.83      0.77      0.80       128

    accuracy                           0.68       156
   macro avg       0.52      0.53      0.52       156
weighted avg       0.72      0.68      0.70       156

Confusion Matrix:
 [[ 8 20]
 [30 98]]


In [15]:
from sklearn.metrics import f1_score
import numpy as np

y_proba = best_model.predict_proba(X_test)[:,1]

best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred_thresh = (y_proba > t).astype(int)
    f1_class0 = f1_score(y_test, y_pred_thresh, pos_label=0)
    if f1_class0 > best_f1:
        best_f1 = f1_class0
        best_thresh = t

print(f"Best threshold for class 0: {best_thresh}, F1 (class 0): {best_f1}")

# Apply best threshold
y_pred_opt = (y_proba > best_thresh).astype(int)
print("\nClassification Report (Optimized Threshold):")
print(classification_report(y_test, y_pred_opt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))


Best threshold for class 0: 0.8000000000000002, F1 (class 0): 0.3448275862068966

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.23      0.71      0.34        28
           1       0.88      0.47      0.61       128

    accuracy                           0.51       156
   macro avg       0.55      0.59      0.48       156
weighted avg       0.76      0.51      0.56       156

Confusion Matrix:
 [[20  8]
 [68 60]]


In [16]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[('rf', clf), ('xgb', best_model)],
    voting='soft'
)

ensemble.fit(X_train, y_train)
y_pred_ens = ensemble.predict(X_test)

print("\nClassification Report (Ensemble RF+XGB):")
print(classification_report(y_test, y_pred_ens))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ens))



Classification Report (Ensemble RF+XGB):
              precision    recall  f1-score   support

           0       0.26      0.21      0.24        28
           1       0.83      0.87      0.85       128

    accuracy                           0.75       156
   macro avg       0.55      0.54      0.54       156
weighted avg       0.73      0.75      0.74       156

Confusion Matrix:
 [[  6  22]
 [ 17 111]]


In [17]:
from sklearn.utils import resample

df_majority = df[df.Target==1]
df_minority = df[df.Target==0]

df_majority_down = resample(df_majority,
                            replace=False,
                            n_samples=len(df_minority)*2,  # make it balanced or 2:1 ratio
                            random_state=42)

df_balanced = pd.concat([df_majority_down, df_minority])


In [18]:
# ---------------- Continue from your code ----------------

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Features (same as before)
feature_cols = ['Item_Category_enc','Quantity','Unit_Price','Negotiated_Price',
                'Price_Diff','Price_Discount','Order_Year','Order_Month',
                'Order_DayOfWeek','Delivery_Delay','Cluster']

X_bal = df_balanced[feature_cols]
y_bal = df_balanced['Target']

# Train-test split
from sklearn.model_selection import train_test_split
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42
)

# Retrain model (try XGBoost here)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

balanced_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss'
)

balanced_model.fit(X_train_bal, y_train_bal)

# Evaluate
y_pred_bal = balanced_model.predict(X_test_bal)
print("\nClassification Report (Balanced Training):")
print(classification_report(y_test_bal, y_pred_bal))
print("Confusion Matrix:\n", confusion_matrix(y_test_bal, y_pred_bal))

# Predict probabilities to allow threshold adjustment later
y_proba_bal = balanced_model.predict_proba(X_test_bal)[:,1]

# ---------------- Example Cold-Start Prediction ----------------
new_supplier = {
    'Item_Category': 'Office Supplies',
    'Quantity': 500,
    'Unit_Price': 25.0,
    'Negotiated_Price': 23.0,
    'Order_Date': '2024-07-10',
    'Delivery_Date': '2024-07-20'
}

import pandas as pd

order_date = pd.to_datetime(new_supplier['Order_Date'])
delivery_date = pd.to_datetime(new_supplier['Delivery_Date'])

features = {
    'Item_Category_enc': le_cat.transform([new_supplier['Item_Category']])[0],
    'Quantity': new_supplier['Quantity'],
    'Unit_Price': new_supplier['Unit_Price'],
    'Negotiated_Price': new_supplier['Negotiated_Price'],
    'Price_Diff': new_supplier['Unit_Price'] - new_supplier['Negotiated_Price'],
    'Price_Discount': (new_supplier['Unit_Price'] - new_supplier['Negotiated_Price'])/new_supplier['Unit_Price'],
    'Order_Year': order_date.year,
    'Order_Month': order_date.month,
    'Order_DayOfWeek': order_date.dayofweek,
    'Delivery_Delay': (delivery_date - order_date).days
}

# Predict cluster (same way as before)
row_for_cluster = scaler.transform([[features['Item_Category_enc'], features['Quantity'],
                                     features['Unit_Price'], features['Negotiated_Price'],
                                     features['Order_Year'], features['Order_Month'], features['Order_DayOfWeek']]])
features['Cluster'] = kmeans.predict(row_for_cluster)[0]

row = pd.DataFrame([features])
pred_bal = balanced_model.predict(row)[0]
prob_bal = balanced_model.predict_proba(row)[0]

print("\nNew Supplier Prediction (Balanced Model):", "Reliable" if pred_bal==1 else "Unreliable")
print("Probabilities:", prob_bal)



Classification Report (Balanced Training):
              precision    recall  f1-score   support

           0       0.57      0.43      0.49        28
           1       0.74      0.84      0.79        55

    accuracy                           0.70        83
   macro avg       0.66      0.63      0.64        83
weighted avg       0.68      0.70      0.69        83

Confusion Matrix:
 [[12 16]
 [ 9 46]]

New Supplier Prediction (Balanced Model): Reliable
Probabilities: [0.18783075 0.81216925]




In [19]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np

# Predict probabilities for class 1 (Reliable)
y_proba_bal = balanced_model.predict_proba(X_test_bal)[:,1]

best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.2, 0.8, 0.05):  # sweep thresholds
    y_pred_thresh = (y_proba_bal > t).astype(int)
    f1_class0 = f1_score(y_test_bal, y_pred_thresh, pos_label=0)
    if f1_class0 > best_f1:
        best_f1 = f1_class0
        best_thresh = t

print(f"\nBest threshold for class 0: {best_thresh}, F1 (class 0): {best_f1:.3f}")

# Apply best threshold
y_pred_opt = (y_proba_bal > best_thresh).astype(int)
print("\nClassification Report (Optimized Threshold):")
print(classification_report(y_test_bal, y_pred_opt))
print("Confusion Matrix:\n", confusion_matrix(y_test_bal, y_pred_opt))



Best threshold for class 0: 0.44999999999999996, F1 (class 0): 0.500

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.60      0.43      0.50        28
           1       0.75      0.85      0.80        55

    accuracy                           0.71        83
   macro avg       0.67      0.64      0.65        83
weighted avg       0.70      0.71      0.70        83

Confusion Matrix:
 [[12 16]
 [ 8 47]]


In [20]:
FILE = "/content/Procurement KPI Analysis Dataset.csv"
df = pd.read_csv(FILE)

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# --- Assume df is loaded procurement dataset ---

# Parse dates
df['Order_Date'] = pd.to_datetime(df['Order_Date'])
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'])

# New Features
df['Price_Ratio'] = df['Negotiated_Price'] / df['Unit_Price']
df['Discount_Amount'] = df['Unit_Price'] - df['Negotiated_Price']
df['Total_Cost'] = df['Quantity'] * df['Negotiated_Price']
df['High_Value_Flag'] = (df['Total_Cost'] > df['Total_Cost'].quantile(0.75)).astype(int)
df['Bulk_Order_Flag'] = (df['Quantity'] > df['Quantity'].quantile(0.75)).astype(int)
df['Order_Year'] = df['Order_Date'].dt.year
df['Order_Month'] = df['Order_Date'].dt.month
df['Order_DayOfWeek'] = df['Order_Date'].dt.dayofweek
df['Weekend_Order'] = df['Order_DayOfWeek'].isin([5,6]).astype(int)
df['Month_Quarter'] = df['Order_Date'].dt.quarter
df['Delivery_Delay'] = (df['Delivery_Date'] - df['Order_Date']).dt.days
df['Seasonal_Flag'] = df['Order_Month'].isin([11,12]).astype(int)  # example: year-end rush

# Encode Item Category
le_cat = LabelEncoder()
df['Item_Category_enc'] = le_cat.fit_transform(df['Item_Category'])

# Target = Compliance (Yes=1, No=0)
df['Target'] = df['Compliance'].map({'Yes':1, 'No':0})

# Select features
feature_cols = [
    'Item_Category_enc','Quantity','Unit_Price','Negotiated_Price',
    'Price_Ratio','Discount_Amount','Total_Cost','High_Value_Flag','Bulk_Order_Flag',
    'Order_Year','Order_Month','Order_DayOfWeek','Weekend_Order','Month_Quarter',
    'Delivery_Delay','Seasonal_Flag'
]

X = df[feature_cols]
y = df['Target']

# Balance dataset (downsample majority class)
from sklearn.utils import resample
df_majority = df[df.Target==1]
df_minority = df[df.Target==0]

df_majority_down = resample(df_majority,
                            replace=False,
                            n_samples=len(df_minority)*2,
                            random_state=42)

df_balanced = pd.concat([df_majority_down, df_minority]).sample(frac=1, random_state=42)

X_bal = df_balanced[feature_cols]
y_bal = df_balanced['Target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42)

# Train XGBoost with engineered features
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("\nClassification Report (Feature Engineered):")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importances
importances = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print("\nFeature Importances:\n", importances)



Classification Report (Feature Engineered):
              precision    recall  f1-score   support

           0       0.50      0.32      0.39        28
           1       0.71      0.84      0.77        55

    accuracy                           0.66        83
   macro avg       0.60      0.58      0.58        83
weighted avg       0.64      0.66      0.64        83

Confusion Matrix:
 [[ 9 19]
 [ 9 46]]

Feature Importances:
 Order_DayOfWeek      0.092583
Discount_Amount      0.090265
Unit_Price           0.083290
Order_Month          0.082059
Total_Cost           0.078383
Price_Ratio          0.071361
Item_Category_enc    0.070233
Order_Year           0.069315
Delivery_Delay       0.069109
Seasonal_Flag        0.068319
Negotiated_Price     0.067598
Quantity             0.059006
Weekend_Order        0.058952
Month_Quarter        0.028255
Bulk_Order_Flag      0.011272
High_Value_Flag      0.000000
dtype: float32


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# --- Feature Engineering ---
df['Order_Date'] = pd.to_datetime(df['Order_Date'])
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'])

df['Price_Ratio'] = df['Negotiated_Price'] / df['Unit_Price']
df['Discount_Amount'] = df['Unit_Price'] - df['Negotiated_Price']
df['Total_Cost'] = df['Quantity'] * df['Negotiated_Price']
df['Order_Year'] = df['Order_Date'].dt.year
df['Order_Month'] = df['Order_Date'].dt.month
df['Order_DayOfWeek'] = df['Order_Date'].dt.dayofweek
df['Weekend_Order'] = df['Order_DayOfWeek'].isin([5,6]).astype(int)
df['Month_Quarter'] = df['Order_Date'].dt.quarter
df['Delivery_Delay'] = (df['Delivery_Date'] - df['Order_Date']).dt.days
df['Seasonal_Flag'] = df['Order_Month'].isin([11,12]).astype(int)

# Encode Item Category
le_cat = LabelEncoder()
df['Item_Category_enc'] = le_cat.fit_transform(df['Item_Category'])

# Target = Compliance (Yes=1, No=0)
df['Target'] = df['Compliance'].map({'Yes':1, 'No':0})

# Features
feature_cols = [
    'Item_Category_enc','Quantity','Unit_Price','Negotiated_Price',
    'Price_Ratio','Discount_Amount','Total_Cost',
    'Order_Year','Order_Month','Order_DayOfWeek','Weekend_Order','Month_Quarter',
    'Delivery_Delay','Seasonal_Flag'
]

X = df[feature_cols]
y = df['Target']

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Cost-Sensitive XGBoost ---
scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])  # weight minority class
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight
)
model.fit(X_train, y_train)

# --- Default Evaluation ---
y_pred_default = model.predict(X_test)
print("\nDefault Threshold (0.5) Report:")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))

# --- Threshold Optimization ---
y_proba = model.predict_proba(X_test)[:,1]
best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.2, 0.8, 0.05):
    y_pred_thresh = (y_proba > t).astype(int)
    f1_class0 = f1_score(y_test, y_pred_thresh, pos_label=0)
    if f1_class0 > best_f1:
        best_f1 = f1_class0
        best_thresh = t

print(f"\nBest threshold for class 0: {best_thresh:.2f}, F1(class 0): {best_f1:.3f}")

# --- Apply Optimized Threshold ---
y_pred_opt = (y_proba > best_thresh).astype(int)
print("\nClassification Report (Optimized Threshold):")
print(classification_report(y_test, y_pred_opt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))

# --- Feature Importances ---
importances = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print("\nFeature Importances:\n", importances)



Default Threshold (0.5) Report:
              precision    recall  f1-score   support

           0       0.35      0.43      0.39        28
           1       0.87      0.83      0.85       128

    accuracy                           0.76       156
   macro avg       0.61      0.63      0.62       156
weighted avg       0.78      0.76      0.77       156

Confusion Matrix:
 [[ 12  16]
 [ 22 106]]

Best threshold for class 0: 0.45, F1(class 0): 0.414

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.40      0.43      0.41        28
           1       0.87      0.86      0.87       128

    accuracy                           0.78       156
   macro avg       0.64      0.64      0.64       156
weighted avg       0.79      0.78      0.78       156

Confusion Matrix:
 [[ 12  16]
 [ 18 110]]

Feature Importances:
 Month_Quarter        0.109470
Order_Year           0.084605
Order_Month          0.080003
Unit_Price      