In [33]:
import pandas as pd
import joblib

In [35]:
# === Load Model, Scaler, and Reference Columns ===
def load_model_artifacts():
    """Load the persisted inference artifacts from the working directory.

    Reads, in this order, ``model.pkl``, ``scaler.pkl`` and ``columns.pkl``
    with ``joblib.load`` and hands them back together.

    Returns
    -------
    tuple
        ``(model, scaler, reference_columns)`` exactly as stored on disk.

    Notes
    -----
    ``joblib.load`` unpickles the files, so only artifacts from a trusted
    source should be placed next to this notebook.
    """
    artifact_files = ("model.pkl", "scaler.pkl", "columns.pkl")
    return tuple(joblib.load(artifact_path) for artifact_path in artifact_files)

In [51]:
# === Apply Feature Engineering (same as training) ===
def apply_feature_engineering(df):
    """Derive the engineered risk features and their binary threshold flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Ordered_Qty``, ``Committed_Lead_Days``,
        ``Reliability_score``, ``Quality_Rejection_Rate (%)``,
        ``Avg_Transit_Days``, ``Weather_Disruption_Index``,
        ``Route Risk Score`` and ``Price_per_Unit``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every input column followed by six continuous
        features and ten 0/1 flag columns. The frame passed in is left
        untouched.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    A zero ``Committed_Lead_Days`` is not guarded against, so
    ``Order_Pressure`` (and therefore ``Stress_Score``) comes out as inf/NaN
    for such rows. The formulas are deliberately kept as-is because they are
    meant to mirror the training-time feature engineering.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    df = df.copy()

    # --- Continuous features ---
    df['Order_Pressure'] = df['Ordered_Qty'] / df['Committed_Lead_Days']
    df['Vendor_Risk'] = (1 - df['Reliability_score'] / 100) * df['Quality_Rejection_Rate (%)']
    df['Transit_Risk'] = df['Avg_Transit_Days'] * df['Weather_Disruption_Index'] * df['Route Risk Score']
    # The "+ 1" in the denominators keeps these two ratios finite when the divisor is 0.
    df['Demand_vs_Reliability'] = df['Ordered_Qty'] / (df['Reliability_score'] + 1)
    df['Stress_Score'] = df['Transit_Risk'] * df['Order_Pressure']
    df['Price_per_Unit_vs_Order'] = df['Price_per_Unit'] / (df['Ordered_Qty'] + 1)

    # --- Binary flags: (flag column, source feature, cut-off) ---
    # NOTE(review): the cut-offs are presumably the ones used at training time - confirm.
    high_flags = (
        ('High_Risk_Vendor', 'Vendor_Risk', 1.5),
        ('High_Order_Pressure', 'Order_Pressure', 20),
        ('High_Transit_Risk', 'Transit_Risk', 25),
        ('High_Demand_vs_Reliability', 'Demand_vs_Reliability', 8),
        ('High_Stress_Score', 'Stress_Score', 400),
    )
    low_flags = (
        ('Low_Order_Pressure', 'Order_Pressure', 6.2),
        ('Low_Stress_Score', 'Stress_Score', 40),
        ('Low_Demand_vs_Reliability', 'Demand_vs_Reliability', 3),
        ('Low_Vendor_Risk', 'Vendor_Risk', 0.5),
        ('Low_Price_per_Unit_vs_Order', 'Price_per_Unit_vs_Order', 0.05),
    )
    for flag, source, cutoff in high_flags:
        df[flag] = (df[source] > cutoff).astype(int)
    for flag, source, cutoff in low_flags:
        df[flag] = (df[source] < cutoff).astype(int)

    return df

In [53]:
# === Encode Categorical Features ===
def encode_features(df):
    """Turn the categorical input columns into numeric model features.

    * ``Component_ID``, ``Vendor_ID``, ``Route_ID`` and ``Source`` are replaced
      by integer category codes.
    * ``Peak_Congestion_Indicator`` is mapped ``Low -> 0``, ``Medium -> 1``,
      ``High -> 2``.
    * ``Mode`` becomes the indicator columns ``Mode_Lorry`` / ``Mode_Train``
      and ``Backup Route Availability`` becomes
      ``Backup Route Availability_Yes``; both source columns are then dropped.

    Every encoding is computed row by row, so a multi-row batch is encoded
    correctly (it is not derived from the first row only).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame. The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If a required column is missing, or if ``Peak_Congestion_Indicator``
        holds a value other than ``Low`` / ``Medium`` / ``High`` (including a
        missing value).
    """
    # Work on a copy so the caller's DataFrame keeps its original values.
    df = df.copy()

    id_cols = ['Component_ID', 'Vendor_ID', 'Route_ID', 'Source']
    for col in id_cols:
        # NOTE(review): the codes are assigned from the values present in THIS
        # frame only (a single-row input always yields 0), so they may not match
        # the encoding used at training time - confirm, and persist the
        # training-time mapping if the model depends on these columns.
        df[col] = df[col].astype('category').cat.codes

    congestion_levels = {'Low': 0, 'Medium': 1, 'High': 2}
    congestion = df['Peak_Congestion_Indicator']
    unknown = congestion[~congestion.isin(list(congestion_levels))]
    if not unknown.empty:
        # Same exception type a plain dict lookup of the bad label would raise.
        raise KeyError(unknown.iloc[0])
    df['Peak_Congestion_Indicator'] = congestion.map(congestion_levels).astype(int)

    # Row-wise indicator columns.
    df['Mode_Lorry'] = (df['Mode'] == 'Lorry').astype(int)
    df['Mode_Train'] = (df['Mode'] == 'Train').astype(int)
    df['Backup Route Availability_Yes'] = (df['Backup Route Availability'] == 'Yes').astype(int)

    return df.drop(columns=['Mode', 'Backup Route Availability'], errors='ignore')

In [57]:
# === Main Prediction Function ===
def predict_from_dataframe(df_input):
    """Score one order (dict) or a batch of orders (DataFrame) for shortfall.

    Parameters
    ----------
    df_input : dict or pandas.DataFrame
        A single record given as a ``{column: value}`` dict, or a DataFrame of
        raw records. It must provide the columns required by
        ``apply_feature_engineering`` and ``encode_features``.

    Returns
    -------
    pandas.DataFrame or None
        For a DataFrame input: a copy of the input with ``Shortfall_Prob`` and
        ``Predicted_Flag`` (1 when the probability is >= 0.5) appended.
        For a dict input: the result is printed and ``None`` is returned.

    Notes
    -----
    The caller's DataFrame is never modified; all transformations run on a
    private copy.
    """
    model, scaler, reference_columns = load_model_artifacts()

    # A dict is treated as exactly one record.
    is_single_row = isinstance(df_input, dict)
    if is_single_row:
        df_input = pd.DataFrame([df_input])

    # Transform a private copy: the helpers add and overwrite columns, and the
    # caller's frame (and the output built from it) must keep the raw values.
    features = apply_feature_engineering(df_input.copy())
    features = encode_features(features)
    # Align to the stored column layout; columns absent from the input are
    # filled with 0 and extra columns are dropped.
    features = features.reindex(columns=reference_columns, fill_value=0)

    # Scale + predict.
    # NOTE(review): column 1 of predict_proba is taken as the shortfall class -
    # confirm against model.classes_.
    scaled = scaler.transform(features)
    probs = model.predict_proba(scaled)[:, 1]
    preds = (probs >= 0.5).astype(int)

    if is_single_row:
        # For a single row, print a compact summary and return nothing.
        print("\n===== SINGLE ROW PREDICTION =====")
        print(f"Shortfall Probability: {probs[0]:.4f}")
        print(f"Predicted Shortfall Flag: {preds[0]}")
        return None

    # For a batch, return the raw input plus the two prediction columns.
    output_df = df_input.copy()
    output_df['Shortfall_Prob'] = probs
    output_df['Predicted_Flag'] = preds
    return output_df

In [87]:
# Single-record smoke test: a dict input takes the "single row" path of
# predict_from_dataframe, which prints the result and returns None.
test_input = {
    # Order and vendor figures consumed by apply_feature_engineering.
    'Ordered_Qty': 100,
    'Committed_Lead_Days': 30,
    'Reliability_score': 87,
    'Quality_Rejection_Rate (%)': 6,
    # Not used by the feature-engineering step; presumably raw model features - confirm.
    'collaboration_tenure': 2,
    'avg_lead_days': 7,
    'past_incident_count': 1,
    'Price_per_Unit': 88,
    # Categorical fields handled by encode_features.
    'Mode': 'Train',
    'Peak_Congestion_Indicator': 'Medium',
    'Backup Route Availability': 'Yes',
    # Route figures consumed by apply_feature_engineering.
    'Avg_Transit_Days': 3,
    'Weather_Disruption_Index': 1.0,
    'Route Risk Score': 2,
    # Identifier columns; encode_features replaces them with category codes.
    'Component_ID': 1,
    'Vendor_ID': 5,
    'Route_ID': 6,
    'Source': 15,
}

predict_from_dataframe(test_input)


===== SINGLE ROW PREDICTION =====
Shortfall Probability: 0.4242
Predicted Shortfall Flag: 0


In [49]:
# Batch smoke test: score the first five rows of the merged dataset and show them.
df_test = pd.read_csv("full_merged_data_v1.csv").head(5).copy()
result_df = predict_from_dataframe(df_test)
print(result_df.head())

  Order_ID Component_ID Vendor_ID Route_ID  Order_Date  \
0    O0001          C04      V009     R002  03-11-2001   
1    O0002          C01      V001     R022  10-11-2001   
2    O0003          C05      V001     R021  11-11-2001   
3    O0004          C01      V012     R014  11-11-2001   
4    O0005          C02      V002     R007  14-11-2001   

  Contractual_Delivery_Date Actual_Delivery_Date  Delivered_Qty  Ordered_Qty  \
0                01-01-2002           25-01-2002             50          580   
1                11-01-2002           07-01-2002             50          317   
2                01-01-2002           30-01-2002            124          140   
3                05-01-2002           23-01-2002             50           50   
4                06-01-2002           27-01-2002             50          362   

   Price_per_Unit  ...  vendor_id    vendor_name  Reliability_score  \
0           65.20  ...       V009   Iota Traders                 64   
1           25.50  ...      