In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
# Load the transactions table from the working directory.
df = pd.read_csv("supermarket_dataset.csv")

# Columns kept out of the feature matrix: the two prediction targets first,
# then the fields deliberately excluded as unhelpful identifiers/metadata.
excluded_columns = [
    "Leakage_Flag",
    "Anomaly_Type",
    "Invoice_Number",
    "Billing_Time",
    "Service_Category",
    "Transaction_Type",
    "Store_Branch",
    "Cashier_ID",
    "Supplier_ID",
]
X = df.drop(columns=excluded_columns)

# One LabelEncoder per target. Both encoders stay in scope so that integer
# predictions can be mapped back to the original string labels later on.
leakage_encoder = LabelEncoder()
anomaly_encoder = LabelEncoder()
leakage_codes = leakage_encoder.fit_transform(df["Leakage_Flag"])
anomaly_codes = anomaly_encoder.fit_transform(df["Anomaly_Type"])

# Two-column integer target frame; the column order here fixes the order of
# the outputs produced by the multi-output classifier.
y = pd.DataFrame({"Leakage_Flag": leakage_codes, "Anomaly_Type": anomaly_codes})

# Split the feature columns into two groups for the ColumnTransformer.
# Numeric and boolean columns are standardised; every other column is treated
# as categorical and one-hot encoded. Selecting on "number"/"bool" (rather
# than on "object") keeps text columns in the categorical group even when
# pandas stores them with a dedicated string or category dtype, which would
# otherwise be routed to StandardScaler and fail.
num_cols = X.select_dtypes(include=["number", "bool"]).columns
cat_cols = X.select_dtypes(exclude=["number", "bool"]).columns

# Preprocessor (OneHot for categorical, Scale for numeric).
# handle_unknown="ignore" encodes categories unseen during fit as all-zero
# rows instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols)
    ]
)

# XGBoost base estimator.
# use_label_encoder is intentionally not passed: both targets are already
# integer-encoded with LabelEncoder, and the flag is deprecated in current
# XGBoost releases (it only produces an "unused parameter" warning there).
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,        # row subsampling per tree
    colsample_bytree=0.8, # column subsampling per tree
    random_state=42,      # fixed seed so the subsampling is reproducible
    eval_metric="mlogloss"
)

# Multi-output wrapper: fits one independent copy of `xgb` per target column
# (Leakage_Flag and Anomaly_Type).
multi_xgb = MultiOutputClassifier(xgb)

# Build pipeline: preprocessing is fitted together with the classifier, so
# encoder/scaler statistics come only from the data passed to fit().
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("clf", multi_xgb)
])

# Hold out 20% of the rows for evaluation. The split is seeded and stratified
# on Leakage_Flag so both partitions keep that target's class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y["Leakage_Flag"],
    random_state=42,
)

# Fit preprocessing and both classifiers on the training partition only.
pipeline.fit(X_train, y_train)

# Integer-coded predictions, one column per target in the column order of y:
# column 0 is Leakage_Flag, column 1 is Anomaly_Type.
y_pred = pipeline.predict(X_test)
leakage_pred = y_pred[:, 0]
anomaly_pred = y_pred[:, 1]

# Map predicted codes back to the original string labels.
y_pred_df = pd.DataFrame({
    "Leakage_Flag": leakage_encoder.inverse_transform(leakage_pred),
    "Anomaly_Type": anomaly_encoder.inverse_transform(anomaly_pred),
})

# Same decoding for the held-out ground truth, so the reports show label names.
y_test_decoded = pd.DataFrame({
    "Leakage_Flag": leakage_encoder.inverse_transform(y_test["Leakage_Flag"]),
    "Anomaly_Type": anomaly_encoder.inverse_transform(y_test["Anomaly_Type"]),
})

# Per-target precision / recall / F1 on the test partition.
for heading, column in (
    ("Leakage_Flag Report:", "Leakage_Flag"),
    ("Anomaly_Type Report:", "Anomaly_Type"),
):
    print(heading)
    print(classification_report(y_test_decoded[column], y_pred_df[column]))


Leakage_Flag Report:
              precision    recall  f1-score   support

     Anomaly       0.99      0.86      0.92       259
  No Leakage       0.92      1.00      0.96       437

    accuracy                           0.95       696
   macro avg       0.96      0.93      0.94       696
weighted avg       0.95      0.95      0.94       696

Anomaly_Type Report:
                         precision    recall  f1-score   support

      Duplicate Entries       1.00      1.00      1.00        33
         Excess Payment       1.00      1.00      1.00        47
        Missing Charges       0.98      0.57      0.72        83
             No Anomaly       0.92      1.00      0.96       437
Payment Status Mismatch       1.00      1.00      1.00        41
          Under Payment       1.00      1.00      1.00        55

               accuracy                           0.95       696
              macro avg       0.98      0.93      0.95       696
           weighted avg       0.95      0.95

In [3]:
# Display the column labels of the loaded frame to see which fields exist.
df.columns

Index(['Invoice_Number', 'Customer_ID', 'Service_ID', 'Billing_Date',
       'Payment_Status', 'Leakage_Flag', 'Transaction_Type', 'Mode_of_Payment',
       'Product_Name', 'Service', 'Product_Category', 'Service_Category',
       'Product_Quantity', 'Tax_Amount', 'Actual_Amount', 'Billed_Amount',
       'Paid_Amount', 'Balance_Amount', 'Unit_Price', 'Tax_Rate',
       'Service_Charge', 'Discount_Amount', 'Store_Branch', 'Cashier_ID',
       'Supplier_ID', 'Billing_Time', 'Customer_Type', 'Order_Channel',
       'Anomaly_Type', 'Invoice_Num_Int', 'Is_Duplicate',
       'actual_billing_amnt'],
      dtype='object')

In [4]:
# Display the frame dimensions as a (rows, columns) tuple.
df.shape

(3477, 32)

In [8]:
# Load the second dataset and report which columns of `df` it does not have.
dfc = pd.read_csv("revenue_leak_balanced.csv")
missing_cols = set(df.columns).difference(dfc.columns)
print(missing_cols)

{'Invoice_Num_Int', 'actual_billing_amnt', 'Is_Duplicate'}
