In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import warnings
import joblib

# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Labelled transactions used to train the model.
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\datasets\supermarket_dataset.csv")

# The feature matrix is every column except the two targets and the
# columns deliberately left out of training.
target_cols = ["Leakage_Flag", "Anomaly_Type"]
excluded_cols = [
    "Invoice_Number",
    "Billing_Time",
    "Service_Category",
    "Transaction_Type",
    "Store_Branch",
    "Cashier_ID",
    "Supplier_ID",
]
X = df.drop(columns=target_cols + excluded_cols)

# One LabelEncoder per target, kept around so that integer predictions can
# be mapped back to the original labels later on.
leakage_encoder = LabelEncoder()
anomaly_encoder = LabelEncoder()

encoded_targets = {
    "Leakage_Flag": leakage_encoder.fit_transform(df["Leakage_Flag"]),
    "Anomaly_Type": anomaly_encoder.fit_transform(df["Anomaly_Type"]),
}
y = pd.DataFrame(encoded_targets)

# Split the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time); numeric columns are standardised.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
numeric_step = ("num", StandardScaler(), num_cols)

preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

# Hyper-parameters for the base gradient-boosted classifier.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases -
# confirm it is still needed for the installed version.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss",
)
xgb = XGBClassifier(**xgb_params)

# Wrap the base classifier so both target columns are predicted together.
multi_xgb = MultiOutputClassifier(xgb)

# Preprocessing followed by the multi-output classifier, as one estimator.
pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", multi_xgb),
])

# Hold out 20% of the rows for evaluation, stratified on the Leakage_Flag target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y["Leakage_Flag"],
)

# Fit on the training split, then predict the encoded labels for the hold-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Target name -> encoder, listed in the same order as the columns of `y`,
# so position i in `y_pred` belongs to the i-th entry here.
target_encoders = {
    "Leakage_Flag": leakage_encoder,
    "Anomaly_Type": anomaly_encoder,
}

# Decode predictions and ground truth back to the original string labels.
y_pred_df = pd.DataFrame({
    target: encoder.inverse_transform(y_pred[:, position])
    for position, (target, encoder) in enumerate(target_encoders.items())
})

y_test_decoded = pd.DataFrame({
    target: encoder.inverse_transform(y_test[target])
    for target, encoder in target_encoders.items()
})

# One classification report per target.
for target in target_encoders:
    print(f"{target} Report:")
    print(classification_report(y_test_decoded[target], y_pred_df[target]))

# --- Save trained pipeline + encoders ---
# All three artifacts are written to the same `saved_models` directory, which
# is where the inference cell loads them from.
MODEL_DIR = r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\saved_models"

artifacts = {
    "trained_pipeline.pkl": pipeline,
    "leakage_encoder.pkl": leakage_encoder,
    "anomaly_encoder.pkl": anomaly_encoder,
}

for filename, artifact in artifacts.items():
    # Build each path from the directory plus an explicit separator. Without
    # the separator the encoders would be written next to `saved_models`
    # (as `saved_modelsleakage_encoder.pkl` etc.) instead of inside it.
    joblib.dump(artifact, MODEL_DIR + "\\" + filename)

print("✅ Model and encoders saved!")


Leakage_Flag Report:
              precision    recall  f1-score   support

     Anomaly       0.99      0.86      0.92       259
  No Leakage       0.92      1.00      0.96       437

    accuracy                           0.95       696
   macro avg       0.96      0.93      0.94       696
weighted avg       0.95      0.95      0.94       696

Anomaly_Type Report:
                         precision    recall  f1-score   support

      Duplicate Entries       1.00      1.00      1.00        33
         Excess Payment       1.00      1.00      1.00        47
        Missing Charges       0.98      0.57      0.72        83
             No Anomaly       0.92      1.00      0.96       437
Payment Status Mismatch       1.00      1.00      1.00        41
          Under Payment       1.00      1.00      1.00        55

               accuracy                           0.95       696
              macro avg       0.98      0.93      0.95       696
           weighted avg       0.95      0.95

In [5]:
import pandas as pd
import joblib

# Restore the fitted pipeline and both label encoders from disk.
# joblib files are pickles: only load artifacts produced by a trusted run.
pipeline = joblib.load(
    r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\saved_models\trained_pipeline.pkl"
)
leakage_encoder = joblib.load(
    r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\saved_models\leakage_encoder.pkl"
)
anomaly_encoder = joblib.load(
    r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\saved_models\anomaly_encoder.pkl"
)

# Load the new, unlabelled dataset that is to be scored.
new_df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\datasets\input_dataset_cleaned.csv")

# Integer form of the invoice number (the "INV" text removed), used to order
# the rows. `regex=False` makes the literal replacement explicit regardless
# of the pandas version's default.
new_df['Invoice_Num_Int'] = (
    new_df['Invoice_Number'].str.replace("INV", "", regex=False).astype(int)
)
new_df = new_df.sort_values(by='Invoice_Num_Int').reset_index(drop=True)

# Is_Duplicate is 1 for every row whose Invoice_Number appears more than once
# in the dataset and 0 otherwise. `duplicated(keep=False)` marks all members
# of a duplicate group directly, so the flag does not rely on equal invoice
# numbers ending up on neighbouring rows after the sort.
new_df['Is_Duplicate'] = new_df['Invoice_Number'].duplicated(keep=False).astype(int)

# Billing amount derived from its components: amount + tax + service charge - discount.
new_df["actual_billing_amnt"] = (
    new_df["Actual_Amount"]
    + new_df["Tax_Amount"]
    + new_df["Service_Charge"]
    - new_df["Discount_Amount"]
)

# The input is not expected to carry the targets; drop them if they are present.
new_df = new_df.drop(columns=['Leakage_Flag', 'Anomaly_Type'], errors='ignore')

# Drop the same non-feature columns that were excluded when training.
# NOTE(review): the derived columns (Invoice_Num_Int, Is_Duplicate,
# actual_billing_amnt) stay in X_new - confirm which of them the training
# dataset actually contains.
X_new = new_df.drop(columns=[
    "Invoice_Number", "Billing_Time",
    "Service_Category", "Transaction_Type",
    "Store_Branch", "Cashier_ID", "Supplier_ID"
], errors="ignore")

# Score the new rows; the prediction array has one column per target.
y_pred = pipeline.predict(X_new)

# Map the encoded predictions back to their original labels.
leakage_labels = leakage_encoder.inverse_transform(y_pred[:, 0])
anomaly_labels = anomaly_encoder.inverse_transform(y_pred[:, 1])
pred_df = pd.DataFrame({
    "Leakage_Flag_Pred": leakage_labels,
    "Anomaly_Type_Pred": anomaly_labels,
})

# Attach the predictions column-wise to the scored rows.
new_df_with_preds = pd.concat([new_df.reset_index(drop=True), pred_df], axis=1)

# Full result set.
output_path = r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\output_datasets\new_supermarket_with_predictions.csv"
new_df_with_preds.to_csv(output_path, index=False)

# Split the rows by predicted leakage flag and write each subset to its own file.
predicted_flag = new_df_with_preds["Leakage_Flag_Pred"]
no_leakage_df = new_df_with_preds[predicted_flag == "No Leakage"]
anomaly_df = new_df_with_preds[predicted_flag == "Anomaly"]

no_leakage_df.to_csv(
    r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\output_datasets\no_leakage_data.csv",
    index=False,
)
anomaly_df.to_csv(
    r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\output_datasets\anomaly_data.csv",
    index=False,
)
print(f"✅ Predictions added and saved to {output_path}")


✅ Predictions added and saved to C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\super_market\output_datasets\new_supermarket_with_predictions.csv
