In [None]:

import pandas as pd
import json
import numpy as np
import re

# Load raw data
df = pd.read_csv(r'C:\Users\sagni\Downloads\GenAI Data Drift Firewall\GenAI Data Drift Firewall\data_drift.csv')

# Load reference schema
with open("genai_schema_reference.json", "r") as f:
    schema = json.load(f)

# Step 1: Column name/schema issues
expected_columns = set(schema.keys())
actual_columns = set(df.columns)

missing_columns = list(expected_columns - actual_columns)
unexpected_columns = list(actual_columns - expected_columns)

schema_column_issues = {
    "missing_columns": missing_columns,
    "unexpected_columns": unexpected_columns
}

# Save and print column-level issues
with open("schema_column_issues.json", "w") as f:
    json.dump(schema_column_issues, f, indent=2)

print("⚠️ Column Name Issues:")
print(json.dumps(schema_column_issues, indent=2))

# Step 2: Row-level validation (check ALL violations, even if value is missing)
available_schema = {col: rules for col, rules in schema.items() if col in df.columns}
violations = []

for index, row in df.iterrows():
    row_issues = {}

    for col, rules in available_schema.items():
        value = row.get(col, None)

        # Always check missing/blank
        if pd.isna(value) or (isinstance(value, str) and value.strip() == ""):
            row_issues[col] = {
                "issue": "Missing/blank value"
            }

        # Categorical check
        if rules["type"] == "categorical":
            if not isinstance(value, str):
                row_issues.setdefault(col, {})["issue"] = "Wrong data type for categorical"
                row_issues[col]["value"] = value
                row_issues[col]["expected_type"] = "string"
            elif value not in rules["valid_values"]:
                row_issues.setdefault(col, {})["issue"] = "Invalid category"
                row_issues[col]["value"] = value
                row_issues[col]["expected"] = rules["valid_values"]

        # Numeric check
        elif rules["type"] == "numeric":
            try:
                val = float(value)
                if val < rules["min_estimate"] or val > rules["max_estimate"]:
                    row_issues.setdefault(col, {})["issue"] = "Out of expected range"
                    row_issues[col]["value"] = val
                    row_issues[col]["expected_range"] = [rules["min_estimate"], rules["max_estimate"]]
            except:
                row_issues.setdefault(col, {})["issue"] = "Wrong data type"
                row_issues[col]["value"] = value
                row_issues[col]["expected_type"] = "numeric"

        # Datetime check
        elif rules["type"] == "datetime":
            try:
                pd.to_datetime(value)
            except:
                row_issues.setdefault(col, {})["issue"] = "Invalid datetime"
                row_issues[col]["value"] = value

        # Integer check
        elif rules["type"] == "int64":
            try:
                val = float(value)
                if not val.is_integer():
                    raise ValueError
            except:
                row_issues.setdefault(col, {})["issue"] = "Expected integer"
                row_issues[col]["value"] = value

    if row_issues:
        violations.append({
            "row": int(index),
            "problems": row_issues
        })

# Save violations
with open("genai_firewall_violations.json", "w") as f:
    json.dump(violations, f, indent=2)

# Show sample violations
print("\n🔍 Row-level Violations (Top 3):")
print(json.dumps(violations[:3], indent=2))

print(f"\n✅ Completed firewall check:\n- Column issues saved to schema_column_issues.json\n- Row violations saved to genai_firewall_violations.json")


⚠️ Column Name Issues:
{
  "missing_columns": [
    "order_date_(DateOrders)",
    "Product_Description"
  ],
  "unexpected_columns": [
    "untracked_feature",
    "Order_Item_Qty",
    "Cust_State"
  ]
}

🔍 Row-level Violations (Top 3):
[
  {
    "row": 5,
    "problems": {
      "Sales_per_customer": {
        "issue": "Wrong data type",
        "value": "unknown",
        "expected_type": "numeric"
      }
    }
  },
  {
    "row": 6,
    "problems": {
      "Days_for_shipping_(real)": {
        "issue": "Wrong data type",
        "value": "five",
        "expected_type": "numeric"
      },
      "Order_Item_Product_Price": {
        "issue": "Wrong data type",
        "value": "cheap",
        "expected_type": "numeric"
      }
    }
  },
  {
    "row": 7,
    "problems": {
      "Delivery_Status": {
        "issue": "Invalid category",
        "value": "arriving_soon",
        "expected": [
          "Late delivery",
          "Advance shipping",
          "Shipping on time",
   

In [6]:
import pandas as pd
import json
import numpy as np
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch



# === New GenAI Fixing step starts here ===

print("\n🛠️ Starting GenAI auto-fixing of violations...\n")

# Load model and tokenizer once (can be moved to a separate cell if needed)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to("cpu")
model.eval()

def extract_json(text):
    try:
        if "[/INST]" in text:
            text = text.split("[/INST]", 1)[1].strip()
        json_start = text.find("{")
        json_end = text.rfind("}") + 1
        json_str = text[json_start:json_end]
        return json.loads(json_str)
    except Exception as e:
        print(f"⚠️ JSON parsing failed: {e}")
        return None

fixes = {}

for violation in violations:
    row_id = violation["row"]
    problems = violation["problems"]
    fixes.setdefault(str(row_id), {})

    for col, issue in problems.items():
        rule = schema.get(col)
        if not rule:
            print(f"⚠️ No schema for column '{col}', skipping fix...")
            continue

        try:
            row_data = df.iloc[row_id].to_dict()
            row_context = {k: v for k, v in row_data.items() if k != col}
            context_json = json.dumps(row_context, separators=(',', ':'))

            if rule["type"] == "numeric":
                rule_str = f"The value must be a float between {rule['min_estimate']} and {rule['max_estimate']}."
            elif rule["type"] == "categorical":
                valid = rule.get("valid_values", [])
                rule_str = f"The value must be one of: {', '.join([f'\"{v}\"' for v in valid])}."
            else:
                rule_str = "The value must follow schema rules."

            format_hint = f'{{"{row_id}": {{"{col}": VALUE}}}}'

            prompt = f"""[INST]
You are a data cleaner. Fix the column '{col}' using the row context and schema rule.

Column: {col}
Rule: {rule_str}

Row context (excluding the problematic column):
{context_json}

Output ONLY a valid JSON object in this format (no explanation, no notes):
{format_hint}
[/INST]"""

            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cpu")

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            print(f"\n🔁 Raw response:\n{response}")

            fixed_value = extract_json(response)

            if (
                isinstance(fixed_value, dict)
                and str(row_id) in fixed_value
                and col in fixed_value[str(row_id)]
            ):
                value = fixed_value[str(row_id)][col]
                print(f"✅ Parsed fix for row {row_id}, column '{col}': {value}")
                fixes[str(row_id)][col] = value
            else:
                print(f"❌ Could not parse fix for row {row_id}, column '{col}'")

        except RuntimeError as e:
            print(f"💥 Runtime error fixing row {row_id}, column {col}: {e}")

        finally:
            gc.collect()

print("\n🔧 Final Fixes (JSON):\n")
print(json.dumps(fixes, indent=2))



🛠️ Starting GenAI auto-fixing of violations...



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



🔁 Raw response:
[INST]
You are a data cleaner. Fix the column 'Sales_per_customer' using the row context and schema rule.

Column: Sales_per_customer
Rule: The value must be a float between 8.39 and 1919.99.

Row context (excluding the problematic column):
{"Type":"PAYMENT","Days_for_shipping_(real)":"4","Days_for_shipment_(scheduled)":4,"Benefit_per_order":-60.79,"Delivery_Status":"Shipping on time","Late_delivery_risk":0,"Category_Id":71,"Category_Name":"Music","Customer_City":"Del Rio","Customer_Country":"EE. UU.","Customer_Email":"XXXXXXXXX","Customer_Fname":"Mira","Customer_Id":16096,"Customer_Lname":"Macdonald","Customer_Password":"XXXXXXXXX","Customer_Segment":"Home Office","Cust_State":"TX","Customer_Street":"8675 Lazy Pathway","Customer_Zipcode":78840,"Department_Id":9,"Department_Name":"Discs Shop","Latitude":29.7731,"Longitude":-100.6122,"Market":"Pacific Asia","Order_City":"Hefei","Order_Country":"China","Order_Customer_Id":16096,"Order_Id":72543,"Order_Item_Cardprod_Id":1

In [7]:
# Apply fixes to the dataframe
for row_id, cols in fixes.items():
    for col, val in cols.items():
        # Update the dataframe cell with the fixed value
        df.at[int(row_id), col] = val

# Save the fixed dataframe to CSV
fixed_csv_path = r'C:\Users\sagni\Downloads\GenAI Data Drift Firewall\GenAI Data Drift Firewall\data_drift_fixed_final.csv'
df.to_csv(fixed_csv_path, index=False)

print(f"\n✅ Fixed data saved to CSV: {fixed_csv_path}")



✅ Fixed data saved to CSV: C:\Users\sagni\Downloads\GenAI Data Drift Firewall\GenAI Data Drift Firewall\data_drift_fixed_final.csv


In [8]:
import pandas as pd
import joblib
import json
import os
import numpy as np

# === CONFIG ===
ARTIFACTS_DIR = "model_artifacts"

# === Load Model Artifacts ===
model = joblib.load(os.path.join(ARTIFACTS_DIR, "final_ml_model.pkl"))
scaler = joblib.load(os.path.join(ARTIFACTS_DIR, "standard_scaler.pkl"))
encoders = joblib.load(os.path.join(ARTIFACTS_DIR, "label_encoders.pkl"))

with open(os.path.join(ARTIFACTS_DIR, "model_features.json")) as f:
    expected_features = json.load(f)

with open(os.path.join(ARTIFACTS_DIR, "column_dtypes.json")) as f:
    expected_dtypes = json.load(f)

with open(os.path.join(ARTIFACTS_DIR, "classification_threshold.txt")) as f:
    threshold = float(f.read().strip())


# === Step 1: Derived Feature Creation ===
def add_derived_features(df):
    df = df.copy()
    if 'order_date_(DateOrders)' in df.columns:
        df['order_date_(DateOrders)'] = pd.to_datetime(df['order_date_(DateOrders)'], errors='coerce')
        df['is_holiday_week'] = df['order_date_(DateOrders)'].dt.isocalendar().week.isin([1, 52]).astype(int)
        df['order_day'] = df['order_date_(DateOrders)'].dt.weekday.astype('Int64')  # nullable integer dtype
    else:
        df['is_holiday_week'] = 0
        df['order_day'] = 0

    if 'shipping_date_(DateOrders)' in df.columns:
        df['shipping_date_(DateOrders)'] = pd.to_datetime(df['shipping_date_(DateOrders)'], errors='coerce')
        df['ship_month'] = df['shipping_date_(DateOrders)'].dt.month.astype('Int64')
    else:
        df['ship_month'] = 0

    return df


# === Step 2: Column Cleaning ===
def clean_columns(df):
    derived = ['is_holiday_week', 'order_day', 'ship_month']
    keep = set(expected_features + derived)
    return df[[col for col in df.columns if col in keep]]


# === Step 3: Schema Validation ===
def validate_schema(df):
    missing = set(expected_features) - set(df.columns)
    if missing:
        raise ValueError(f"❌ Missing required columns: {missing}")
    return df


# === Step 4: Full Preprocessing ===
def preprocess_input(df):
    df = df.copy()

    # Add derived features
    df = add_derived_features(df)

    # Clean columns - keep only expected + derived
    df = clean_columns(df)

    # Validate schema (missing columns)
    df = validate_schema(df)

    # Fill missing numeric values with mean or zero (depends on your domain)
    for col in df.columns:
        if df[col].dtype.kind in 'biufc':  # numeric types including nullable int/float
            if df[col].isnull().any():
                mean_val = df[col].mean()
                df[col] = df[col].fillna(mean_val if not np.isnan(mean_val) else 0)

    # Apply label encoding for categorical features
    for col, encoder in encoders.items():
        if col in df.columns:
            # Replace unseen categories with a placeholder if you have one, or drop rows, or raise error
            try:
                df[col] = encoder.transform(df[col])
            except ValueError as e:
                unseen = set(df[col].dropna().unique()) - set(encoder.classes_)
                raise ValueError(f"❌ Column '{col}' has unseen labels: {unseen}") from e

    # Enforce expected dtypes after encoding
    for col in expected_features:
        if col in df.columns:
            expected_dtype = expected_dtypes[col]
            try:
                df[col] = df[col].astype(expected_dtype)
            except Exception as e:
                raise ValueError(f"❌ Column '{col}' cannot be converted to {expected_dtype}: {e}")

    # Scale numeric columns only if they exist in df
    numeric_cols = [col for col in scaler.feature_names_in_ if col in df.columns]
    if numeric_cols:
        df[numeric_cols] = scaler.transform(df[numeric_cols])

    return df[expected_features]


# === Step 5: Prediction Pipeline ===
def predict_pipeline(df):
    try:
        print(f"DEBUG: Input type to predict_pipeline: {type(df)}")
        processed = preprocess_input(df)
        probs = model.predict_proba(processed)[:, 1]
        preds = (probs >= threshold).astype(int)
      
        return preds, probs
    except Exception as e:
        print(f"❌ Prediction Error: {e}")
        return None, None


# === MAIN FLOW ===
if __name__ == "__main__":
    # Assuming 'fixed_df' is loaded with your cleaned fixed CSV before this block
    fixed_df = pd.read_csv(r"C:\Users\sagni\Downloads\GenAI Data Drift Firewall\GenAI Data Drift Firewall\data_drift_fixed_final.csv")  # Replace with your actual fixed CSV path

    predictions, probabilities = predict_pipeline(fixed_df)

    if predictions is not None:
        output_df = fixed_df.copy()
        output_df["Predicted_Label"] = predictions
        output_df["Prediction_Prob"] = probabilities

        output_path = "predictions/final_predictions.csv"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        output_df.to_csv(output_path, index=False)
        print(f"✅ Predictions saved to: {output_path}")
    else:
        print("⚠️ No predictions generated.")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


DEBUG: Input type to predict_pipeline: <class 'pandas.core.frame.DataFrame'>
✅ Predictions saved to: predictions/final_predictions.csv
