In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest

# File schema validation is to be part of a separate notebook

# Load data from CSV: csv_path is the location passed to load_data_from_csv below.
# NOTE(review): presumably the same data as the table
# workspace.default.restaurant_sales_data - confirm.
csv_path = "file:/Workspace/Users/niranchanadevi.r@gmail.com/restaurant_sales_data.csv"

def load_data_from_csv(csv_path):
    """
    Read a CSV source into a pandas DataFrame.

    Parameters:
    - csv_path: Anything pandas.read_csv accepts as its first argument
      (the caller in this file passes a path string)

    Returns:
    - DataFrame parsed with pandas' default read_csv settings
    """
    sales_frame = pd.read_csv(csv_path)
    return sales_frame

# Load the sales data once; the validation steps below operate on this frame
df = load_data_from_csv(csv_path)
# A bare expression is only rendered when it is the last statement of a
# notebook cell; print the preview explicitly so it is shown here as well
print(df.head(10))

# Describe the schema - this should be separate, maybe in a different notebook

# Validate the column data

### Restaurant menu items validation
# Menu items considered valid; passed as valid_items to detect_new_menu_items
valid_menu_items = [
    'Burger',
    'Pizza',
    'Salad',
    'Pasta',
    'Soda',
]

# Function to detect new menu items
def detect_new_menu_items(df, column_name, valid_items, contamination=0.1):
    """
    Flag rows whose menu item is predicted anomalous by an Isolation Forest.

    Each distinct entry of valid_items is encoded as its position in sorted
    order; every other value (including missing values, which are first
    replaced by 'Unknown') shares the single code len(valid_items). An
    Isolation Forest is fitted on that one-column encoding and each row is
    labelled from the forest's prediction.

    NOTE(review): the 'New'/'Valid' label comes from the forest's outlier
    prediction, not from a membership test against valid_items, so a row can
    be labelled 'New' while its item is in valid_items (and vice versa).
    Confirm this is the intended definition of a "new" item.

    Parameters:
    - df: DataFrame containing the restaurant sales data (left unmodified;
      the function works on a copy)
    - column_name: Name of the column with menu items (e.g., 'Item_Name')
    - valid_items: List of predefined valid menu items
    - contamination: Passed to IsolationForest as the expected proportion of
      outlier rows (0 to 0.5)

    Returns:
    - Copy of df in which missing values of column_name are replaced by
      'Unknown' and a new column 'is_new_item' holds 'New' or 'Valid'.
      An empty df yields an empty 'is_new_item' column.
    """
    # Work on a copy so the caller's DataFrame is not silently altered
    result = df.copy()

    # Handle missing values in the column (if any)
    result[column_name] = result[column_name].fillna('Unknown')

    # Build the item -> code table once; codes follow the sorted order of the
    # distinct valid items, and one dictionary lookup per row replaces a
    # per-row encoder call with exception handling
    item_codes = {
        item: code for code, item in enumerate(sorted(set(valid_items)))
    }
    # Shared code for anything outside valid_items; len(valid_items) is never
    # smaller than the number of distinct valid items, so it cannot collide
    unseen_code = len(valid_items)
    encoded_values = [
        item_codes.get(value, unseen_code) for value in result[column_name]
    ]

    if encoded_values:
        # Single-feature matrix expected by Isolation Forest
        X = np.array(encoded_values).reshape(-1, 1)

        # Fixed random_state keeps repeated runs reproducible
        iso_forest = IsolationForest(contamination=contamination, random_state=42)
        iso_forest.fit(X)

        # -1 marks rows predicted as outliers, 1 marks inliers
        predictions = iso_forest.predict(X)
    else:
        # Isolation Forest cannot be fitted on zero rows; nothing to flag
        predictions = []

    # Add results to DataFrame
    result['is_new_item'] = ['New' if pred == -1 else 'Valid' for pred in predictions]

    # Print summary of detected new items (mask computed once, reused below)
    is_new = result['is_new_item'] == 'New'
    new_items = result.loc[is_new, column_name].unique()
    print(f"Contamination: {contamination}")
    print(f"Detected new menu items: {new_items}")
    print(f"Total new items detected: {int(is_new.sum())}")

    return result

# Try several contamination settings to compare how many rows get flagged
contamination_levels = [0.1, 0.2, 0.3]  # Adjust based on expected new items
separator = "-" * 50
for cont in contamination_levels:
    print(f"\nTesting with contamination={cont}")
    result_df = detect_new_menu_items(
        df,
        'Item',
        valid_menu_items,
        contamination=cont,
    )
    print(result_df.head())  # Show first few rows
    print(separator)

# Re-run with the chosen contamination level (e.g., 0.2) and write the
# flagged rows to a CSV
best_result_df = detect_new_menu_items(
    df,
    'Item',
    valid_menu_items,
    contamination=0.2,
)
best_result_df.to_csv('flagged_new_menu_items.csv', index=False)
print("Results saved to 'flagged_new_menu_items.csv'")

