In [1]:
"""
train_model.py - CORRECTED NAIVE BAYES MODEL TRAINING
Complete script with all visualizations and probability calculations
"""

import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("NAIVE BAYES MODEL TRAINING - Restaurant Order Prediction")
print("=" * 80)

# ============================================================================
# STEP 1: LOAD AND EXPLORE DATASET
# ============================================================================

# Local import: the path override below is the only place this script needs os.
import os

# Default location of the training CSV (the author's machine). Set the
# RESTAURANT_ORDERS_CSV environment variable to train from a different copy
# without editing this script; when it is unset the behaviour is unchanged.
DEFAULT_DATASET_PATH = "D:\\Rachith Bharadwaj T N 24BDS062\\2nd Year\\3rd SEM\\DHV Project\\restaurant-order-prediction\\backend\\restaurant_orders_500.csv"
dataset_path = os.environ.get("RESTAURANT_ORDERS_CSV", DEFAULT_DATASET_PATH)

print("\n[STEP 1] Loading Dataset...")
df = pd.read_csv(dataset_path)

# Quick sanity overview: size, a sample of rows, column dtypes, and how the
# target column 'OrderFood' is distributed.
print("✓ Dataset loaded successfully!")
print(f"  - Shape: {df.shape}")
print(f"  - Total samples: {len(df)}")
print("\n  First 5 rows:")
print(df.head())
print("\n  Data types:")
print(df.dtypes)
print("\n  Class Distribution:")
print(df['OrderFood'].value_counts())

# ============================================================================
# STEP 2: ENCODE CATEGORICAL FEATURES
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 2] Encoding Categorical Features...", rule, sep="\n")

# One independent encoder per column, so each keeps its own class vocabulary.
le_cuisine, le_time, le_weather, le_hunger, le_order = (
    LabelEncoder() for _unused in range(5)
)

# (source column, encoder, heading used in the printed summary)
encoding_plan = [
    ('Cuisine', le_cuisine, 'Cuisine'),
    ('TimeOfDay', le_time, 'TimeOfDay'),
    ('Weather', le_weather, 'Weather'),
    ('HungerLevel', le_hunger, 'HungerLevel'),
    ('OrderFood', le_order, 'OrderFood (Target)'),
]

# Fit every encoder and store the integer codes in a '<column>_encoded' column.
for source_column, encoder, heading in encoding_plan:
    df[f'{source_column}_encoded'] = encoder.fit_transform(df[source_column])

# Report the label -> code mapping learned by each encoder.
print("\n✓ Label Encoders Created:")
for source_column, encoder, heading in encoding_plan:
    print(f"\n  {heading}:")
    learned_codes = encoder.transform(encoder.classes_)
    print(f"    {dict(zip(encoder.classes_, learned_codes))}")

# ============================================================================
# STEP 3: PREPARE FEATURES AND TARGET
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 3] Preparing Features and Target...", rule, sep="\n")

# Model inputs are the four integer-encoded feature columns; the target is the
# integer-encoded OrderFood column.
encoded_feature_columns = [
    'Cuisine_encoded',
    'TimeOfDay_encoded',
    'Weather_encoded',
    'HungerLevel_encoded',
]
X = df[encoded_feature_columns]
y = df['OrderFood_encoded']

print(f"\n✓ Features shape: {X.shape}")
print(f"✓ Target shape: {y.shape}")
print("✓ Feature names: ['Cuisine', 'TimeOfDay', 'Weather', 'HungerLevel']")

# ============================================================================
# STEP 4: TRAIN NAIVE BAYES MODEL
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 4] Training Categorical Naive Bayes Model...", rule, sep="\n")

# fit() returns the estimator itself, so construction and training can chain.
model = CategoricalNB().fit(X, y)

# model.classes_ holds the encoded targets; le_order.classes_ holds the
# original string labels they stand for.
print("\n✓ Model trained successfully!")
print("  - Algorithm: Categorical Naive Bayes")
print(f"  - Number of classes: {len(model.classes_)}")
print(f"  - Classes: {model.classes_}")
print(f"  - Class labels: {le_order.classes_}")

# ============================================================================
# STEP 5: MODEL EVALUATION
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 5] Model Evaluation...", rule, sep="\n")

# NOTE: predictions are made on the same X the model was fitted on, so every
# figure reported below is a training-set metric, not a held-out estimate.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)

cm = confusion_matrix(y, y_pred)
accuracy = accuracy_score(y, y_pred)

print("\n✓ Model Performance:")
print(f"  - Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("\n✓ Confusion Matrix:")
print(f"  {cm}")
print("\n✓ Classification Report:")
report_text = classification_report(y, y_pred, target_names=le_order.classes_)
print(report_text)

# ============================================================================
# STEP 6: EXTRACT FEATURE PROBABILITIES (FIXED VERSION)
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 6] Extracting Feature Probabilities...", rule, sep="\n")

feature_names = ['Cuisine', 'TimeOfDay', 'Weather', 'HungerLevel']
class_names = le_order.classes_

print("\n✓ Feature Log Probabilities Structure:")
print(f"  - Type: {type(model.feature_log_prob_)}")
print(f"  - Length: {len(model.feature_log_prob_)} (one array per feature)")

# Resulting layout: prob_data[feature][class label] -> list of probabilities,
# taken row by row from that feature's entry in model.feature_log_prob_.
prob_data = {}

for feature_position, feature_label in enumerate(feature_names):
    print(f"\n  Processing {feature_label}:")

    # feature_log_prob_ is indexed by feature first; each entry is one array.
    log_prob_table = model.feature_log_prob_[feature_position]
    print(f"    - Shape: {log_prob_table.shape}")
    print(f"    - Type: {type(log_prob_table)}")

    # Undo the log so the stored values are plain probabilities.
    prob_table = np.exp(log_prob_table)

    # Row i of the table is paired with class_names[i].
    per_class_probs = {
        class_names[row]: prob_table[row].tolist()
        for row in range(len(class_names))
    }
    for class_label, class_probs in per_class_probs.items():
        print(f"    - {class_label}: {[f'{p:.4f}' for p in class_probs]}")

    prob_data[feature_label] = per_class_probs

# Persist the probability tables as JSON.
with open('probability_data.json', 'w') as json_file:
    json_file.write(json.dumps(prob_data, indent=2))

print("\n✓ Probability data saved to probability_data.json")

# ============================================================================
# STEP 7: SAVE MODEL AND ENCODERS
# ============================================================================

rule = "=" * 80
print("\n" + rule, "[STEP 7] Saving Model and Encoders...", rule, sep="\n")

# All five encoders travel together in one file, keyed by a short name, so the
# original string labels can be mapped to and from the model's integer codes.
encoders_by_name = dict(
    cuisine=le_cuisine,
    time=le_time,
    weather=le_weather,
    hunger=le_hunger,
    order=le_order,
)

joblib.dump(model, 'nb_model.pkl')
joblib.dump(encoders_by_name, 'encoders.pkl')

print("\n✓ Model saved: nb_model.pkl")
print("✓ Encoders saved: encoders.pkl")

# ============================================================================
# STEP 8: GENERATE VISUALIZATION PLOTS
# ============================================================================

print("\n" + "=" * 80)
print("[STEP 8] Generating Visualization Plots...")
print("=" * 80)

sns.set_style("whitegrid")
plt.rcParams['figure.facecolor'] = 'white'


def _plot_feature_vs_order(ax, data, column, title, xlabel, preferred_order=None):
    """Draw grouped bars of OrderFood counts for every value of ``column``.

    Args:
        ax: Matplotlib axes to draw on.
        data: DataFrame containing ``column`` and ``OrderFood``.
        column: Name of the categorical feature shown on the x-axis.
        title: Subplot title.
        xlabel: X-axis label.
        preferred_order: Optional display order for the categories. Values
            that occur in the data but are missing from this list are appended
            after the listed ones instead of being dropped from the chart.
    """
    counts = data.groupby([column, 'OrderFood']).size().unstack(fill_value=0)
    if preferred_order is not None:
        listed = [value for value in preferred_order if value in counts.index]
        unlisted = [value for value in counts.index if value not in preferred_order]
        counts = counts.reindex(listed + unlisted)

    counts.plot(kind='bar', ax=ax, color=['#ff6b6b', '#51cf66'], width=0.8)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel('Count', fontsize=12)
    ax.set_xlabel(xlabel, fontsize=12)
    # Legend entries come from the plotted columns (the OrderFood values), so
    # they cannot drift out of sync with the bars they describe.
    ax.legend(title='Order Food', fontsize=11)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)


# Plot 1: Feature Distribution (2x2 grid)
print("\n  1. Creating feature distribution plots...")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Feature Distribution by Order Status', fontsize=18, fontweight='bold', y=0.995)

# (axes, column, title, x-axis label, preferred category order or None)
distribution_panels = [
    (axes[0, 0], 'Cuisine', 'Cuisine Type vs Order Decision', 'Cuisine', None),
    (axes[0, 1], 'TimeOfDay', 'Time of Day vs Order Decision', 'Time of Day',
     ['Breakfast', 'Lunch', 'Dinner', 'LateNight']),
    (axes[1, 0], 'Weather', 'Weather vs Order Decision', 'Weather', None),
    (axes[1, 1], 'HungerLevel', 'Hunger Level vs Order Decision', 'Hunger Level',
     ['Low', 'Medium', 'High']),
]
for panel_ax, panel_column, panel_title, panel_xlabel, panel_order in distribution_panels:
    _plot_feature_vs_order(panel_ax, df, panel_column, panel_title, panel_xlabel, panel_order)

plt.tight_layout()
plt.savefig('feature_distribution.png', dpi=300, bbox_inches='tight')
print("     ✓ Saved: feature_distribution.png")
plt.close()

# Plot 2: Confusion Matrix Heatmap
print("  2. Creating confusion matrix heatmap...")
# cm was computed on the encoded targets, whose integer codes index into
# le_order.classes_; taking the tick labels from the encoder keeps the axes
# consistent with the classification report instead of assuming 'No'/'Yes'.
class_tick_labels = list(le_order.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_tick_labels,
            yticklabels=class_tick_labels,
            cbar_kws={'label': 'Count'},
            annot_kws={'fontsize': 16, 'fontweight': 'bold'},
            linewidths=2, linecolor='white')
plt.title('Confusion Matrix - Naive Bayes Model\nAccuracy: {:.2f}%'.format(accuracy*100),
          fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=13, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("     ✓ Saved: confusion_matrix.png")
plt.close()

# Plot 3: Class Distribution Pie Chart
print("  3. Creating class distribution pie chart...")
class_dist = df['OrderFood'].value_counts()

# value_counts() orders its rows by frequency, so the slice labels must be
# read from its index; a fixed ['Yes', 'No'] list mislabels both slices
# whenever 'No' is the more frequent class.
slice_labels = [str(label) for label in class_dist.index]

# Keep the green/red convention per class; fall back to matplotlib's default
# colour cycle if the target contains a label outside this palette.
class_palette = {'Yes': '#51cf66', 'No': '#ff6b6b'}
if all(label in class_palette for label in slice_labels):
    colors = [class_palette[label] for label in slice_labels]
else:
    colors = None

# One offset per slice, so the call stays valid for any number of classes.
explode = [0.05] * len(class_dist)

plt.figure(figsize=(10, 8))
plt.pie(class_dist, labels=slice_labels, autopct='%1.1f%%', colors=colors,
        startangle=90, explode=explode, shadow=True,
        textprops={'fontsize': 14, 'fontweight': 'bold'})
plt.title('Order Food Class Distribution\nTotal Samples: {}'.format(len(df)),
          fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
print("     ✓ Saved: class_distribution.png")
plt.close()

# Plot 4: Feature Importance
print("  4. Creating feature importance plot...")

# Importance score per feature = variance over all entries of that feature's
# log-probability table (note: computed on log-probabilities, not on
# probabilities).
feature_importance = [
    np.var(model.feature_log_prob_[position])
    for position in range(len(feature_names))
]

bar_colors = ['#667eea', '#764ba2', '#51cf66', '#ff6b6b']
plt.figure(figsize=(10, 6))
bars = plt.barh(feature_names, feature_importance, color=bar_colors)
plt.xlabel('Probability Variance (Importance)', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
plt.title('Feature Importance in Naive Bayes Model', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)

# Write each score just past the end of its bar.
for bar_row, score in enumerate(feature_importance):
    plt.text(score, bar_row, f' {score:.4f}', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
print("     ✓ Saved: feature_importance.png")
plt.close()

# ============================================================================
# STEP 9: SAMPLE PREDICTIONS
# ============================================================================

print("\n" + "=" * 80)
print("[STEP 9] Sample Predictions...")
print("=" * 80)

# Hand-picked scenarios; every value must be a label the encoders saw during
# fitting, otherwise LabelEncoder.transform raises ValueError.
test_cases = [
    {'cuisine': 'Italian', 'time': 'Lunch', 'weather': 'Cloudy', 'hunger': 'High'},
    {'cuisine': 'Chinese', 'time': 'Breakfast', 'weather': 'Sunny', 'hunger': 'Low'},
    {'cuisine': 'Mexican', 'time': 'LateNight', 'weather': 'Rainy', 'hunger': 'High'},
    {'cuisine': 'Indian', 'time': 'Dinner', 'weather': 'Sunny', 'hunger': 'Medium'},
]

for i, case in enumerate(test_cases, 1):
    cuisine_idx = le_cuisine.transform([case['cuisine']])[0]
    time_idx = le_time.transform([case['time']])[0]
    weather_idx = le_weather.transform([case['weather']])[0]
    hunger_idx = le_hunger.transform([case['hunger']])[0]

    # The model was fitted on a DataFrame, so query it with the same column
    # names; a bare ndarray triggers scikit-learn's feature-name mismatch
    # warning, which the global warnings filter would otherwise hide.
    X_test = pd.DataFrame(
        [[cuisine_idx, time_idx, weather_idx, hunger_idx]],
        columns=X.columns,
    )
    pred = model.predict(X_test)[0]
    proba = model.predict_proba(X_test)[0]

    print(f"\n  Test Case {i}:")
    print(f"    Input: {case['cuisine']}, {case['time']}, {case['weather']}, {case['hunger']}")
    print(f"    Prediction: {le_order.classes_[pred]}")
    # predict_proba columns follow model.classes_ (the encoded targets), so
    # label each probability via the encoder rather than by assumed position.
    for encoded_class, class_proba in zip(model.classes_, proba):
        proba_label = f"Prob({le_order.classes_[encoded_class]}):"
        print(f"    {proba_label:<12}{class_proba:.4f} ({class_proba*100:.2f}%)")
    print(f"    Confidence: {max(proba)*100:.2f}%")

# ============================================================================
# COMPLETION MESSAGE
# ============================================================================

rule = "=" * 80

# Closing summary: the artifacts written by this script and how to start the
# backend that uses them. Each entry is printed on its own line.
closing_lines = [
    "\n" + rule,
    "✓ TRAINING COMPLETE!",
    rule,
    "\nGenerated Files:",
    "  ✓ nb_model.pkl - Trained Naive Bayes model",
    "  ✓ encoders.pkl - Label encoders for all features",
    "  ✓ probability_data.json - Feature probability distributions",
    "  ✓ feature_distribution.png - Feature analysis (2x2 grid)",
    "  ✓ confusion_matrix.png - Model performance heatmap",
    "  ✓ class_distribution.png - Class balance pie chart",
    "  ✓ feature_importance.png - Feature importance bar chart",
    "\nNext Steps:",
    "  1. Ensure Flask backend (app.py) is in the same directory",
    "  2. Run: python app.py",
    "  3. Open: http://localhost:5000",
    rule,
]
for closing_line in closing_lines:
    print(closing_line)


NAIVE BAYES MODEL TRAINING - Restaurant Order Prediction

[STEP 1] Loading Dataset...
✓ Dataset loaded successfully!
  - Shape: (500, 5)
  - Total samples: 500

  First 5 rows:
   Cuisine  TimeOfDay Weather HungerLevel OrderFood
0  Italian      Lunch  Cloudy        High       Yes
1  Italian     Dinner   Sunny      Medium       Yes
2  Mexican     Dinner   Sunny        High       Yes
3  Mexican  Breakfast   Rainy        High       Yes
4   Indian      Lunch   Sunny         Low        No

  Data types:
Cuisine        object
TimeOfDay      object
Weather        object
HungerLevel    object
OrderFood      object
dtype: object

  Class Distribution:
OrderFood
Yes    250
No     250
Name: count, dtype: int64

[STEP 2] Encoding Categorical Features...

✓ Label Encoders Created:

  Cuisine:
    {'Chinese': np.int64(0), 'Indian': np.int64(1), 'Italian': np.int64(2), 'Mexican': np.int64(3)}

  TimeOfDay:
    {'Breakfast': np.int64(0), 'Dinner': np.int64(1), 'LateNight': np.int64(2), 'Lunch': np.int