# Intent Classification Model Training - FINAL OPTIMIZED

Trains an **SVM classifier** on your full GeniSys + Fitness intent dataset.

**✅ Features:**
- Loads `Big_intent.json` from root or `/data`
- Handles all 30+ intents (system + fitness)
- Preserves typos & spoken forms
- Safe train/test split with fallback
- Saves models to `/content/models/`

In [11]:
import json
import pickle
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter


In [12]:
# === FIND DATA FILE ===
def find_data_file(candidates=None):
    """Return the first existing path to the intent dataset.

    Args:
        candidates: Optional iterable of file paths to probe, in priority
            order. When omitted, the default locations listed below are used.

    Returns:
        str: The first candidate for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    if candidates is None:
        candidates = [
            "/Big_intent.json",
            "data/Big_intent.json",
            "/content/Big_intent.json",
            "/content/data/Big_intent.json",
            # Copy in the current working directory; probed last so the
            # priority of the locations above is unchanged.
            "Big_intent.json",
        ]
    for path in candidates:
        if os.path.exists(path):
            return path
    raise FileNotFoundError("‚ùå Big_intent.json not found. Please upload it to Colab root or /data/")

# Resolve the dataset first: if it is missing the cell aborts here, before
# the output directory is created.
DATA_PATH = find_data_file()

# Directory the trained artifacts are pickled into later in the notebook.
MODEL_DIR = "/content/models"
os.makedirs(MODEL_DIR, exist_ok=True)

print(f"‚úÖ Loading data from: {DATA_PATH}")
with open(DATA_PATH, mode="r", encoding="utf-8") as f:
    raw_data = json.loads(f.read())

# Everything downstream reads only the top-level "intents" entry.
if not ("intents" in raw_data):
    raise ValueError("‚ùå JSON must contain top-level 'intents' key")

intents_list = raw_data["intents"]
print(f"üìÇ Loaded {len(intents_list)} intents")

‚úÖ Loading data from: /content/Big_intent.json
üìÇ Loaded 20 intents


In [13]:
# === BUILD CORPUS ===
# Flatten the intent objects into two parallel lists: one phrase per entry in
# `corpus`, with its intent tag at the same index in `labels`.
corpus = []
labels = []

for intent_obj in intents_list:
    # Non-dict entries, and entries lacking a tag or phrases, contribute nothing.
    if isinstance(intent_obj, dict):
        intent_tag = intent_obj.get("intent")
        text_list = intent_obj.get("text", [])

        if intent_tag and text_list:
            usable_phrases = [
                " ".join(phrase.split())  # normalize whitespace only
                for phrase in text_list
                if isinstance(phrase, str) and phrase.strip()
            ]
            corpus.extend(usable_phrases)
            labels.extend([intent_tag] * len(usable_phrases))

print(f"üìä Total phrases: {len(corpus)}")
print(f"üéØ Unique intents: {len(set(labels))}")

# Show distribution, largest intent first (ties keep first-seen order)
label_counts = Counter(labels)
for intent, count in label_counts.most_common():
    print(f"  {intent}: {count}")

üìä Total phrases: 2658
üéØ Unique intents: 20
  healthy_habit_building: 165
  plateau_breaking: 155
  senior_fitness: 155
  pregnancy_fitness: 154
  time_efficient_workout: 152
  muscle_gain_query: 148
  cardio_vs_weights: 141
  injury_recovery_advice: 137
  weight_loss_plan: 136
  body_part_focus: 136
  macro_tracking: 131
  sleep_optimization: 130
  progress_check: 125
  supplement_guidance: 123
  calorie_target: 121
  fitness_motivation: 116
  meal_recommendation: 115
  hydration_reminder: 112
  bmi_calculation: 105
  workout_plan: 101


In [14]:
# === HANDLE SMALL CLASSES FOR STRATIFICATION ===
# Intents with fewer than `min_samples` phrases are dropped from the data
# used for the split.
min_samples = 2
valid_labels = set(lbl for lbl in label_counts if label_counts[lbl] >= min_samples)

kept_pairs = [(text, tag) for text, tag in zip(corpus, labels) if tag in valid_labels]
X_corpus = [text for text, _ in kept_pairs]
y_labels = [tag for _, tag in kept_pairs]

removed = len(corpus) - len(X_corpus)
if removed > 0:
    print(f"\n‚ö†Ô∏è Removed {removed} samples from rare intents (<{min_samples} examples)")

In [15]:
# === VECTORIZE ===
print("\nüî§ Vectorizing with TF-IDF...")

# NOTE(review): the vectorizer is fitted on every phrase in X_corpus, i.e.
# before the train/test split in the next cell, so the vocabulary and IDF
# weights also reflect phrases that end up in the test set.
tfidf_options = dict(
    ngram_range=(1, 3),       # unigrams through trigrams
    max_features=2000,
    min_df=1,
    max_df=0.9,
    sublinear_tf=True,
    strip_accents='unicode',
    lowercase=False,          # keep "PCOS", "iPhone", etc.
)
vectorizer = TfidfVectorizer(**tfidf_options)

X = vectorizer.fit_transform(X_corpus)
y = np.array(y_labels)
print(f"‚úÖ Feature matrix: {X.shape}")


üî§ Vectorizing with TF-IDF...
‚úÖ Feature matrix: (2658, 2000)


In [16]:
# === TRAIN/TEST SPLIT ===
# Settings shared by the stratified attempt and the plain fallback.
split_options = {"test_size": 0.2, "random_state": 42}

try:
    # Preferred: keep the per-intent proportions equal in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
except ValueError:
    # Stratification was rejected; fall back to an unstratified split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
    print("‚ö†Ô∏è Used random split (stratification failed)")

print(f"üßÆ Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")

üßÆ Train: 2126 | Test: 532


In [17]:
# === TRAIN MODEL ===
print("\nüèãÔ∏è Training SVM...")
model = SVC(
    C=10,
    kernel='rbf',
    gamma='scale',
    probability=True,   # later cells call predict_proba on this model
    random_state=42,
)
model.fit(X_train, y_train)
print("‚úÖ Training complete!")


üèãÔ∏è Training SVM...
‚úÖ Training complete!


In [18]:
# === EVALUATE ===
y_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)

# The number of stratified folds is limited by how many samples the rarest
# training class has (not by how many classes exist), and cross-validation
# needs at least 2 folds. Cap at 5 as before.
_, train_class_counts = np.unique(y_train, return_counts=True)
n_folds = max(2, min(5, int(train_class_counts.min())))
cv_scores = cross_val_score(model, X_train, y_train, cv=n_folds)

print(f"\nüìà Cross-Val: {cv_scores.mean()*100:.2f}% ¬± {cv_scores.std()*100:.2f}%")
print(f"üéØ Test Acc: {test_acc*100:.2f}%")

print("\nüìã Classification Report:")
# zero_division=0 reports 0 instead of warning for classes never predicted.
print(classification_report(y_test, y_pred, zero_division=0))


üìà Cross-Val: 84.38% ¬± 0.48%
üéØ Test Acc: 84.59%

üìã Classification Report:
                        precision    recall  f1-score   support

       bmi_calculation       1.00      1.00      1.00        21
       body_part_focus       0.86      0.89      0.87        27
        calorie_target       0.84      0.88      0.86        24
     cardio_vs_weights       0.93      0.93      0.93        28
    fitness_motivation       0.95      0.87      0.91        23
healthy_habit_building       0.73      0.91      0.81        33
    hydration_reminder       0.95      0.91      0.93        22
injury_recovery_advice       0.90      0.68      0.78        28
        macro_tracking       0.91      0.77      0.83        26
   meal_recommendation       0.74      0.74      0.74        23
     muscle_gain_query       0.87      0.87      0.87        30
      plateau_breaking       0.73      0.77      0.75        31
     pregnancy_fitness       0.91      0.97      0.94        31
        progress_ch

In [19]:
# === SAMPLE PREDICTIONS ===
# Smoke test on a few hand-written phrases.
test_phrases = [
    "Hi there",
    "I want to lose 10 pounds",
    "What should I eat after workout?",
    "Tell me a joke",
    "How much water should I drink?",
    "Bye"
]

# Column position of each class label in predict_proba's output.
class_index = {label: idx for idx, label in enumerate(model.classes_)}

print("\nüîç Sample Predictions:")
for phrase in test_phrases:
    vec = vectorizer.transform([phrase])
    pred = model.predict(vec)[0]
    # With SVC(probability=True) the most probable class can differ from the
    # class returned by predict(), so report the probability of the label
    # that is actually printed rather than the row maximum.
    prob = model.predict_proba(vec)[0][class_index[pred]]
    print(f"'{phrase}' ‚Üí {pred} ({prob:.1%})")


üîç Sample Predictions:
'Hi there' ‚Üí plateau_breaking (9.6%)
'I want to lose 10 pounds' ‚Üí weight_loss_plan (90.4%)
'What should I eat after workout?' ‚Üí workout_plan (34.1%)
'Tell me a joke' ‚Üí fitness_motivation (98.1%)
'How much water should I drink?' ‚Üí hydration_reminder (98.9%)
'Bye' ‚Üí plateau_breaking (9.6%)


In [20]:
# === SAVE MODELS ===
# Pickle the classifier and the fitted vectorizer side by side in MODEL_DIR;
# both are needed together at inference time.
for artifact_name, artifact in (
    ("intent_model.pkl", model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(os.path.join(MODEL_DIR, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

print(f"\nüíæ Models saved to: {MODEL_DIR}/")
print("‚úÖ Model 1 (Intent Classifier) is ready for deployment!")


üíæ Models saved to: /content/models/
‚úÖ Model 1 (Intent Classifier) is ready for deployment!


In [23]:
import json
import pickle
import numpy as np
import os

# Module-level cache so the pickled model and vectorizer are read from disk
# once and reused by every later classification call.
_cached_model = None
_cached_vectorizer = None

def load_intent_model(model_dir="models"):
    """
    Load and cache the intent classification model and vectorizer.
    This is called automatically by classify_user_intent if needed.

    Both artifacts are read into local variables first and the cache is
    only updated once both loads succeed, so a failure never leaves the
    cache holding a model and a vectorizer from different loads.

    Args:
        model_dir (str): Directory containing "intent_model.pkl" and
            "tfidf_vectorizer.pkl".

    Returns:
        bool: True if models were loaded successfully, False otherwise
    """
    global _cached_model, _cached_vectorizer

    try:
        model_path = os.path.join(model_dir, "intent_model.pkl")
        vectorizer_path = os.path.join(model_dir, "tfidf_vectorizer.pkl")

        if not (os.path.exists(model_path) and os.path.exists(vectorizer_path)):
            print(f"‚ùå Model files not found in {model_dir}/. Please train the model first.")
            return False

        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point model_dir at artifacts produced by this notebook
        # (or another trusted source).
        with open(model_path, "rb") as f:
            loaded_model = pickle.load(f)

        with open(vectorizer_path, "rb") as f:
            loaded_vectorizer = pickle.load(f)

    except Exception as e:
        # Best-effort loader: report the problem and leave the cache untouched.
        print(f"‚ùå Error loading model: {e}")
        return False

    # Commit both artifacts together.
    _cached_model, _cached_vectorizer = loaded_model, loaded_vectorizer
    print("‚úÖ Intent classification model loaded successfully!")
    return True

def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent plain Python object."""
    return value.item() if isinstance(value, np.generic) else value


def classify_user_intent(user_input, confidence_threshold=0.70):
    """
    Classify the intent of user input text.
    This is the main function to call from your chatbot system.

    Args:
        user_input (str): The user's message text
        confidence_threshold (float): Minimum confidence for the result to
            be flagged as confident. Defaults to 0.70.

    Returns:
        dict: On success, classification results with keys:
            - success (bool): True
            - intent (str): Predicted intent name (the model's predict() label)
            - confidence (float): Probability assigned to that intent (0.0 to 1.0)
            - input_text (str): Whitespace-normalized input text
            - alternatives (list): Up to 3 {"intent", "confidence"} dicts; the
              predicted intent first, then the other intents by descending
              probability
            - is_confident (bool): confidence >= confidence_threshold
        On failure, a dict with:
            - success (bool): False
            - error (str): Description of what went wrong

        All values are plain Python objects (no NumPy scalars).
    """
    # Validate before touching the model so bad input never triggers a load
    # and whitespace-only text is not classified as an empty vector.
    if not isinstance(user_input, str) or not user_input.strip():
        return {
            "success": False,
            "error": "Invalid input: must be a non-empty string"
        }

    # Load model if not already loaded
    if _cached_model is None or _cached_vectorizer is None:
        if not load_intent_model():
            return {
                "success": False,
                "error": "Model not loaded. Please train the intent classifier first."
            }

    try:
        # Preprocess the text (normalize whitespace only)
        clean_text = " ".join(user_input.split())

        # Vectorize the input text
        text_vec = _cached_vectorizer.transform([clean_text])

        classes = np.asarray(_cached_model.classes_)
        predicted_intent = _cached_model.predict(text_vec)[0]
        confidence_scores = _cached_model.predict_proba(text_vec)[0]

        # predict() and predict_proba() of a probability-calibrated SVC can
        # disagree on the top class. Keep predict()'s label and report the
        # probability of *that* label so intent, confidence and
        # alternatives[0] always describe the same class.
        matches = np.flatnonzero(classes == predicted_intent)
        if matches.size:
            predicted_idx = int(matches[0])
        else:
            predicted_idx = int(np.argmax(confidence_scores))
        confidence = float(confidence_scores[predicted_idx])

        # Predicted intent first, then the remaining intents by probability.
        by_probability = [int(i) for i in np.argsort(confidence_scores)[::-1]]
        ranked = [predicted_idx] + [i for i in by_probability if i != predicted_idx]
        alternatives = [
            {
                "intent": _to_builtin(classes[i]),
                "confidence": float(confidence_scores[i])
            }
            for i in ranked[:3]
        ]

        return {
            "success": True,
            "intent": _to_builtin(predicted_intent),
            "confidence": confidence,
            "input_text": clean_text,
            "alternatives": alternatives,
            "is_confident": bool(confidence >= confidence_threshold)
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Classification error: {str(e)}"
        }

# Example usage function (for testing/demo purposes only)
def test_intent_classification():
    """
    Test function to demonstrate how to use classify_user_intent.
    This is not part of the core functionality - just for testing.
    """
    print("=" * 60)
    print("üîç INTENT CLASSIFICATION TEST FUNCTION")
    print("=" * 60)

    # Test with various inputs
    test_inputs = [
        "Hello there!",
        "I want to lose 10 pounds in 2 months",
        "How do I build more muscle?",
        "What time is it?",
        "I feel like giving up on my fitness journey"
    ]

    for user_input in test_inputs:
        print(f"\nüë§ USER: \"{user_input}\"")
        result = classify_user_intent(user_input)

        if result["success"]:
            print(f"üéØ INTENT: {result['intent']} ({result['confidence']:.1%} confidence)")
            if not result["is_confident"]:
                print("  ‚ö†Ô∏è Low confidence prediction")
                print("  üí° Possible alternatives:")
                for alt in result["alternatives"][1:3]:  # Show top 2 alternatives
                    print(f"    ‚Ä¢ {alt['intent']} ({alt['confidence']:.1%})")
        else:
            print(f"‚ùå ERROR: {result['error']}")

        print("-" * 60)

    # Interactive test mode
    print("\nüí¨ Try your own input (type 'exit' to quit):")
    while True:
        user_input = input("\nüë§ You: ").strip()
        if user_input.lower() in ['exit', 'quit', 'q']:
            break

        result = classify_user_intent(user_input)
        if result["success"]:
            print(f"üéØ BOT DETECTED INTENT: {result['intent']} ({result['confidence']:.1%} confidence)")
            if not result["is_confident"]:
                print("  ‚ö†Ô∏è Note: Low confidence prediction")
        else:
            print(f"‚ùå ERROR: {result['error']}")

# Guard so that importing this code as a module does not start the demo.
# NOTE: in a notebook kernel __name__ is "__main__", so executing this cell
# always runs test_intent_classification(), including its input() loop.
if __name__ == "__main__":
    test_intent_classification()

üîç INTENT CLASSIFICATION TEST FUNCTION

üë§ USER: "Hello there!"
‚úÖ Intent classification model loaded successfully!
üéØ INTENT: plateau_breaking (9.6% confidence)
  ‚ö†Ô∏è Low confidence prediction
  üí° Possible alternatives:
    ‚Ä¢ muscle_gain_query (8.1%)
    ‚Ä¢ injury_recovery_advice (7.5%)
------------------------------------------------------------

üë§ USER: "I want to lose 10 pounds in 2 months"
üéØ INTENT: weight_loss_plan (98.6% confidence)
------------------------------------------------------------

üë§ USER: "How do I build more muscle?"
üéØ INTENT: muscle_gain_query (99.0% confidence)
------------------------------------------------------------

üë§ USER: "What time is it?"
üéØ INTENT: time_efficient_workout (22.0% confidence)
  ‚ö†Ô∏è Low confidence prediction
  üí° Possible alternatives:
    ‚Ä¢ injury_recovery_advice (9.7%)
    ‚Ä¢ progress_check (8.9%)
------------------------------------------------------------

üë§ USER: "I feel like giving up on my

KeyboardInterrupt: Interrupted by user