In [None]:
# !pip install opencv-python face_recognition
# !pip install SpeechRecognition pyaudio
# !pip install pyttsx3
# !pip install groq
# !pip install gtts playsound
! pip install deepface

# Load Modules

In [1]:
# Core Libraries
import os
import time
import threading
import platform
from collections import deque, Counter
from typing import Optional, Tuple

# Vision: OpenCV and Face Recognition
import cv2
import face_recognition
import numpy as np

# Audio: ASR and TTS
import speech_recognition as sr
import pyttsx3

# Conversational AI
from groq import Groq

# Used in the original notebook for TTS, can be kept for testing
from gtts import gTTS
import playsound
import uuid

# Face Recognition 

In [2]:
class SimpleFacerec:
    def __init__(self):
        self.known_face_encodings = []
        self.known_face_names = []
        # Resize frame for faster processing
        self.frame_resizing = 0.25

    def load_encoding_images(self, dataset_path):
        """
        Load face encodings from a dataset folder.
        The folder should contain subfolders named after each person.
        """
        print("Loading known face encodings...")
        for person_name in os.listdir(dataset_path):
            person_folder = os.path.join(dataset_path, person_name)
            if not os.path.isdir(person_folder):
                continue

            for filename in os.listdir(person_folder):
                if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                    img_path = os.path.join(person_folder, filename)
                    img = face_recognition.load_image_file(img_path)
                    face_enc_list = face_recognition.face_encodings(img)

                    if len(face_enc_list) > 0:
                        self.known_face_encodings.append(face_enc_list[0])
                        self.known_face_names.append(person_name)
                    else:
                        print(f"[WARN] No face found in {filename} for {person_name}")
        print(f"Loaded {len(self.known_face_encodings)} face encodings.")

    def detect_known_faces(self, frame):
        """Detect and recognize faces in a single frame."""
        small_frame = cv2.resize(frame, (0, 0), fx=self.frame_resizing, fy=self.frame_resizing)
        rgb_small_frame = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB)

        face_locations = face_recognition.face_locations(rgb_small_frame)
        face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

        face_names = []
        for face_encoding in face_encodings:
            matches = face_recognition.compare_faces(self.known_face_encodings, face_encoding)
            name = "Unknown"

            face_distances = face_recognition.face_distance(self.known_face_encodings, face_encoding)
            if len(face_distances) > 0:
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = self.known_face_names[best_match_index]
            face_names.append(name)

        # Rescale face locations to original frame size
        face_locations = np.array(face_locations) / self.frame_resizing
        return face_locations.astype(int), face_names

In [21]:
def start_recog(sfr: SimpleFacerec):
    """
    Activates the webcam to perform face recognition and returns the confirmed identity.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Cannot open camera")
        return "Unknown"

    detection_counter = 0
    recent_detections = deque(maxlen=5)
    confirmed_person = "Unknown"

    print("Starting face recognition...")
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame from camera.")
            break

        # Process every 15th frame to save resources
        if detection_counter % 15 == 0:
            face_locations, face_names = sfr.detect_known_faces(frame)

            if face_names:
                # Use the most prominent detected face
                detected_name = face_names[0]
                recent_detections.append(detected_name)
                print(f"[DEBUG] Detection: {detected_name}, History: {list(recent_detections)}")

                name_counts = Counter(recent_detections)
                most_common, freq = name_counts.most_common(1)[0]

                # Confirm identity if detected consistently (4 out of 5 times)
                if freq >= 4:
                    confirmed_person = most_common
                    print(f"\n✅ Confirmed identity: {confirmed_person}")
                    break

        # Stop after ~7 seconds if no stable recognition
        if detection_counter > 100:
            print("\n⚠️ Max detections reached — stability not achieved.")
            break
            
        detection_counter += 1

        # Display video feed with detections
        for face_loc, name in zip(face_locations, face_names):
            # print("Running face recognition...")
            y1, x2, y2, x1 = face_loc
            cv2.putText(frame, name, (x1, y1 - 10), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 200), 2)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 200), 4)
        cv2.imshow("Face Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    time.sleep(2)
    cap.release()
    cv2.destroyAllWindows()
    print(f"\nFinal recognized person: {confirmed_person}")
    
    return confirmed_person

# Automatic Speech Recognition

In [22]:
# asr.py
# pip install SpeechRecognition pyaudio
import time
from typing import Optional, Tuple
import speech_recognition as sr

class SpeechASR:
    def __init__(
        self,
        device_index: Optional[int] = None,
        language: str = "en-IN",
        calibration_seconds: float = 1.0,
        energy_threshold: int = 4000,
        dynamic_energy: bool = True,
        phrase_time_limit: float = 7.0,
        listen_timeout: float = 5.0,
    ):
        self.r = sr.Recognizer()
        self.r.energy_threshold = energy_threshold
        self.r.dynamic_energy_threshold = dynamic_energy
        self.language = language
        self.device_index = device_index
        self.calibration_seconds = calibration_seconds
        self.phrase_time_limit = phrase_time_limit
        self.listen_timeout = listen_timeout

    @staticmethod
    def list_microphones():
        return sr.Microphone.list_microphone_names() # returns all mic available

    def _capture_once(self) -> Optional[sr.AudioData]:
        with sr.Microphone(device_index=self.device_index) as source:
            # Calibrate noise each turn for robustness
            self.r.adjust_for_ambient_noise(source, duration=self.calibration_seconds)
            try:
                audio = self.r.listen(
                    source,
                    timeout=self.listen_timeout,
                    phrase_time_limit=self.phrase_time_limit,
                )
                return audio
            except sr.WaitTimeoutError:
                return None

    def _recognize(self, audio: Optional[sr.AudioData]) -> str:
        if not audio:
            return ""
        try:
            return self.r.recognize_google(audio, language=self.language).strip()
        except sr.UnknownValueError:
            return ""
        except sr.RequestError:
            # Network or quota error
            return ""

    def listen_for_keyword(self, keyword: str = "guard my room", window_seconds: float = 20.0) -> Tuple[bool, str]:
        deadline = time.time() + window_seconds
        heard_all = []
        while time.time() < deadline:
            audio = self._capture_once() 
            text = self._recognize(audio).lower() # lowercase for easier matching
            if text:
                print(f"[ASR] Heard: {text}")
                heard_all.append(text)
                if keyword.lower() in text:
                    return True, text
        # Return everything heard to aid debugging
        return False, " | ".join(heard_all)

    def transcribe_once(self, listen_seconds: Optional[float] = None) -> str:
        if listen_seconds is not None:
            old_limit = self.phrase_time_limit
            self.phrase_time_limit = listen_seconds
        audio = self._capture_once()
        text = self._recognize(audio)
        if listen_seconds is not None:
            self.phrase_time_limit = old_limit
        return text # transcribtion for better clarity what the machine heard and interpreted
    

# Driver Re-allocation to bypass Conflict

In [None]:
def _get_driver_name():
    system = platform.system().lower()
    if system == "windows":
        return "sapi5"
    elif system == "darwin":
        return "nsss"
    elif system == "linux":
        return "espeak"
    return None

def _speak_worker(text, rate_delta=0, volume=1.0, voice_name_substr=None):
    """Worker function that runs in its own thread with a fresh engine."""
    try:
        engine = pyttsx3.init(driverName=_get_driver_name())
        
        # Configure voice settings
        base_rate = engine.getProperty("rate")
        engine.setProperty("rate", max(80, base_rate + int(rate_delta)))
        engine.setProperty("volume", max(0.0, min(1.0, float(volume))))
        
        if voice_name_substr:
            voices = engine.getProperty("voices")
            for v in voices or []:
                if voice_name_substr.lower() in (v.name or "").lower():
                    engine.setProperty("voice", v.id)
                    break
        
        engine.say(str(text))
        engine.runAndWait()  # Blocks this thread until speech completes
    except Exception as e:
        print(f"TTS error: {e}")

class TTSVoice:
    def __init__(self, rate_delta=0, volume=1.0, voice_name_substr=None):
        self.rate_delta = int(rate_delta)
        self.volume = float(volume)
        self.voice_name_substr = voice_name_substr
        self.last_thread = None
    
    # we are doing forced multi threading for TTS to avoid blocking main thread and smooth interaction
    def say(self, text, block=True):
        """Speak text in a separate thread. If block=True, wait for completion."""
        if not text or not str(text).strip():
            return
            
        # Wait for previous speech to finish if block was True before
        if self.last_thread and self.last_thread.is_alive():
            self.last_thread.join()
        
        # Start new speech thread
        thread = threading.Thread(
            target=_speak_worker,
            args=(text, self.rate_delta, self.volume, self.voice_name_substr),
            daemon=True
        )
        thread.start()
        
        if block:
            thread.join()  # Wait for this utterance to complete
        else:
            self.last_thread = thread  # Track for next call
    
    def wait_until_done(self):
        """Block until the last speech thread completes."""
        if self.last_thread and self.last_thread.is_alive():
            self.last_thread.join()


In [23]:
DEFAULT_MODEL = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile") # using 70B model availbale in Groq_console

# Prompt to guide the LLM's behavior
SYSTEM_POLICY = (
    "You are an AI room guard. Use a 3-level escalation policy: "
    "Level 1: politely ask name and purpose; "
    "Level 2: warn and request leaving if unauthorized; "
    "Level 3: stern warning: monitoring and logging. "
    "Keep each reply calm, firm, and <= 20 words."
)

FALLBACK_REPLY = "Identify yourself and your purpose. This area is monitored."

class GroqLLM:
    def __init__(self, model: str = DEFAULT_MODEL, api_key: Optional[str] = None):
        api_key = api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise RuntimeError("GROQ_API_KEY not set")
        self.client = Groq(api_key=api_key)
        self.model = model

    def generate(self, user_text: str, level: int = 1) -> str:
        messages = [
            {"role": "system", "content": f"{SYSTEM_POLICY} Current escalation level={level}."},
            {"role": "user", "content": user_text or "(silence)"},
        ]
        try:
            rsp = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=0.3,
                max_tokens=60,
            )
        except Exception as e:
            print(f"[LLM] API error: {e}")
            return FALLBACK_REPLY

        if not rsp or not getattr(rsp, "choices", None):
            print("[LLM] No choices in response")
            return FALLBACK_REPLY

        choice = rsp.choices[0]
        finish = getattr(choice, "finish_reason", None)
        msg = getattr(choice, "message", None)
        text = (getattr(msg, "content", None) or "").strip() if msg else ""

        # Handle tool calls or empty content
        tool_calls = getattr(msg, "tool_calls", None) if msg else None
        if not text:
            print(f"[LLM] Empty content. finish_reason={finish}, tool_calls={bool(tool_calls)}")
            # Retry once with a simpler directive
            try:
                retry = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "Reply in <= 20 words."},
                        {"role": "user", "content": user_text or "(silence)"},
                    ],
                    temperature=0.2, 
                    max_tokens=40,
                )
                if retry and retry.choices:
                    rmsg = retry.choices[0].message
                    rtext = (getattr(rmsg, "content", None) or "").strip()
                    if rtext:
                        return rtext
            except Exception as e2:
                print(f"[LLM] Retry error: {e2}")
            return FALLBACK_REPLY

        if finish and finish != "stop":
            print(f"[LLM] finish_reason={finish}")
        return text

# ---------- Interactive CLI guard for quick testing ----------
def _beep_final(): # BEEP AFTER LEVEL 3 INTERACTION FAILS
    # Windows beep; on non-Windows this falls back to a bell character
    try:
        import winsound
        winsound.Beep(1000, 600)  # 1000 Hz for 600 ms
    except Exception:
        print("\a")  # Terminal bell as a fallback

def interactive_guard():
    """
    Simple text-based interaction loop:
    L1: ask identity & purpose (LLM asks)
    L2: ask for code word (checked locally)
    L3: cross-question (LLM asks), then final stern warning + beep
    """
    # Setup
    code_word = os.getenv("CODE_WORD", "delta")
    model = os.getenv("GROQ_MODEL", DEFAULT_MODEL)
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("Set GROQ_API_KEY before running.")
    llm = GroqLLM(model=model, api_key=api_key)

    print(f"[Guard] Using model: {model}")
    level = 1

    # Level 1 — identity and purpose
    q1 = llm.generate("Ask their name and purpose in one sentence.", level=level)
    print(f"\n[Guard L{level}]: {q1}")
    u1 = input("[Visitor]: ").strip()

    # Level 2 — code word challenge
    level = 2
    q2 = llm.generate("Ask for today's code word in one sentence.", level=level)
    print(f"\n[Guard L{level}]: {q2}")
    u2 = input("[Visitor]: ").strip()
    if u2.lower() == code_word.lower():
        ack = llm.generate("Acknowledge correct code word and grant access.", level=1)
        print(f"\n[Guard]: {ack}")
        return
    warn = llm.generate("Warn unauthorized access and say one more question will confirm identity.", level=level)
    print(f"\n[Guard]: {warn}")

    # Level 3 — cross-question then final warning + beep
    level = 3
    q3 = llm.generate("Ask one short cross-question to verify identity.", level=level)
    print(f"\n[Guard L{level}]: {q3}")
    _ = input("[Visitor]: ").strip()

    final = llm.generate("Issue final stern warning to leave immediately; monitoring and logging active.", level=level)
    print(f"\n[Guard]: {final}")
    _beep_final()
    print("[Guard] Alarm triggered.")


# Master NLP Cell

In [24]:
# Helper functions from main.py
def beep_alarm():
    """Plays a system beep sound."""
    try:
        import winsound
        winsound.Beep(1000, 600)  # Frequency (Hz), Duration (ms)
    except ImportError:
        print("\a", end='', flush=True) # Terminal bell for non-Windows

def say_and_listen(asr: SpeechASR, tts: TTSVoice, prompt_text: str, listen_seconds: float) -> str:
    """Speaks a prompt, waits for TTS to finish, then listens for a response."""
    text = prompt_text if isinstance(prompt_text, str) and prompt_text.strip() else "State your name and purpose."
    print(f"AI Guard: {text}")
    tts.say(text, block=True)
    
    print("Listening for response...")
    heard = asr.transcribe_once(listen_seconds=listen_seconds)
    print(f"User said: '{heard}'")
    return (heard or "").strip()

def is_evasive(text: str) -> bool:
    """Checks for evasive or empty user responses."""
    t = (text or "").lower()
    bad_keywords = ["none of your", "not telling", "go away", "shut up", "leave me", "no comment"]
    return any(k in t for k in bad_keywords) or len(t) < 3

In [25]:
# Core conversational logic adapted from main.py
def handle_intruder(asr: SpeechASR, tts: TTSVoice, llm: GroqLLM):
    """
    Manages the escalating conversation with an unrecognized person.
    """
    # --- Config ---
    utterance_seconds = 7.0
    max_turns = 3
    code_word = "delta"
    
    level = 1
    turns = 0
    granted = False

    while turns < max_turns and not granted:
        turns += 1
        print(f"\n--- Turn {turns}/{max_turns}, Level {level} ---")

        if level == 1: # PASSING AS STRING PARAM
            prompt = llm.generate("Ask their name and purpose in one sentence.", level=level)
            user_response = say_and_listen(asr, tts, prompt, utterance_seconds)
            level = 2 # Always escalate to level 2 to ask for the code word

        elif level == 2:
            prompt = llm.generate("Ask for today's code word in one sentence.", level=level)
            response = say_and_listen(asr, tts, prompt, utterance_seconds)
            if code_word.lower() in response.lower():
                ack = llm.generate("Acknowledge correct code word and grant access.", level=1)
                tts.say(ack, block=True)
                granted = True
            else:
                warn = llm.generate("Warn unauthorized access; one more question will verify identity.", level=level)
                tts.say(warn, block=True)
                level = 3
        else:  # level >= 3
            final_warning = llm.generate("Final stern warning: monitored and logged. Leave immediately.", level=3)
            tts.say(final_warning, block=True)
            beep_alarm()
            break

    if not granted:
        print("\n--- Intruder Protocol Finished. Access Denied. ---")

# Eager Loading 

In [10]:
sfr = SimpleFacerec()
DATASET_PATH = "justtrail"
sfr.load_encoding_images(DATASET_PATH)

Loading known face encodings...
[WARN] No face found in WIN_20251007_17_13_09_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_20_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_23_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_25_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_27_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_35_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_37_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_13_45_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_14_03_Pro.jpg for Ricks
[WARN] No face found in WIN_20251007_17_14_09_Pro.jpg for Ricks
Loaded 12 face encodings.


# Master Agent

In [41]:
def run_ai_guard_system():
    # --- 1. Configuration ---
    # Update this path to your folder of known faces
   
    
    # Get your free API key from https://console.groq.com/keys
    GROQ_API_KEY = "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" # Grok-4 Key
    
    KEYWORD = "defend my room"
    ACTIVATION_WINDOW = 4.0 # seconds

    # --- 2. Initialize All Systems ---
    # Vision System
    

    # ASR, TTS, and LLM Systems
    if not GROQ_API_KEY:
        print("FATAL: GROQ_API_KEY is not set. The guard cannot function.")
        return
        
    asr = SpeechASR()
    tts = TTSVoice(rate_delta=-10) # Slightly slower for clarity
    llm = GroqLLM(api_key=GROQ_API_KEY)

    # --- 3. Wait for Activation ---
    tts.say("AI guard is ready. Say the activation phrase to begin.", block=True)
    activated, heard_phrase = asr.listen_for_keyword(keyword=KEYWORD, window_seconds=ACTIVATION_WINDOW)
    if not activated:
        tts.say("Activation phrase not detected. Shutting down.", block=True)
        print("Activation failed. Exiting.")
        return

    tts.say("Guard mode activated.", block=True)
    
    # --- 4. Perform Face Recognition ---
    recognized_person = start_recog(sfr)

    # --- 5. Handle Outcome ---
    if recognized_person != "Unknown":
        greeting = f"Welcome, {recognized_person}. You are cleared for entry."
        print(greeting)
        tts.say(greeting, block=True)
    else:
        tts.say("Unrecognized individual detected. Initiating verification protocol.", block=True)
        handle_intruder(asr, tts, llm)

    print("\n--- AI Guard session concluded. ---")


# --- Run the application ---
run_ai_guard_system()

[ASR] Heard: defend my room
Starting face recognition...

⚠️ Max detections reached — stability not achieved.

Final recognized person: Unknown

--- Turn 1/3, Level 1 ---
AI Guard: May I please have your name and purpose for being here?
Listening for response...
User said: 'no'

--- Turn 2/3, Level 2 ---
AI Guard: Please provide today's code word for authorized access.
Listening for response...
User said: 'delta'

--- AI Guard session concluded. ---
