In [48]:
import logging
import os
import time
from threading import Thread, Event

import cv2
import keyboard
import numpy as np
import speech_recognition as sr
import torch
import torch.nn.functional as F
from deepface import DeepFace
from openai import OpenAI
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [49]:
class ImprovPartner:
    """Chat-based improv partner that keeps the whole scene as a message history.

    Every user turn is sent to the chat-completions endpoint together with all
    previous turns, so the model can stay in character across the scene.
    """

    system_prompt = """
You are an improv partner who interacts dynamically with the user in an emotionally aware, escalating conversation.

For every user input, you are given two pieces of information:
1. Dialog: the words the user said.
2. Emotion: the detected emotional state (angry, sad, happy, scared, surprised, disgusted, neutral).

Your role:
- Respond in character, adapting to the user's dialog and emotional sentiment.
- Escalate emotional tension when appropriate. Do not immediately try to resolve conflict — let emotions breathe and build naturally.
- Maintain one consistent character during a scene. Create a vivid, believable persona that reacts authentically to the situation.
- Invent background context if it enriches the scene, but do not contradict any history that has already been established during the conversation.
- Adapt to the implied setting of the conversation (e.g., modern, fantasy, casual, dramatic), even if the user doesn't state it directly.
- Keep responses short and punchy, only a sentence or two at most. Avoid long monologues or explanations.
- Progress the scene with each response, moving the story forward and deepening the emotional engagement.

Emotion handling:
- If the user is angry, you may push back, argue, defend yourself, or escalate the conflict.
- If the user is sad, you may express guilt, distance, or confused sympathy (depending on the tone).
- If the user is happy, you may celebrate, tease, or bond with them.
- If the user is afraid, you may heighten the danger, share the fear, or act protective.
- If the user is disgusted, you may act defensive, embarrassed, or grossed out yourself.
- If the user is surprised, you may share in the shock or provide an emotionally charged explanation.
- If the user is neutral, continue naturally or build emotion based on context.

Always prioritize emotional engagement over politeness, logic, or realism.

Your goal is to build a memorable, emotionally charged scene together — not to calm things down unless that makes sense for the character you're playing.
"""

    def __init__(self, api_key: str, model="gpt-4o-mini"):
        """Create the API client and seed the history with the system prompt.

        Args:
            api_key: OpenAI API key used to build the client.
            model: Name of the chat model to query.
        """
        self.api_key = api_key
        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.messages = [
            {"role": "system", "content": self.system_prompt},
        ]

    def set_story_background(self, story_background: str):
        """Append a one-time system message describing the scene's setting.

        Args:
            story_background: Free-text description of the setting.

        Raises:
            RuntimeError: If anything besides the initial system prompt is
                already in the history (background already set, or dialog
                already started).
        """
        if len(self.messages) > 1:
            raise RuntimeError(
                "Story background can only be set once, and must be done before any dialog."
            )
        self.messages.append(
            {
                "role": "system",
                "content": f"The following defines the setting you and your partner are acting in: {story_background}",
            }
        )

    def get_next_improv_response(self, dialog, emotion):
        """Send the user's line and detected emotion, and return the partner's reply.

        The history is only extended once the request has succeeded, so a
        failed API call never leaves a user turn without a matching reply.

        Args:
            dialog: The words the user said.
            emotion: Label of the detected emotion.

        Returns:
            The generated reply text (an empty string if the API returned no
            text content).
        """
        user_message = {
            "role": "user",
            "content": f"Dialog: {dialog}, Emotion: {emotion}",
        }
        # Build the request from a new list so self.messages stays untouched
        # if the call below raises.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self.messages + [user_message],
            temperature=1.0,
        )
        # Normalise a missing content field to "" so the stored history only
        # ever contains string contents.
        reply = response.choices[0].message.content or ""
        # Commit both turns together now that the exchange is complete.
        self.messages.extend(
            [user_message, {"role": "assistant", "content": reply}]
        )
        return reply


In [50]:
def analyze_facial_emotion(
    cap: cv2.VideoCapture, filename: str
) -> dict[str, np.float32]:
    """Capture one frame from the camera, save it, and score its facial emotions.

    Args:
        cap: An opened video capture device.
        filename: Path the captured frame is written to before analysis.

    Returns:
        The ``"emotion"`` mapping of the first result returned by
        ``DeepFace.analyze`` (emotion label -> score).

    Raises:
        IOError: If the camera is not opened, no frame could be read, or the
            frame could not be written to ``filename``.
    """
    if not cap.isOpened():
        raise IOError("Cannot open camera")
    # Read twice: the first read discards a possibly buffered (stale) frame.
    cap.read()
    captured, image = cap.read()
    if not captured:
        raise IOError("Cannot capture image")
    # imwrite signals failure through its return value. Without this check a
    # failed write would let DeepFace analyze whatever stale file is already
    # at `filename`.
    if not cv2.imwrite(filename, image):
        raise IOError(f"Cannot write image to {filename}")
    # Pass a real one-element tuple; ("emotion") is just a parenthesised string.
    analysis = DeepFace.analyze(filename, actions=("emotion",), align=False)
    return analysis[0]["emotion"]

In [51]:
def analyze_facial_emotions(
    cap: cv2.VideoCapture,
    analyses: list,
    stop_event: Event,
    interval: float = 1.0,
    filename: str = "data/captured_image.jpg",
):
    """Collect facial emotion analyses in the background until told to stop.

    Intended to run as a thread target: every ``interval`` seconds one frame
    is analyzed and the result appended to ``analyses``.

    Args:
        cap: An opened video capture device.
        analyses: List that receives one emotion mapping per analyzed frame.
        stop_event: Set by the caller to end the loop.
        interval: Seconds to wait before the first frame and between frames.
        filename: Path each captured frame is written to.
    """
    # Event.wait is an interruptible sleep: it waits while the recognizer
    # adjusts for ambient noise and between frames, and returns True as soon
    # as a stop is requested, so no frame is captured after that.
    while not stop_event.wait(interval):
        try:
            analyses.append(analyze_facial_emotion(cap, filename))
        except Exception:
            # Best effort: a frame that cannot be captured or analyzed is
            # skipped, but the reason stays discoverable through logging.
            logging.getLogger(__name__).debug(
                "Skipping frame: facial emotion analysis failed", exc_info=True
            )

In [52]:
def record_audio() -> str | None:
    """Record one phrase from the microphone and transcribe it.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition service could not be reached (a message
        is printed in both cases).
    """
    listener = sr.Recognizer()
    listener.pause_threshold = 1.0
    # Calibrate against background noise, then capture a single phrase
    with sr.Microphone() as source:
        listener.adjust_for_ambient_noise(source, duration=1)
        print("Recording started.")
        captured = listener.listen(source, timeout=None, phrase_time_limit=None)
    print("Recording stopped.")
    # Transcribe the captured audio
    try:
        transcript = listener.recognize_google(captured, language="en-US")
    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio")
        return None
    except sr.RequestError as err:
        print(f"Could not request results from Google Speech Recognition service: {err}")
        return None
    return transcript

In [53]:
def get_textual_emotion_prediction(text, model, tokenizer):
    """Score a piece of text against the classifier's emotion labels.

    Args:
        text: The text to classify.
        model: Sequence-classification model exposing ``config.id2label`` and
            returning an object with ``logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict mapping each emotion label to its softmax probability.
    """
    # truncation=True keeps an over-long transcript within the model's maximum
    # sequence length instead of failing in the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**inputs).logits
    # Single input, so take the only row of the batch
    probs = F.softmax(logits, dim=-1)[0]
    return {
        label: probs[idx].item() for idx, label in model.config.id2label.items()
    }

In [54]:
def get_audio_features(model, tokenizer):
    """Record a spoken phrase and classify the emotion of its transcript.

    Args:
        model: Text emotion classification model.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Tuple ``(emotions, text)``: the label -> probability mapping and the
        recognized text.

    Raises:
        ValueError: If no speech could be recognized.
    """
    text = record_audio()
    # record_audio returns None when recognition fails; fail with a clear
    # message rather than an obscure tokenizer error.
    if text is None:
        raise ValueError("No speech was recognized; cannot analyze emotion.")
    return get_textual_emotion_prediction(text, model, tokenizer), text

In [55]:
def normalize_facial_emotions(facial_emotions):
    """Rescale facial emotion scores and rename labels to the text model's names.

    Every score is divided by 100. The labels "angry", "happy" and "sad" are
    renamed to "anger", "joy" and "sadness"; all other labels are kept as-is.
    """
    renamed_labels = {"angry": "anger", "happy": "joy", "sad": "sadness"}
    return {
        renamed_labels.get(label, label): score / 100.0
        for label, score in facial_emotions.items()
    }

In [56]:
def combine_emotions(facial_emotions: dict, audio_emotions: dict):
    """Average the facial and audio score of every facial emotion label.

    Only labels present in ``facial_emotions`` appear in the result; each one
    must also exist in ``audio_emotions``.
    """
    return {
        label: (face_score + audio_emotions[label]) / 2
        for label, face_score in facial_emotions.items()
    }

In [57]:
def get_input_data(cap: cv2.VideoCapture, model, tokenizer):
    """Wait for SPACE, then record speech while sampling facial emotions.

    Facial emotions are sampled on a background thread for as long as the
    audio capture runs. The per-frame scores are averaged, normalized and
    combined with the text emotion scores; the strongest label wins.

    Args:
        cap: An opened video capture device.
        model: Text emotion classification model.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Tuple ``(emotion, dialog)``: the dominant emotion label and the
        recognized text.
    """
    # Create background thread for facial emotion analysis
    facial_emotions_list = []
    stop_event = Event()
    facial_emotion_thread = Thread(
        target=analyze_facial_emotions, args=(cap, facial_emotions_list, stop_event)
    )
    print("Press SPACE to start recording or stop cell execution to exit.")
    # Wait for the user to press the space key
    keyboard.wait("space")
    facial_emotion_thread.start()
    try:
        # Gather emotions from audio
        audio_emotions, dialog = get_audio_features(model, tokenizer)
    finally:
        # Always stop and join the worker, even when audio capture fails;
        # otherwise it would keep polling the camera indefinitely.
        stop_event.set()
        facial_emotion_thread.join()
    if facial_emotions_list:
        # Average the facial emotions over all analyzed frames
        frame_count = len(facial_emotions_list)
        facial_emotions = {
            key: sum(analysis[key] for analysis in facial_emotions_list) / frame_count
            for key in facial_emotions_list[0]
        }
        # Normalize, then combine facial and audio emotions
        emotions = combine_emotions(
            normalize_facial_emotions(facial_emotions), audio_emotions
        )
    else:
        # No frame could be analyzed: fall back to the speech-based scores
        # instead of failing on an empty list.
        print("No facial emotion data captured; using speech emotion only.")
        emotions = audio_emotions
    emotion = max(emotions, key=emotions.get)
    return emotion, dialog

In [None]:
# Read the OpenAI API key from the environment so that no secret is stored in
# the notebook or its outputs. Fail early, before any model is loaded.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
# Load the pre-trained model and tokenizer for textual emotion analysis
model_name = "michellejieli/emotion_text_classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Initialize the improv partner
partner = ImprovPartner(api_key)
# Initialize the camera: try the DirectShow backend first, then the default one
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
if not cap.isOpened():
    cap.release()
    cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise IOError("Cannot open camera")
try:
    while True:
        try:
            emotion, dialog = get_input_data(cap, model, tokenizer)
            print("Detected Emotion:", emotion)
            print("Detected Dialog:", dialog)
            partner_response = partner.get_next_improv_response(
                dialog, emotion
            )
            print("Improv Partner Response:", partner_response)
        except KeyboardInterrupt:
            print("Exiting...")
            break
        except Exception as e:
            print(f"Error: {e}")
            break
finally:
    # Release the camera even if an interrupt escapes the loop
    cap.release()

Press SPACE to start recording or stop cell execution to exit.
Recording started.
Recording stopped.
Detected Emotion: anger
Detected Dialog: I hate you for what you did
Improv Partner Response: How can you say that? I thought we were in this together! What exactly did you expect me to do when you left me hanging?
Press SPACE to start recording or stop cell execution to exit.


KeyboardInterrupt: 