feat: add audio narration (updated) #346

Closed. Wants to merge 33 commits.

Commits (33)
351d87b
added sounddevice to optionally record narration
angelala3252 May 26, 2023
f19a84a
added sounddevice to optionally record narration and initial whisper …
angelala3252 May 27, 2023
e143767
updated requirements for audio narration
angelala3252 May 29, 2023
6f07b93
small changes
angelala3252 May 31, 2023
d3ef09a
fixed issue with created audio file being really slow
angelala3252 May 31, 2023
9e86193
updated to save audio data and transcribed text in database
angelala3252 May 31, 2023
87a814f
pull from main
angelala3252 May 31, 2023
ce84a1b
new alembic migration
angelala3252 May 31, 2023
5c584b2
edited audio tables
angelala3252 Jun 1, 2023
802c8a2
convert audio array to required format for whisper
angelala3252 Jun 1, 2023
aca8cdc
visualize audio info
angelala3252 Jun 1, 2023
42b1007
FLAC compression before storing
angelala3252 Jun 1, 2023
9f4c280
store word by word timestamps
angelala3252 Jun 1, 2023
20d29e1
style changes
angelala3252 Jun 2, 2023
109ffe0
Merge branch 'main' into feat/audio_narration
angelala3252 Jun 14, 2023
8d27b4f
changed tiktoken version
angelala3252 Jun 16, 2023
d631b2d
removed unused tiktoken code
angelala3252 Jun 16, 2023
ab0805e
Merge branch 'main' into feat/audio_narration
angelala3252 Jun 16, 2023
e30538b
alphabetic order, removed redundant dependencies
angelala3252 Jun 18, 2023
9469043
merged AudioInfo and AudioFile
angelala3252 Jun 18, 2023
47bf845
Merge remote-tracking branch 'audio/feat/audio_narration' into feat/a…
angelala3252 Jun 18, 2023
e9f2d36
move audio recording into record_audio function
angelala3252 Jun 19, 2023
9293b0b
use thread-local scoped_session
angelala3252 Jun 19, 2023
a66acbc
Merge branch 'main' into feat/audio_narration
angelala3252 Jun 23, 2023
888d335
remove redundant requirement
angelala3252 Jun 23, 2023
e1a3a18
pull from main
angelala3252 Aug 31, 2023
d7c54f2
pull from main
angelala3252 Aug 31, 2023
3eaa3a8
remove unused tiktoken function
angelala3252 Aug 31, 2023
05834c4
add audio dependencies
angelala3252 Aug 31, 2023
a6e45bd
style changes
angelala3252 Aug 31, 2023
f23df51
new alembic file
angelala3252 Aug 31, 2023
f6cdbc0
delete old requirements.txt
angelala3252 Aug 31, 2023
873cf6d
added audio dependencies
angelala3252 Aug 31, 2023
Files changed
47 changes: 47 additions & 0 deletions alembic/versions/c176288cb508_add_audio_info.py
@@ -0,0 +1,47 @@
"""Add audio info.

Revision ID: c176288cb508
Revises: 8713b142f5de
Create Date: 2023-08-31 00:25:04.889325

"""
import sqlalchemy as sa

from alembic import op
from openadapt.models import ForceFloat

# revision identifiers, used by Alembic.
revision = "c176288cb508"
down_revision = "8713b142f5de"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"audio_info",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("flac_data", sa.LargeBinary(), nullable=True),
sa.Column("transcribed_text", sa.String(), nullable=True),
sa.Column(
"recording_timestamp",
ForceFloat(precision=10, scale=2, asdecimal=False),
nullable=True,
),
sa.Column("sample_rate", sa.Integer(), nullable=True),
sa.Column("words_with_timestamps", sa.Text(), nullable=True),
sa.ForeignKeyConstraint(
["recording_timestamp"],
["recording.timestamp"],
name=op.f("fk_audio_info_recording_timestamp_recording"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_audio_info")),
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("audio_info")
# ### end Alembic commands ###
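
To try this migration locally, the standard Alembic CLI flow should apply it (assuming the project's alembic.ini already points at the OpenAdapt database, as the existing migrations do):

$ alembic upgrade head    # creates the audio_info table
$ alembic downgrade 8713b142f5de    # rolls back to the previous revision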
Empty file added openadapt/crud.py
Empty file.
21 changes: 21 additions & 0 deletions openadapt/db/crud.py
@@ -4,6 +4,7 @@
"""

from typing import Any
import json

from loguru import logger
import sqlalchemy as sa
@@ -12,6 +13,7 @@
from openadapt.db.db import BaseModel, Session
from openadapt.models import (
ActionEvent,
AudioInfo,
MemoryStat,
PerformanceStat,
Recording,
@@ -411,3 +413,22 @@ def new_session() -> None:
if db:
db.close()
db = Session()


def insert_audio_info(
audio_data: bytes,
transcribed_text: str,
recording_timestamp: float,
sample_rate: int,
word_list: list,
) -> None:
"""Create an AudioInfo entry in the database."""
audio_info = AudioInfo(
flac_data=audio_data,
transcribed_text=transcribed_text,
recording_timestamp=recording_timestamp,
sample_rate=sample_rate,
words_with_timestamps=json.dumps(word_list),
)
db.add(audio_info)
db.commit()
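
Note: openadapt/visualize.py below imports a get_audio_info helper from this module, but its body is not part of this hunk. A minimal sketch of what such a reader might look like, assuming it looks up the single AudioInfo row by the recording's timestamp (the name and single-row behavior are inferred from the visualize.py call site, not shown in this diff):

def get_audio_info(recording: Recording) -> AudioInfo:
    """Return the AudioInfo entry associated with a recording."""
    return (
        db.query(AudioInfo)
        .filter(AudioInfo.recording_timestamp == recording.timestamp)
        .first()
    )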
5 changes: 3 additions & 2 deletions openadapt/db/db.py
@@ -5,7 +5,7 @@

from dictalchemy import DictableModel
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.schema import MetaData
import sqlalchemy as sa

@@ -67,4 +67,5 @@ def get_base(engine: sa.engine) -> sa.engine:

engine = get_engine()
Base = get_base(engine)
Session = sessionmaker(bind=engine)
session_factory = sessionmaker(bind=engine)
Session = scoped_session(session_factory)
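
Replacing the plain sessionmaker with scoped_session makes Session a thread-local registry: every thread that calls Session() gets its own session object, which matters now that record_audio writes to the database from a worker thread. A self-contained sketch of the semantics (illustration only, not OpenAdapt code):

import threading

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine("sqlite://")
Session = scoped_session(sessionmaker(bind=engine))

def worker() -> None:
    # within one thread, repeated calls return the same session
    assert Session() is Session()

threads = [threading.Thread(target=worker) for _ in range(2)]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

Session.remove()  # release the calling thread's session when done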
17 changes: 17 additions & 0 deletions openadapt/models.py
@@ -61,6 +61,8 @@ class Recording(db.Base):
order_by="WindowEvent.timestamp",
)

audio_info = sa.orm.relationship("AudioInfo", back_populates="recording")

_processed_action_events = None

@property
@@ -378,6 +380,21 @@ def get_active_window_event(cls: "WindowEvent") -> "WindowEvent":
return WindowEvent(**window.get_active_window_data())


class AudioInfo(db.Base):
"""Class representing the audio from a recording in the database."""

__tablename__ = "audio_info"

id = sa.Column(sa.Integer, primary_key=True)
flac_data = sa.Column(sa.LargeBinary)
transcribed_text = sa.Column(sa.String)
recording_timestamp = sa.Column(sa.ForeignKey("recording.timestamp"))
sample_rate = sa.Column(sa.Integer)
words_with_timestamps = sa.Column(sa.Text)

recording = sa.orm.relationship("Recording", back_populates="audio_info")


class PerformanceStat(db.Base):
"""Class representing a performance statistic in the database."""

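Given the AudioInfo model above, a stored narration can be round-tripped back into samples and word timings. A short sketch, assuming a Recording instance loaded from the database (the relationship returns a list, and the word entries follow Whisper's {"word", "start", "end"} layout):

import io
import json

import soundfile

audio_info = recording.audio_info[0]  # first (and only) AudioInfo row
# FLAC bytes back to a float32 sample array plus its sample rate
samples, sample_rate = soundfile.read(io.BytesIO(audio_info.flac_data))
# JSON text back to a list of word/timestamp dicts
words = json.loads(audio_info.words_with_timestamps)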
111 changes: 108 additions & 3 deletions openadapt/record.py
@@ -4,11 +4,16 @@

$ python openadapt/record.py "<description of task to be recorded>"

To record audio:

$ python openadapt/record.py "<description of task to be recorded>" --enable_audio

"""

from collections import namedtuple
from functools import partial, wraps
from typing import Any, Callable, Union
import io
import multiprocessing
import os
import queue
@@ -24,7 +29,11 @@
from tqdm import tqdm
import fire
import mss.tools
import numpy as np
import psutil
import sounddevice
import soundfile
import whisper

from openadapt import config, utils, window
from openadapt.db import crud
@@ -804,15 +813,101 @@ def read_mouse_events(
mouse_listener.stop()


def record_audio(
terminate_event: multiprocessing.Event,
recording_timestamp: float,
) -> None:
"""Record audio narration during the recording and store data in database.

Args:
terminate_event: The event that signals the audio recording to stop.
recording_timestamp: The timestamp of the recording.
"""
utils.configure_logging(logger, LOG_LEVEL)
utils.set_start_time(recording_timestamp)

audio_frames = [] # to store audio frames

def audio_callback(
indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
) -> None:
"""Callback function used when new audio frames are recorded.

Note: time is of type cffi.FFI.CData, but since we don't use this argument
and we also don't use the cffi library, the Any type annotation is used.
"""
# called whenever new audio frames arrive
audio_frames.append(indata.copy())

# open InputStream and start recording while ActionEvents are recorded
audio_stream = sounddevice.InputStream(
callback=audio_callback, samplerate=16000, channels=1
)

[Review comment, Contributor] @angelala3252 @0dm what is the easiest way to implement a macOS-compatible analog of this? Can we re-use existing code in other PRs?

[Reply, Collaborator] #362 has a good method of getting audio devices via Apple AVFoundation; I'm sure it can be used here with minimal issue. I'm not sure it'll be plug and play with my PR, though; that would need some changes depending on the implementation.
logger.info("Audio recording started.")
audio_stream.start()
terminate_event.wait()
audio_stream.stop()
audio_stream.close()

# concatenate recorded frames into one NumPy array
concatenated_audio = np.concatenate(audio_frames, axis=0)
# convert concatenated_audio to format expected by whisper
converted_audio = concatenated_audio.flatten().astype(np.float32)

# Convert audio to text using OpenAI's Whisper
logger.info("Transcribing audio...")
model = whisper.load_model("base")
result_info = model.transcribe(converted_audio, word_timestamps=True, fp16=False)
logger.info(f"The narrated text is: {result_info['text']}")
# empty word_list if the user didn't say anything
word_list = []
# segments could be empty
if len(result_info["segments"]) > 0:
# there won't be a 'words' list if the user didn't say anything
if "words" in result_info["segments"][0]:
word_list = result_info["segments"][0]["words"]

# compress and convert to bytes to save to database
logger.info(
"Size of uncompressed audio data: {} bytes".format(converted_audio.nbytes)
)
# Create an in-memory file-like object
file_obj = io.BytesIO()
# Write the audio data using lossless compression
soundfile.write(
file_obj, converted_audio, int(audio_stream.samplerate), format="FLAC"
)
# Get the compressed audio data as bytes
compressed_audio_bytes = file_obj.getvalue()

logger.info(
"Size of compressed audio data: {} bytes".format(len(compressed_audio_bytes))
)

file_obj.close()

# To decompress the audio and restore it to its original form:
# restored_audio, restored_samplerate = soundfile.read(
#     io.BytesIO(compressed_audio_bytes))

# Create AudioInfo entry
crud.insert_audio_info(
compressed_audio_bytes,
result_info["text"],
recording_timestamp,
int(audio_stream.samplerate),
word_list,
)


@logger.catch
@trace(logger)
def record(
task_description: str,
) -> None:
def record(task_description: str, enable_audio: bool = False) -> None:
"""Record Screenshots/ActionEvents/WindowEvents.

Args:
task_description: A text description of the task to be recorded.
enable_audio: A flag to enable or disable audio recording (default: False).
"""
logger.info(f"{task_description=}")

@@ -943,6 +1038,13 @@ def record(
)
mem_plotter.start()

if enable_audio:
audio_recorder = threading.Thread(
target=record_audio,
args=(terminate_event, recording_timestamp),
)
audio_recorder.start()

# TODO: discard events until everything is ready

collect_stats()
@@ -972,6 +1074,9 @@ def record(
screen_event_writer.join()
action_event_writer.join()
window_event_writer.join()
if enable_audio:
audio_recorder.join()

terminate_perf_event.set()

if PLOT_PERFORMANCE:
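One detail of the transcription step above: Whisper returns word timings grouped per segment, and record_audio only reads result_info["segments"][0], so narration long enough to produce multiple segments would keep only the first segment's words. A possible follow-up (a sketch, not part of this PR) would collect words across all segments:

word_list = []
for segment in result_info["segments"]:
    # each entry is a dict like {"word": ..., "start": ..., "end": ...}
    word_list.extend(segment.get("words", []))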
49 changes: 0 additions & 49 deletions openadapt/strategies/mixins/openai.py
@@ -11,7 +11,6 @@ class MyReplayStrategy(OpenAIReplayStrategyMixin):

from loguru import logger
import openai
import tiktoken

from openadapt import cache, config, models
from openadapt.strategies.base import BaseReplayStrategy
@@ -29,7 +28,6 @@ class MyReplayStrategy(OpenAIReplayStrategyMixin):
MODEL_NAME = "gpt-4"

openai.api_key = config.OPENAI_API_KEY
encoding = tiktoken.get_encoding("cl100k_base")


class OpenAIReplayStrategyMixin(BaseReplayStrategy):
@@ -187,50 +185,3 @@ def _get_completion(prompt: str) -> str:
logger.debug(f"appending assistant_message=\n{pformat(assistant_message)}")
messages.append(assistant_message)
return messages


# XXX TODO not currently in use
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages: list, model: str = "gpt-3.5-turbo-0301") -> int:
"""Returns the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
logger.info("Warning: model not found. Using cl100k_base encoding.")
encoding = tiktoken.get_encoding("cl100k_base")
if model == "gpt-3.5-turbo":
logger.info(
"Warning: gpt-3.5-turbo may change over time. Returning num tokens "
"assuming gpt-3.5-turbo-0301."
)
return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
elif model == "gpt-4":
logger.info(
"Warning: gpt-4 may change over time. Returning num tokens "
"assuming gpt-4-0314."
)
return num_tokens_from_messages(messages, model="gpt-4-0314")
elif model == "gpt-3.5-turbo-0301":
tokens_per_message = (
4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
)
tokens_per_name = -1 # if there's a name, the role is omitted
elif model == "gpt-4-0314":
tokens_per_message = 3
tokens_per_name = 1
else:
raise NotImplementedError(
f"""num_tokens_from_messages() is not implemented for model "
"{model}. See "
"https://github.com/openai/openai-python/blob/main/chatml.md for "
information on how messages are converted to tokens."""
)
num_tokens = 0
for message in messages:
num_tokens += tokens_per_message
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name":
num_tokens += tokens_per_name
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
return num_tokens
6 changes: 5 additions & 1 deletion openadapt/visualize.py
@@ -12,7 +12,7 @@
import click

from openadapt import config
from openadapt.db.crud import get_latest_recording, get_recording
from openadapt.db.crud import get_audio_info, get_latest_recording, get_recording
from openadapt.events import get_events
from openadapt.utils import (
EMPTY,
@@ -147,6 +147,10 @@ def main(timestamp: str, notify: bool = True) -> None:
scrub.scrub_text(recording.task_description)
logger.debug(f"{recording=}")

audio_info = row2dict(get_audio_info(recording))
# don't display the FLAC data
del audio_info["flac_data"]

meta = {}
action_events = get_events(recording, process=PROCESS_EVENTS, meta=meta)
event_dicts = rows2dicts(action_events)