From 4da4f22f480250dbea967a56c110bab56eed8921 Mon Sep 17 00:00:00 2001 From: Lucas Faria Date: Fri, 17 Oct 2025 09:28:49 -0300 Subject: [PATCH] feat: transcription & AI - Whisper, summaries, task extraction - OpenAI Whisper transcription (gpt-4o-mini-transcribe) - Auto-generate 3-7 word summaries (GPT-4o-mini) - Extract actionable tasks from transcripts - TranscriptionSection UI component - SettingsPanel for OpenAI key management - Secure API key storage in authStore - 25MB file size limit handling --- src/main/index.ts | 2 +- src/main/services/recording.ts | 410 ++++++++++++++++++ src/main/services/transcription-prompts.ts | 25 ++ .../recordings/components/RecordingDetail.tsx | 46 +- .../recordings/components/RecordingsView.tsx | 4 + .../recordings/components/SettingsPanel.tsx | 304 ++++++++++++- .../components/TranscriptionSection.tsx | 167 +++++++ .../recordings/hooks/useRecordings.ts | 72 +++ src/renderer/stores/authStore.ts | 33 +- 9 files changed, 1049 insertions(+), 14 deletions(-) create mode 100644 src/main/services/recording.ts create mode 100644 src/main/services/transcription-prompts.ts create mode 100644 src/renderer/features/recordings/components/TranscriptionSection.tsx diff --git a/src/main/index.ts b/src/main/index.ts index 4b436354..728dba39 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -13,7 +13,7 @@ import { registerAgentIpc, type TaskController } from "./services/agent.js"; import { registerFsIpc } from "./services/fs.js"; import { registerOsIpc } from "./services/os.js"; import { registerPosthogIpc } from "./services/posthog.js"; -import { registerRecordingIpc } from "./services/recording-notranscribe.js"; +import { registerRecordingIpc } from "./services/recording.js"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); diff --git a/src/main/services/recording.ts b/src/main/services/recording.ts new file mode 100644 index 00000000..9145382c --- /dev/null +++ b/src/main/services/recording.ts @@ -0,0 +1,410 @@ +import fs from "node:fs"; +import { readFile } from "node:fs/promises"; +import path from "node:path"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateObject, experimental_transcribe as transcribe } from "ai"; +import { app, desktopCapturer, ipcMain } from "electron"; +import { z } from "zod"; +import type { Recording } from "../../shared/types.js"; + +import { + SUMMARY_PROMPT, + TASK_EXTRACTION_PROMPT, +} from "./transcription-prompts.js"; + +let FileConstructor: typeof File; +try { + const { File: NodeFile } = await import("node:buffer"); + FileConstructor = NodeFile as typeof File; +} catch { + FileConstructor = class File extends Blob { + name: string; + lastModified: number; + + constructor(bits: BlobPart[], name: string, options?: FilePropertyBag) { + super(bits, options); + this.name = name; + this.lastModified = options?.lastModified ?? 
Date.now();
+    }
+  } as typeof File;
+}
+
+if (!globalThis.File) {
+  globalThis.File = FileConstructor;
+}
+
+interface RecordingSession {
+  id: string;
+  startTime: Date;
+}
+
+const activeRecordings = new Map<string, RecordingSession>();
+const recordingsDir = path.join(app.getPath("userData"), "recordings");
+
+if (!fs.existsSync(recordingsDir)) {
+  fs.mkdirSync(recordingsDir, { recursive: true });
+}
+
+/**
+ * Validates a recording ID to prevent path traversal attacks
+ */
+function validateRecordingId(recordingId: string): boolean {
+  const safePattern = /^[a-zA-Z0-9._-]+$/;
+  if (!safePattern.test(recordingId)) {
+    return false;
+  }
+
+  const resolvedPath = path.resolve(path.join(recordingsDir, recordingId));
+  const recordingsDirResolved = path.resolve(recordingsDir);
+  return resolvedPath.startsWith(recordingsDirResolved + path.sep);
+}
+
+async function generateTranscriptSummary(
+  transcriptText: string,
+  openaiApiKey: string,
+): Promise<string | null> {
+  try {
+    const openai = createOpenAI({ apiKey: openaiApiKey });
+
+    const { object } = await generateObject({
+      model: openai("gpt-4o-mini"),
+      schema: z.object({
+        title: z.string().describe("A brief 3-7 word summary title"),
+      }),
+      messages: [
+        {
+          role: "system",
+          content:
+            "You are a helpful assistant that creates concise titles for conversation transcripts. The title should be 3-7 words and capture the main topic.",
+        },
+        {
+          role: "user",
+          content: `${SUMMARY_PROMPT}\n${transcriptText}`,
+        },
+      ],
+    });
+
+    return object.title || null;
+  } catch (error) {
+    console.error("Failed to generate summary title:", error);
+    return null;
+  }
+}
+
+async function extractTasksFromTranscript(
+  transcriptText: string,
+  openaiApiKey: string,
+): Promise<Array<{ title: string; description: string }>> {
+  try {
+    const openai = createOpenAI({ apiKey: openaiApiKey });
+
+    const schema = z.object({
+      tasks: z.array(
+        z.object({
+          title: z.string().describe("Brief task title"),
+          description: z.string().describe("Detailed description with context"),
+        }),
+      ),
+    });
+
+    const { object } = await generateObject({
+      model: openai("gpt-4o-mini"),
+      schema,
+      messages: [
+        {
+          role: "system",
+          content:
+            "You are a helpful assistant that extracts actionable tasks from conversation transcripts. 
Be generous in identifying work items - include feature requests, requirements, and any work that needs to be done.", + }, + { + role: "user", + content: `${TASK_EXTRACTION_PROMPT}\n${transcriptText}`, + }, + ], + }); + + return object.tasks || []; + } catch (error) { + console.error("Failed to extract tasks from transcription:", error); + return []; + } +} + +function safeLog(...args: unknown[]): void { + try { + console.log(...args); + } catch { + // Ignore logging errors + } +} + +export function registerRecordingIpc(): void { + ipcMain.handle( + "desktop-capturer:get-sources", + async (_event, options: { types: ("screen" | "window")[] }) => { + const sources = await desktopCapturer.getSources(options); + + const plainSources = sources.map((source) => { + return { + id: String(source.id), + name: String(source.name), + }; + }); + + safeLog(`[Desktop Capturer] Found ${plainSources.length} sources`); + return plainSources; + }, + ); + + ipcMain.handle("recording:start", async (_event) => { + const recordingId = `recording-${Date.now()}`; + const session: RecordingSession = { + id: recordingId, + startTime: new Date(), + }; + + activeRecordings.set(recordingId, session); + + return { recordingId, startTime: session.startTime.toISOString() }; + }); + + ipcMain.handle( + "recording:stop", + async ( + _event, + recordingId: string, + audioData: Uint8Array, + duration: number, + ) => { + const session = activeRecordings.get(recordingId); + if (!session) { + throw new Error("Recording session not found"); + } + + const filename = `recording-${session.startTime.toISOString().replace(/[:.]/g, "-")}.webm`; + const filePath = path.join(recordingsDir, filename); + const metadataPath = path.join(recordingsDir, `${filename}.json`); + + const buffer = Buffer.from(audioData); + fs.writeFileSync(filePath, buffer); + + const metadata = { + duration, + created_at: session.startTime.toISOString(), + }; + fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2)); + + const recording: Recording = { + id: filename, + filename, + duration, + created_at: session.startTime.toISOString(), + file_path: filePath, + }; + + activeRecordings.delete(recordingId); + + return recording; + }, + ); + + ipcMain.handle("recording:list", async (_event) => { + const recordings: Recording[] = []; + + if (!fs.existsSync(recordingsDir)) { + return recordings; + } + + const files = fs.readdirSync(recordingsDir); + + for (const file of files) { + if (!file.endsWith(".webm")) continue; + + const filePath = path.join(recordingsDir, file); + const metadataPath = path.join(recordingsDir, `${file}.json`); + + let duration = 0; + let createdAt = new Date().toISOString(); + let transcription: Recording["transcription"]; + + if (fs.existsSync(metadataPath)) { + try { + const metadataContent = fs.readFileSync(metadataPath, "utf-8"); + const metadata = JSON.parse(metadataContent); + duration = metadata.duration || 0; + createdAt = metadata.created_at || createdAt; + transcription = metadata.transcription; + } catch (err) { + console.error("Failed to read metadata:", err); + } + } + + recordings.push({ + id: file, + filename: file, + duration, + created_at: createdAt, + file_path: filePath, + transcription, + }); + } + + return recordings.sort( + (a, b) => + new Date(b.created_at).getTime() - new Date(a.created_at).getTime(), + ); + }); + + ipcMain.handle("recording:delete", async (_event, recordingId: string) => { + if (!validateRecordingId(recordingId)) { + throw new Error("Invalid recording ID"); + } + + const filePath = 
path.join(recordingsDir, recordingId);
+
+    const resolvedPath = fs.realpathSync.native(filePath);
+    const recordingsDirResolved = fs.realpathSync.native(recordingsDir);
+    if (!resolvedPath.startsWith(recordingsDirResolved)) {
+      throw new Error("Invalid recording path");
+    }
+
+    const metadataPath = path.join(recordingsDir, `${recordingId}.json`);
+
+    let deleted = false;
+
+    if (fs.existsSync(filePath)) {
+      fs.unlinkSync(filePath);
+      deleted = true;
+    }
+
+    if (fs.existsSync(metadataPath)) {
+      fs.unlinkSync(metadataPath);
+    }
+
+    return deleted;
+  });
+
+  ipcMain.handle("recording:get-file", async (_event, recordingId: string) => {
+    if (!validateRecordingId(recordingId)) {
+      throw new Error("Invalid recording ID");
+    }
+
+    const filePath = path.join(recordingsDir, recordingId);
+
+    if (!fs.existsSync(filePath)) {
+      throw new Error("Recording file not found");
+    }
+
+    const resolvedPath = fs.realpathSync.native(filePath);
+    const recordingsDirResolved = fs.realpathSync.native(recordingsDir);
+    if (!resolvedPath.startsWith(recordingsDirResolved)) {
+      throw new Error("Invalid recording path");
+    }
+
+    const buffer = fs.readFileSync(filePath);
+    return buffer;
+  });
+
+  ipcMain.handle(
+    "recording:transcribe",
+    async (_event, recordingId: string, openaiApiKey: string) => {
+      if (!validateRecordingId(recordingId)) {
+        throw new Error("Invalid recording ID");
+      }
+
+      const filePath = path.join(recordingsDir, recordingId);
+      const metadataPath = path.join(recordingsDir, `${recordingId}.json`);
+
+      if (!fs.existsSync(filePath)) {
+        throw new Error("Recording file not found");
+      }
+
+      const resolvedPath = fs.realpathSync.native(filePath);
+      const recordingsDirResolved = fs.realpathSync.native(recordingsDir);
+      if (!resolvedPath.startsWith(recordingsDirResolved)) {
+        throw new Error("Invalid recording path");
+      }
+
+      let metadata: Record<string, unknown> = {};
+      if (fs.existsSync(metadataPath)) {
+        metadata = JSON.parse(fs.readFileSync(metadataPath, "utf-8"));
+      }
+
+      metadata.transcription = {
+        status: "processing",
+        text: "",
+      };
+      fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2));
+
+      try {
+        const openai = createOpenAI({ apiKey: openaiApiKey });
+
+        const audio = await readFile(filePath);
+        const maxSize = 25 * 1024 * 1024;
+        const fileSize = audio.length;
+
+        safeLog(
+          `[Transcription] Starting (${(fileSize / 1024 / 1024).toFixed(2)} MB)`,
+        );
+
+        if (fileSize > maxSize) {
+          throw new Error(
+            `Recording file is too large (${(fileSize / 1024 / 1024).toFixed(1)} MB). ` +
+              `OpenAI Whisper API has a 25 MB limit. 
` + + `Please record shorter segments (under ~2 hours at standard quality).`, + ); + } + + const result = await transcribe({ + model: openai.transcription("gpt-4o-mini-transcribe"), + audio, + }); + + safeLog("[Transcription] Result:", result.text); + + const fullTranscriptText = result.text; + + const summaryTitle = await generateTranscriptSummary( + fullTranscriptText, + openaiApiKey, + ); + + const extractedTasks = await extractTasksFromTranscript( + fullTranscriptText, + openaiApiKey, + ); + + safeLog( + `[Transcription] Complete - ${extractedTasks.length} tasks extracted`, + ); + + metadata.transcription = { + status: "completed", + text: fullTranscriptText, + summary: summaryTitle, + extracted_tasks: extractedTasks, + }; + fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2)); + + return { + status: "completed", + text: fullTranscriptText, + summary: summaryTitle, + extracted_tasks: extractedTasks, + }; + } catch (error) { + console.error("[Transcription] Error:", error); + + metadata.transcription = { + status: "error", + text: "", + error: + error instanceof Error ? error.message : "Transcription failed", + }; + fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2)); + + throw error; + } + }, + ); +} diff --git a/src/main/services/transcription-prompts.ts b/src/main/services/transcription-prompts.ts new file mode 100644 index 00000000..a7d7a68a --- /dev/null +++ b/src/main/services/transcription-prompts.ts @@ -0,0 +1,25 @@ +/** + * Transcription and AI processing prompts + * + * These prompts are used for: + * - Generating concise summaries of transcribed audio + * - Extracting actionable tasks from conversations + * + * Future: Move to user-editable config file (~/.array/prompts.json) + */ + +export const SUMMARY_PROMPT = `Create a very brief (3-7 words) title that summarizes what this conversation is about. + +Transcript:`; + +export const TASK_EXTRACTION_PROMPT = `Analyze the following conversation transcript and extract any actionable tasks, feature requests, bug fixes, or work items that were discussed or requested. This includes: +- Explicit action items ("we need to...", "let's build...") +- Feature requests ("I want...", "please build...") +- Bug reports ("this is broken...", "fix the...") +- Requirements ("it should have...", "make it...") + +For each task, provide a clear title and a description with relevant context from the conversation. + +If there are no actionable tasks, return an empty tasks array. 
+ +Transcript:`; diff --git a/src/renderer/features/recordings/components/RecordingDetail.tsx b/src/renderer/features/recordings/components/RecordingDetail.tsx index 52e76724..c03f9088 100644 --- a/src/renderer/features/recordings/components/RecordingDetail.tsx +++ b/src/renderer/features/recordings/components/RecordingDetail.tsx @@ -1,6 +1,7 @@ import { Trash, X } from "@phosphor-icons/react"; import { Box, + Button, Card, Flex, Heading, @@ -13,21 +14,27 @@ import { import type { Recording } from "@shared/types"; import { format } from "date-fns"; import { useHotkeys } from "react-hotkeys-hook"; +import { useAuthStore } from "../../../stores/authStore"; import { useRecordingStore } from "../stores/recordingStore"; import { AudioPlayer } from "./AudioPlayer"; interface RecordingDetailProps { recording: Recording; onDelete: (recordingId: string) => void; + onTranscribe: (params: { recordingId: string; apiKey: string }) => void; + isTranscribing: boolean; isSettingsOpen?: boolean; } export function RecordingDetail({ recording, onDelete, + onTranscribe, + isTranscribing, isSettingsOpen = false, }: RecordingDetailProps) { const { setSelectedRecording } = useRecordingStore(); + const openaiApiKey = useAuthStore((state) => state.openaiApiKey); useHotkeys( "esc", @@ -166,8 +173,27 @@ export function RecordingDetail({ - Transcription not available + No transcription yet + {openaiApiKey && ( + + )} + {!openaiApiKey && ( + + Add OpenAI API key in settings to transcribe + + )} @@ -176,15 +202,29 @@ export function RecordingDetail({ {recording.transcription?.status === "error" && ( - + Transcription failed {recording.transcription.error && ( - + {recording.transcription.error} )} + {openaiApiKey && ( + + )} diff --git a/src/renderer/features/recordings/components/RecordingsView.tsx b/src/renderer/features/recordings/components/RecordingsView.tsx index e8aeae62..9879cb6b 100644 --- a/src/renderer/features/recordings/components/RecordingsView.tsx +++ b/src/renderer/features/recordings/components/RecordingsView.tsx @@ -23,6 +23,8 @@ export function RecordingsView() { isLoading, saveRecording, deleteRecording, + transcribeRecording, + isTranscribing, transcriptionError, clearTranscriptionError, } = useRecordings(); @@ -142,6 +144,8 @@ export function RecordingsView() { ) : ( diff --git a/src/renderer/features/recordings/components/SettingsPanel.tsx b/src/renderer/features/recordings/components/SettingsPanel.tsx index 09907d68..96963756 100644 --- a/src/renderer/features/recordings/components/SettingsPanel.tsx +++ b/src/renderer/features/recordings/components/SettingsPanel.tsx @@ -1,6 +1,17 @@ -// Implementation in the next PR - -import type { RecordingMode } from "@/renderer/features/recordings/stores/recordingStore"; +import { Key, MicrophoneIcon, X } from "@phosphor-icons/react"; +import { + Button, + Flex, + IconButton, + Kbd, + Select, + Text, + TextField, +} from "@radix-ui/themes"; +import { useCallback, useState } from "react"; +import { useHotkeys } from "react-hotkeys-hook"; +import { useAuthStore } from "../../../stores/authStore"; +import type { RecordingMode } from "../stores/recordingStore"; interface SettingsPanelProps { open: boolean; @@ -12,6 +23,289 @@ interface SettingsPanelProps { onMicrophoneChange: (deviceId: string) => void; } -export function SettingsPanel(_props: SettingsPanelProps) { - return
<div>SettingsPanel</div>
; +export function SettingsPanel({ + open, + onClose, + recordingMode, + availableDevices, + selectedMicId, + onRecordingModeChange, + onMicrophoneChange, +}: SettingsPanelProps) { + const { openaiApiKey, setOpenAIKey } = useAuthStore(); + const [apiKeyInput, setApiKeyInput] = useState(""); + const [isEditing, setIsEditing] = useState(false); + + const handleSaveApiKey = useCallback(async () => { + if (!apiKeyInput.trim()) { + alert("Please enter a valid API key"); + return; + } + try { + await setOpenAIKey(apiKeyInput); + setIsEditing(false); + setApiKeyInput(""); + } catch (error) { + console.error("Failed to save API key:", error); + alert("Failed to save API key. Please try again."); + } + }, [apiKeyInput, setOpenAIKey]); + + useHotkeys( + "escape", + () => { + onClose(); + }, + { enabled: open, enableOnFormTags: true }, + [onClose, open], + ); + + if (!open) return null; + + return ( + <> +