ParabolInc · jordanh · Feb 27, 2024 · Feb 9, 2024 · Feb 13, 2024 · Feb 13, 2024
diff --git a/.env.example b/.env.example
@@ -8,6 +8,11 @@ SERVER_ID='1'
 # Websocket port for the websocket server, only used in development (yarn dev)
 SOCKET_PORT='3001'
 
+# AI MODELS
+AI_EMBEDDING_MODELS='[{"model": "text-embeddings-inference:llmrails/ember-v1", "url": "http://localhost:3040/"}]'
+AI_GENERATION_MODELS='[{"model": "text-generation-inference:TheBloke/zephyr-7b-beta", "url": "http://localhost:3050/"}]'
+AI_EMBEDDER_ENABLED='true'
+
 # APPLICATION
 # AMPLITUDE_WRITE_KEY='key_AMPLITUDE_WRITE_KEY'
 # Enter a short url redirect service for invitations, it needs to redirecto to /invitation-link

diff --git a/docker/dev.yml b/docker/dev.yml
@@ -13,8 +13,6 @@ services:
       - /var/run/docker.sock:/var/run/docker.sock
       - /proc/:/host/proc/:ro
       - /sys/fs/cgroup:/host/sys/fs/cgroup:ro
-      - "./dd-conf.d:/etc/datadog-agent/conf.d/local.d/"
-      - "../dev/logs:/var/log/datadog/logs"
   db:
     image: rethinkdb:2.4.2
     restart: unless-stopped
@@ -72,10 +70,25 @@ services:
       - "8082:8081"
     networks:
       parabol-network:
+  text-embeddings-inference: # Hugging Face text-embeddings-inference container for the dev compose stack
+    container_name: text-embeddings-inference
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-0.6 # pinned to the cpu-0.6 tag
+    command:
+      - "--model-id=llmrails/ember-v1" # same model id as the AI_EMBEDDING_MODELS example in .env.example
+    platform: linux/x86_64
+    hostname: text-embeddings-inference
+    restart: unless-stopped
+    ports:
+      - "3040:80" # host port 3040 -> container port 80; .env.example points at http://localhost:3040/
+    volumes:
+      - text-embeddings-inference-data:/data # named volume declared in the top-level volumes section
+    networks:
+      parabol-network:
 networks:
   parabol-network:
 volumes:
   redis-data: {}
   rethink-data: {}
   postgres-data: {}
   pgadmin-data: {}
+  text-embeddings-inference-data: {}
diff --git a/package.json b/package.json
@@ -103,8 +103,8 @@
     "html-webpack-plugin": "^5.5.0",
     "husky": "^7.0.4",
     "jscodeshift": "^0.14.0",
-    "kysely": "^0.26.3",
-    "kysely-codegen": "^0.10.0",
+    "kysely": "^0.27.2",
+    "kysely-codegen": "^0.11.0",
     "lerna": "^6.4.1",
     "mini-css-extract-plugin": "^2.7.2",
     "minimist": "^1.2.5",

diff --git a/packages/embedder/.eslintrc.js b/packages/embedder/.eslintrc.js
@@ -0,0 +1,11 @@
+module.exports = { // ESLint configuration for the embedder package
+  extends: [
+    '../../.eslintrc.js' // inherit the config two directories up (presumably the repo root — confirm)
+  ],
+  parserOptions: {
+    project: './tsconfig.json', // tsconfig handed to the parser; NOTE(review): the parser itself comes from the extended config — confirm
+    ecmaVersion: 2020,
+    sourceType: 'module'
+  },
+  "ignorePatterns": ["**/lib", "*.js"] // skip anything under a lib directory and every .js file
+}
diff --git a/packages/embedder/README.md b/packages/embedder/README.md
@@ -0,0 +1,71 @@
+# `Embedder`
+
+This service builds embedding vectors for semantic search and for other AI/ML
+use cases. It does so by:
+
+1.  Updating a list of all possible items to create embedding vectors for and
+    storing that list in the `EmbeddingsMetadata` table
+2.  Adding these items in batches to the `EmbeddingsJobQueue` table and a redis
+    priority queue called `embedder:queue`
+3.  Allowing one or more parallel embedding services to calculate embedding
+    vectors (`EmbeddingsJobQueue` states transition from `queued` -> `embedding`,
+    then `embedding` -> [deleting the `EmbeddingsJobQueue` row])
+
+    In addition to deleting the `EmbeddingsJobQueue` row, when a job completes
+    successfully:
+
+    - A row is added to the model table with the embedding vector; the
+      `embeddingsMetadataId` field on this row points to the appropriate
+      metadata row in `EmbeddingsMetadata`
+    - The `EmbeddingsMetadata.models` array is updated with the name of the
+      table that the embedding has been generated for
+
+4.  This process repeats forever using a silly polling loop
+
+In the future, it would be wonderful to enhance this service such that it were
+event driven.
+
+## Prerequisites
+
+The Embedder service depends on pgvector being available in Postgres.
+
+The predeploy script checks for an environment variable
+`POSTGRES_USE_PGVECTOR=true` to enable this extension in production.
+
+## Configuration
+
+The Embedder service takes no arguments and is controlled by the following
+environment variables, here given with example configuration:
+
+- `AI_EMBEDDER_ENABLED`: when enabled, the embedder service performs
+  work; otherwise it sleeps indefinitely
+
+`AI_EMBEDDER_ENABLED='true'`
+
+- `AI_EMBEDDING_MODELS`: JSON configuration for which embedding models
+  are enabled. Each model in the array will be instantiated by
+  `ai_models/ModelManager`. Each model instance will have its own
+  database table created for it (if it does not exist already) used
+  to store calculated vectors. See `ai_models/ModelManager` for
+  which configurations are supported.
+
+  Example:
+
+`AI_EMBEDDING_MODELS='[{"model": "text-embeddings-inference:llmrails/ember-v1", "url": "http://localhost:3040/"}]'`
+
+- `AI_GENERATION_MODELS`: JSON configuration for which AI generation
+  models (i.e. GPTs) are enabled. These models are used to summarize
+  text to be embedded by an embedding model if the text length would be
+  greater than the context window of the embedding model. Each model in
+  the array will be instantiated by `ai_models/ModelManager`.
+  See `ai_models/ModelManager` for which configurations are supported.
+
+  Example:
+
+`AI_GENERATION_MODELS='[{"model": "text-generation-inference:TheBloke/zephyr-7b-beta", "url": "http://localhost:3050/"}]'`
+
+## Usage
+
+The Embedder service is stateless and takes no arguments. Multiple instances
+of the service may be started in order to match embedding load, or to
+catch up on history more quickly.
diff --git a/packages/embedder/ai_models/AbstractModel.ts b/packages/embedder/ai_models/AbstractModel.ts
@@ -0,0 +1,75 @@
+export interface ModelConfig { // fields shared by every model configuration entry
+  model: string // model identifier; ModelManager treats the text before the first ':' as the model type
+  url: string // endpoint URL; AbstractModel strips trailing slashes before storing it
+}
+
+export interface EmbeddingModelConfig extends ModelConfig { // configuration entry for an embeddings model
+  tableSuffix: string // NOTE(review): presumably feeds the Embeddings_<suffix> table name via constructModelParams — confirm in subclasses
+}
+
+export interface GenerationModelConfig extends ModelConfig {} // generation models add no fields beyond ModelConfig
+
+export abstract class AbstractModel { // base class for all models: stores the normalized endpoint URL
+  public readonly url?: string // config.url without trailing slashes; undefined when config.url is empty or missing
+  public modelInstance: any // never assigned in this class; NOTE(review): presumably set by subclasses — confirm
+
+  constructor(config: ModelConfig) {
+    this.url = this.normalizeUrl(config.url)
+  }
+
+  // removes every trailing slash from inputUrl; returns undefined for an empty or missing url
+  private normalizeUrl(inputUrl: string | undefined) {
+    if (!inputUrl) return undefined // '' and undefined both yield undefined
+    const regex = /[/]+$/ // one or more '/' anchored at the end of the string
+    return inputUrl.replace(regex, '')
+  }
+}
+
+export interface EmbeddingModelParams { // values a concrete embeddings model returns from constructModelParams
+  embeddingDimensions: number // ModelManager uses this as the size of the vector(...) column
+  maxInputTokens: number // NOTE(review): presumably the model's input limit in tokens — confirm
+  tableSuffix: string // appended to 'Embeddings_' to form the model's table name
+}
+
+export abstract class AbstractEmbeddingsModel extends AbstractModel { // base class for models that produce embedding vectors
+  readonly embeddingDimensions: number
+  readonly maxInputTokens: number
+  readonly tableName: string // 'Embeddings_' followed by the tableSuffix from constructModelParams
+  constructor(config: EmbeddingModelConfig) {
+    super(config)
+    const modelParams = this.constructModelParams(config) // runs before subclass field initializers, so implementations should rely only on `config`
+    this.embeddingDimensions = modelParams.embeddingDimensions
+    this.maxInputTokens = modelParams.maxInputTokens
+    this.tableName = `Embeddings_${modelParams.tableSuffix}`
+  }
+  protected abstract constructModelParams(config: EmbeddingModelConfig): EmbeddingModelParams // subclass hook: supply dimensions, token limit and table suffix
+  abstract getEmbedding(content: string): Promise<number[]> // per its name, resolves to the embedding of `content`; behaviour is defined by subclasses
+}
+
+export interface GenerationModelParams { // values a concrete generation model returns from constructModelParams
+  maxInputTokens: number // NOTE(review): presumably the model's input limit in tokens — confirm
+}
+
+export interface GenerationOptions { // optional settings passed to AbstractGenerationModel.summarize; every field may be omitted
+  maxNewTokens?: number // NOTE(review): field names look like common text-generation sampling parameters — confirm how subclasses map them
+  seed?: number
+  stop?: string
+  temperature?: number
+  topK?: number
+  topP?: number
+  truncate?: boolean
+}
+
+export abstract class AbstractGenerationModel extends AbstractModel { // base class for text-generation models
+  readonly maxInputTokens: number
+  constructor(config: GenerationModelConfig) {
+    super(config)
+    const modelParams = this.constructModelParams(config) // runs before subclass field initializers, so implementations should rely only on `config`
+    this.maxInputTokens = modelParams.maxInputTokens
+  }
+
+  protected abstract constructModelParams(config: GenerationModelConfig): GenerationModelParams // subclass hook: supply maxInputTokens
+  abstract summarize(content: string, options: GenerationOptions): Promise<string> // per its name, resolves to a summary of `content`; behaviour is defined by subclasses
+}
+
+export default AbstractModel
diff --git a/packages/embedder/ai_models/ModelManager.ts b/packages/embedder/ai_models/ModelManager.ts
@@ -0,0 +1,153 @@
+import {Kysely, sql} from 'kysely'
+
+import {
+  AbstractEmbeddingsModel,
+  AbstractGenerationModel,
+  EmbeddingModelConfig,
+  GenerationModelConfig,
+  ModelConfig
+} from './AbstractModel'
+import TextEmbeddingsInference from './TextEmbeddingsInference'
+import TextGenerationInference from './TextGenerationInference'
+
+interface ModelManagerConfig { // constructor input for ModelManager
+  embeddingModels: EmbeddingModelConfig[] // getModelManager fills this from AI_EMBEDDING_MODELS
+  generationModels: GenerationModelConfig[] // getModelManager fills this from AI_GENERATION_MODELS
+}
+
+export type EmbeddingsModelType = 'text-embeddings-inference' // the only embeddings model-type prefix ModelManager accepts
+export type GenerationModelType = 'text-generation-inference' // the only generation model-type prefix ModelManager accepts
+
+export class ModelManager { // instantiates the configured embeddings/generation models and creates the embeddings tables
+  embeddingModels: AbstractEmbeddingsModel[]
+  embeddingModelsMapByTable: {[key: string]: AbstractEmbeddingsModel} // keyed by each model's tableName
+  generationModels: AbstractGenerationModel[]
+
+  private isValidConfig( // throws Error on the first problem found; otherwise returns true
+    maybeConfig: Partial<ModelManagerConfig>
+  ): maybeConfig is ModelManagerConfig {
+    if (!maybeConfig.embeddingModels || !Array.isArray(maybeConfig.embeddingModels)) {
+      throw new Error('Invalid configuration: embeddingModels is missing or not an array')
+    }
+    if (!maybeConfig.generationModels || !Array.isArray(maybeConfig.generationModels)) {
+      throw new Error('Invalid configuration: generationModels is missing or not an array')
+    }
+
+    maybeConfig.embeddingModels.forEach((model: ModelConfig) => {
+      this.isValidModelConfig(model)
+    })
+
+    maybeConfig.generationModels.forEach((model: ModelConfig) => {
+      this.isValidModelConfig(model)
+    })
+
+    return true
+  }
+
+  private isValidModelConfig(model: ModelConfig): model is ModelConfig { // throws Error unless model is a string and url is a string or undefined
+    if (typeof model.model !== 'string') {
+      throw new Error('Invalid ModelConfig: model field should be a string')
+    }
+    if (model.url !== undefined && typeof model.url !== 'string') {
+      throw new Error('Invalid ModelConfig: url field should be a string')
+    }
+
+    return true
+  }
+
+  constructor(config: ModelManagerConfig) { // throws Error on an invalid config or an unsupported model-type prefix
+    // Validate configuration
+    this.isValidConfig(config)
+
+    // Initialize embeddings models
+    this.embeddingModelsMapByTable = {}
+    this.embeddingModels = config.embeddingModels.map((modelConfig) => {
+      const [modelType] = modelConfig.model.split(':') as [EmbeddingsModelType, string] // text before the first ':' selects the implementation
+
+      switch (modelType) {
+        case 'text-embeddings-inference': {
+          const embeddingsModel = new TextEmbeddingsInference(modelConfig)
+          this.embeddingModelsMapByTable[embeddingsModel.tableName] = embeddingsModel
+          return embeddingsModel
+        }
+        default:
+          throw new Error(`unsupported embeddings model '${modelType}'`)
+      }
+    })
+
+    // Initialize summarization models
+    this.generationModels = config.generationModels.map((modelConfig) => {
+      const [modelType] = modelConfig.model.split(':') as [GenerationModelType, string]
+
+      switch (modelType) {
+        case 'text-generation-inference': {
+          const generator = new TextGenerationInference(modelConfig)
+          return generator
+        }
+        default:
+          throw new Error(`unsupported summarization model '${modelType}'`)
+      }
+    })
+  }
+
+  async maybeCreateTables(pg: Kysely<any>) { // creates the table and HNSW index for every embeddings model whose table is missing; rejects if any statement fails
+    const maybePromises = this.embeddingModels.map(async (embeddingsModel) => {
+      const tableName = embeddingsModel.tableName
+      const hasTable =
+        (
+          await sql<number[]>`SELECT 1 FROM ${sql.id('pg_catalog', 'pg_tables')} WHERE ${sql.id(
+            'tablename'
+          )} = ${tableName}`.execute(pg)
+        ).rows.length > 0
+      if (hasTable) return undefined // table already present: nothing to create for this model
+      const vectorDimensions = embeddingsModel.embeddingDimensions
+      console.log(`ModelManager: creating ${tableName} with ${vectorDimensions} dimensions`)
+      const query = sql`
+      DO $$
+  BEGIN
+  CREATE TABLE IF NOT EXISTS ${sql.id(tableName)} (
+    "id" INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
+    "embedText" TEXT,
+    "embedding" vector(${sql.raw(vectorDimensions.toString())}),
+    "embeddingsMetadataId" INTEGER NOT NULL,
+    FOREIGN KEY ("embeddingsMetadataId")
+      REFERENCES "EmbeddingsMetadata"("id")
+      ON DELETE CASCADE
+  );
+  CREATE INDEX IF NOT EXISTS "idx_${sql.raw(tableName)}_embedding_vector_cosign_ops"
+    ON ${sql.id(tableName)}
+    USING hnsw ("embedding" vector_cosine_ops);
+  END $$;
+
+      `
+      return query.execute(pg)
+    })
+    await Promise.all(maybePromises) // awaited so callers observe completion and failures instead of an unhandled rejection
+  }
+}
+
+let modelManager: ModelManager | undefined // module-level singleton, created on the first getModelManager() call
+export function getModelManager() { // returns the shared ModelManager, built from AI_EMBEDDING_MODELS / AI_GENERATION_MODELS on first use; throws Error on invalid JSON or config
+  if (modelManager) return modelManager
+  const {AI_EMBEDDING_MODELS, AI_GENERATION_MODELS} = process.env
+  const config: ModelManagerConfig = {
+    embeddingModels: [],
+    generationModels: []
+  }
+  try {
+    if (AI_EMBEDDING_MODELS) config.embeddingModels = JSON.parse(AI_EMBEDDING_MODELS) // unset or empty keeps the [] default
+  } catch (e) {
+    throw new Error(`Invalid AI_EMBEDDING_MODELS .env JSON: ${e}`)
+  }
+  try {
+    if (AI_GENERATION_MODELS) config.generationModels = JSON.parse(AI_GENERATION_MODELS) // unset or empty keeps the [] default
+  } catch (e) {
+    throw new Error(`Invalid AI_GENERATION_MODELS .env JSON: ${e}`)
+  }
+
+  modelManager = new ModelManager(config) // parsed values that are not arrays are rejected by the ModelManager constructor
+
+  return modelManager
+}
+
+export default getModelManager