Feature/add back scripts (#128)
* Add back sciphi

* cleanup scripts

* expand scripts
emrgnt-cmplxty committed Dec 17, 2023
1 parent 2cc8a35 commit a97c6f4
Showing 16 changed files with 554 additions and 92 deletions.
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -23,7 +23,7 @@ Welcome to Synthesizer 💡

A multi-purpose LLM framework for inference, RAG, and data creation.

- Looking for the AgentSearch documentation? `Click Here <agent-search.readthedocs.io>`_.
+ Looking for the AgentSearch documentation? `Click Here <https://agent-search.readthedocs.io>`_.

With Synthesizer, you can:

52 changes: 38 additions & 14 deletions docs/source/setup/quickstart.rst
Expand Up @@ -7,9 +7,7 @@ Welcome to the Synthesizer quickstart guide! Synthesizer, or ΨΦ, is your porta

This guide will introduce you to:

- Generating data tailored to your needs.
- Using the RAG provider interface.
- Creating RAG-enhanced textbooks.
- Evaluating your RAG pipeline.


@@ -26,10 +24,43 @@ Before you start, ensure you've installed Synthesizer:
For additional details, refer to the `installation guide <https://sciphi.readthedocs.io/en/latest/setup/installation.html>`_.
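
For quick reference, the install step is a one-liner; a minimal sketch, assuming installation via pip with the package name published in this repository's pyproject.toml (``sciphi-synthesizer``):

.. code-block:: bash

   pip install sciphi-synthesizer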

- Instantiate Your LLM and RAG Provider
+ Using Synthesizer
+ -----------------

1. **Generate synthetic question-answer pairs**

   .. code-block:: bash

      export SCIPHI_API_KEY=MY_SCIPHI_API_KEY
      python -m synthesizer.scripts.data_augmenter run --dataset="wiki_qa"

   .. code-block:: bash

      tail augmented_output/config_name_eq_answer_question__dataset_name_eq_wiki_qa.jsonl
      { "formatted_prompt": "... ### Question:\nwhat country did wine originate in\n\n### Input:\n1. URL: https://en.wikipedia.org/wiki/History%20of%20wine (Score: 0.85)\nTitle:History of wine....",
      { "completion": "Wine originated in the South Caucasus, which is now part of modern-day Armenia ..."
2. **Evaluate RAG pipeline performance**

   .. code-block:: bash

      export SCIPHI_API_KEY=MY_SCIPHI_API_KEY
      python -m synthesizer.scripts.rag_harness --rag_provider="agent-search" --llm_provider_name="sciphi" --n_samples=25

   .. code-block:: bash

      ...
      INFO:__main__:Now generating completions...
      100%|████████████████████████████████████| 100/100 [00:29<00:00, 3.40it/s]
      INFO:__main__:Final Accuracy=0.42
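
   The harness's scoring logic is not shown in this diff; as a rough illustration, a harness of this kind typically reports exact-match accuracy over the sampled completions. A minimal sketch with hypothetical predictions and gold answers:

   .. code-block:: python

      # Hypothetical data; the real harness derives these from the RAG pipeline.
      predictions = ["south caucasus", "andrew wiles", "mitochondria"]
      gold_answers = ["South Caucasus", "Andrew Wiles", "ribosomes"]

      correct = sum(
          pred.strip().lower() == gold.strip().lower()
          for pred, gold in zip(predictions, gold_answers)
      )
      print(f"Final Accuracy={correct / len(gold_answers):.2f}")  # Final Accuracy=0.67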
.. note::

   This is a basic introduction to Synthesizer. Check back later for more detailed documentation that delves deeper into advanced features and customization options.
Developing with Synthesizer
-------------------------------------

- Here's how you can use Synthesizer to quickly set up and retrieve chat completions, without diving deep into intricate configurations:
+ Here's how you can use Synthesizer to quickly set up RAG-augmented generation, without diving deep into intricate configurations:
.. code-block:: python
@@ -45,8 +76,7 @@ Here's how you can use Synthesizer to quickly set up and retrieve chat completio
    # RAG Provider Settings
    rag_interface = RAGInterfaceManager.get_interface_from_args(
-       RAGProviderName(rag_provider_name),
-       api_base=rag_api_base,
+       RAGProviderName("agent-search"),
        limit_hierarchical_url_results=rag_limit_hierarchical_url_results,
        limit_final_pagerank_results=rag_limit_final_pagerank_results,
    )
@@ -65,13 +95,7 @@ Here's how you can use Synthesizer to quickly set up and retrieve chat completio
        # other generation params here ...
    )
-   formatted_prompt = rag_prompt.format(rag_context=rag_context)
+   formatted_prompt = raw_prompt.format(rag_context=rag_context)
    completion = llm_interface.get_completion(
        formatted_prompt, generation_config
    )
    print(completion)
-   ### Output:
-   # Fermat's Last Theorem was proven by British mathematician Andrew Wiles in 1994 (Wikipedia). Wiles's proof was based on a special case of the modularity theorem for elliptic curves, along with Ribet's theorem (Wikipedia). The modularity theorem and Fermat's Last Theorem were previously considered inaccessible to proof by contemporaneous mathematicians (Wikipedia). However, Wiles's proof provided a solution to Fermat's Last Theorem, which had remained unproved for over 300 years (PlanetMath). Wiles's proof is widely accepted and has been recognized with numerous awards, including the Abel Prize in 2016 (Wikipedia).
-   # It is important to note that Wiles's proof of Fermat's Last Theorem is a mathematical proof and not related to the science fiction novel "The Last Theorem" by Arthur C. Clarke and Frederik Pohl (Wikipedia). The novel is a work of fiction and does not provide a real mathematical proof for Fermat's Last Theorem (Wikipedia). Additionally, there have been other attempts to prove Fermat's Last Theorem, such as Sophie Germain's approach, but Wiles's proof is the most widely accepted and recognized (Math Stack Exchange).
-   )
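
Pieced together, the surrounding snippet might read as follows once the collapsed lines are filled in. This is a minimal sketch, not the exact file contents: the imports, the limit values, the model name, the "sciphi" provider string, and the raw_prompt template are assumptions, and it presumes LLMInterfaceManager exposes a get_interface_from_args analogous to the RAG manager's.

.. code-block:: python

   from synthesizer.core import LLMProviderName, RAGProviderName
   from synthesizer.interface import LLMInterfaceManager, RAGInterfaceManager
   from synthesizer.llm import GenerationConfig

   # RAG provider settings (limit values are illustrative assumptions)
   rag_interface = RAGInterfaceManager.get_interface_from_args(
       RAGProviderName("agent-search"),
       limit_hierarchical_url_results=50,
       limit_final_pagerank_results=20,
   )
   rag_context = rag_interface.get_rag_context("Who proved Fermat's Last Theorem?")

   # LLM provider settings (assumed analogous to the RAG manager)
   llm_interface = LLMInterfaceManager.get_interface_from_args(
       LLMProviderName("sciphi"),
   )
   generation_config = GenerationConfig(
       model_name="SciPhi/Sensei-7B-V1",  # illustrative model name
       # other generation params here ...
   )

   # Hypothetical prompt template with a {rag_context} slot
   raw_prompt = "### Context:\n{rag_context}\n\n### Question:\nWho proved Fermat's Last Theorem?\n\n### Answer:"
   formatted_prompt = raw_prompt.format(rag_context=rag_context)
   completion = llm_interface.get_completion(formatted_prompt, generation_config)
   print(completion)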
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -11,7 +11,7 @@ authors = ["Owen Colegrove <owen@sciphi.ai>"]
license = "Apache-2.0"
readme = "README.md"
name = 'sciphi-synthesizer'
- version = '1.0.0'
+ version = '1.0.1'
packages = [
{ include = "synthesizer" }
]
@@ -24,7 +24,6 @@ fire = "^0.5.0"
openai = { version = "0.27.8" }
pyyaml = "^6.0.1"
retrying = "^1.3.4"
tqdm = "^4.66.1"

# Begin optional dependencies

25 changes: 25 additions & 0 deletions synthesizer/config/prompts/answer_question.yaml


27 changes: 0 additions & 27 deletions synthesizer/config/prompts/question_and_answer.yaml

This file was deleted.

@@ -30,7 +30,7 @@
E: {E}
#### Wikipedia Context:
- {wiki_context}
+ {search_context}
#### Answer:
"""
@@ -70,17 +70,16 @@ def __init__(
"evals",
f"{ScienceMultipleChoiceEvaluator.NAME.lower().replace(' ', '_')}.csv",
)
)
).head(n_samples)

    def initialize_prompts(self):
        contexts = (
-           self.rag_interface.get_rag_context(
-               list(
-                   self.evals[
-                       ScienceMultipleChoiceEvaluator.PROMPT_FIELD
-                   ].values
-               )
-           )
+           [
+               self.rag_interface.get_rag_context(prompt)
+               for prompt in self.evals[
+                   ScienceMultipleChoiceEvaluator.PROMPT_FIELD
+               ].values
+           ]
            if self.rag_interface
            else [ScienceMultipleChoiceEvaluator.RAG_DISABLED_RESPONSE]
            * len(self.evals)
@@ -98,7 +97,7 @@ def build_prompt(self, entry: dict, context: str) -> str:
+ "\n"
+ SCIENCE_QUESTION_TEMPLATE.format(
example_number=self.n_few_shot + 1,
wiki_context=context,
search_context=context,
**entry,
)
)
@@ -119,8 +118,8 @@ def n_shot_science_template(self) -> str:
            SCIENCE_QUESTION_TEMPLATE.format(
                example_number=example_number,
                prompt=example_prompt,
-               wiki_context=self.rag_interface.get_rag_context(
-                   [example_prompt]
+               search_context=self.rag_interface.get_rag_context(
+                   example_prompt
                )
                if self.rag_interface
                else ScienceMultipleChoiceEvaluator.RAG_DISABLED_RESPONSE,
@@ -140,9 +139,9 @@ def n_shot_science_template(self) -> str:
            SCIENCE_QUESTION_TEMPLATE.format(
                example_number=example_number,
                prompt=example_prompt,
-               wiki_context=self.rag_interface.get_rag_context(
-                   [example_prompt]
-               )[0]
+               search_context=self.rag_interface.get_rag_context(
+                   example_prompt
+               )
                if self.rag_interface
                else ScienceMultipleChoiceEvaluator.RAG_DISABLED_RESPONSE,
                A="Mitochondria are primarily responsible for protein synthesis using ribosomes.",
@@ -161,9 +160,9 @@ def n_shot_science_template(self) -> str:
            SCIENCE_QUESTION_TEMPLATE.format(
                example_number=example_number,
                prompt=example_prompt,
-               wiki_context=self.rag_interface.get_rag_context(
-                   [example_prompt]
-               )[0]
+               search_context=self.rag_interface.get_rag_context(
+                   example_prompt
+               )
                if self.rag_interface
                else ScienceMultipleChoiceEvaluator.RAG_DISABLED_RESPONSE,
                A="Oxidation involves the addition of oxygen to a molecule or the loss of electrons from an atom or molecule.",
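The recurring edit in this file replaces batched context retrieval (passing a list of prompts and indexing the result) with one get_rag_context call per prompt. A minimal sketch of the two calling conventions, with illustrative prompts:

.. code-block:: python

   # Assumes a rag_interface constructed as elsewhere in this commit, e.g. via
   # RAGInterfaceManager.get_interface_from_args(RAGProviderName("agent-search"), ...)
   prompts = ["What do mitochondria do?", "Define oxidation."]

   # Before: one batched call taking a list and returning a list of contexts.
   contexts = rag_interface.get_rag_context(prompts)

   # After (this commit): one call per prompt, each returning a single context.
   contexts = [rag_interface.get_rag_context(prompt) for prompt in prompts]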
2 changes: 2 additions & 0 deletions synthesizer/interface/__init__.py
@@ -9,6 +9,7 @@
HuggingFaceLLMInterface,
)
from synthesizer.interface.llm.openai_interface import OpenAILLMInterface
+ from synthesizer.interface.llm.sciphi_interface import SciPhiLLMInterface
from synthesizer.interface.llm.vllm_interface import vLLMInterface
from synthesizer.interface.llm_interface_manager import LLMInterfaceManager
from synthesizer.interface.rag.agent_search import (
@@ -27,6 +28,7 @@
"AnthropicLLMInterface",
"HuggingFaceLLMInterface",
"OpenAILLMInterface",
"SciPhiLLMInterface",
"vLLMInterface",
# RAG
"RAGInterfaceManager",
60 changes: 60 additions & 0 deletions synthesizer/interface/llm/sciphi_interface.py
@@ -0,0 +1,60 @@
"""A module for interfacing with the SciPhi API"""
import logging

from synthesizer.interface.base import LLMInterface, LLMProviderName
from synthesizer.interface.llm_interface_manager import llm_interface
from synthesizer.llm import GenerationConfig, SciPhiConfig, SciPhiLLM

logger = logging.getLogger(__name__)


@llm_interface
class SciPhiLLMInterface(LLMInterface):
"""A class to interface with the SciPhi API."""

provider_name = LLMProviderName.SCIPHI
system_message = "You are a helpful assistant."

def __init__(
self,
config: SciPhiConfig,
*args,
**kwargs,
) -> None:
self.config = config
self._model = SciPhiLLM(config)

def get_completion(
self, prompt: str, generation_config: GenerationConfig
) -> str:
"""Get a completion from the SciPhi API based on the provided prompt."""

logger.debug(
f"Getting completion from SciPhi API for model={generation_config.model_name}"
)
if "instruct" in generation_config.model_name:
return self.model.get_instruct_completion(
prompt, generation_config
)
else:
return self._model.get_chat_completion(
[
{
"role": "system",
"content": SciPhiLLMInterface.system_message,
},
{"role": "user", "content": prompt},
],
generation_config,
)

def get_chat_completion(
self, conversation: list[dict], generation_config: GenerationConfig
) -> str:
raise NotImplementedError(
"Chat completion not yet implemented for SciPhi."
)

@property
def model(self) -> SciPhiLLM:
return self._model
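A minimal usage sketch for the new interface; the SciPhiConfig constructor arguments and the model name are assumptions, since neither is shown in this diff:

.. code-block:: python

   from synthesizer.interface import SciPhiLLMInterface
   from synthesizer.llm import GenerationConfig, SciPhiConfig

   config = SciPhiConfig()  # constructor arguments are an assumption
   interface = SciPhiLLMInterface(config)

   generation_config = GenerationConfig(
       model_name="SciPhi/Sensei-7B-V1",  # illustrative; any non-"instruct" name
   )
   # Non-"instruct" model names route through the chat path, which wraps the
   # prompt in a system + user exchange before calling the SciPhi API.
   print(interface.get_completion("What is entropy?", generation_config))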
1 change: 1 addition & 0 deletions synthesizer/interface/rag/agent_search.py
@@ -2,6 +2,7 @@
from dataclasses import dataclass

from agent_search.core import SERPClient

from synthesizer.core import RAGProviderName
from synthesizer.interface.base import RAGInterface, RAGProviderConfig
from synthesizer.interface.rag_interface_manager import (
6 changes: 4 additions & 2 deletions synthesizer/llm/__init__.py
@@ -1,17 +1,17 @@
- from synthesizer.llm.base import LLM, GenerationConfig, LLMConfig, ModelName
+ from synthesizer.llm.base import LLM, GenerationConfig, LLMConfig
from synthesizer.llm.config_manager import LLMConfigManager
from synthesizer.llm.models.anthropic_llm import AnthropicConfig, AnthropicLLM
from synthesizer.llm.models.hugging_face_llm import (
HuggingFaceConfig,
HuggingFaceLLM,
)
from synthesizer.llm.models.openai_llm import OpenAIConfig, OpenAILLM
+ from synthesizer.llm.models.sciphi_llm import SciPhiConfig, SciPhiLLM
from synthesizer.llm.models.vllm_llm import vLLM, vLLMConfig

__all__ = [
# Base
"LLM",
"ModelName",
"LLMConfig",
"LLMConfigManager",
"GenerationConfig",
@@ -22,6 +22,8 @@
"HuggingFaceLLM",
"OpenAIConfig",
"OpenAILLM",
"SciPhiConfig",
"SciPhiLLM",
"vLLMConfig",
"vLLM",
]
25 changes: 0 additions & 25 deletions synthesizer/llm/base.py
@@ -1,36 +1,11 @@
"""Base classes for language model providers."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, fields
from enum import Enum
from typing import Optional

from synthesizer.core import LLMProviderName


class ModelName(Enum):
"""An enum to hold the names of supported models."""

# OpenAI Models

## GPT-3.5
GPT_3p5_TURBO_0301 = "gpt-3.5-turbo-0301"
GPT_3p5_TURBO_0613 = "gpt-3.5-turbo-0613"
GPT_3p5_TURBO_16k_0613 = "gpt-3.5-turbo-16k-0613"
GPT_3p5_TURBO = "gpt-3.5-turbo"
GPT_3p5_TURBO_INSTRUCT = "gpt-3.5-turbo-instruct"

## GPT-4
GPT_4_0314 = "gpt-4-0314"
GPT_4_0613 = "gpt-4-0613"
GPT_4 = "gpt-4"
GPT_4_32k = "gpt-4-32k"

# Anthropic Models

CLAUDE_INSTANT_1 = "claude-instant-1"
CLAUDE_2 = "claude-2"


@dataclass
class LLMConfig(ABC):
provider_name: LLMProviderName