In [1]:
# !pip install langchain chromadb ctransformers transformers sentence_transformers
# Apple silicon:
# !pip uninstall ctransformers
# !CT_METAL=1 pip install ctransformers --no-binary ctransformers

In [2]:
from pathlib import Path
from pprint import pprint

DATA_DIR = Path("../data")
SNAPSHOTS_DIR = DATA_DIR / "platform-docs-snapshots"
VERSIONS_DIR = DATA_DIR / "platform-docs-versions"

# hparams
chunk_size = 1024
chunk_overlap = chunk_size // 10
embedder_name = "BAAI/bge-small-en-v1.5" # https://huggingface.co/spaces/mteb/leaderboard

# Pre-processing

In [3]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

docs = DirectoryLoader(VERSIONS_DIR, glob="[!.]*/[!.]*.md", loader_cls=TextLoader).load()
docs = [d for d in docs if Path(d.metadata['source']) != VERSIONS_DIR / "README.md"]

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
docs_split = text_splitter.split_documents(docs)

print(len(docs))
print(len(docs_split))

139
11598


# Build index

In [6]:
import uuid
from tqdm.notebook import tqdm
import chromadb
from chromadb.utils import embedding_functions
import re

# Chroma uses all-MiniLM-L6-v2 by default
chroma_client = chromadb.PersistentClient()
 
def format_model_name(name):
    # chromaDB only allows these characters
    return re.sub(r'[^a-zA-Z0-9_-]', '_', name)

collection_name = f"DSA_{format_model_name(embedder_name)}"

try:
    collection = chroma_client.get_collection(collection_name)
except ValueError:
    print("Building collection...")
    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(embedder_name=embedder_name)

    collection = chroma_client.create_collection(name=collection_name, embedding_function=embedding_fn)
    
    text = [d.page_content for d in docs_split]
    metadatas = [d.metadata for d in docs_split]
    ids = [uuid.uuid4().hex for _ in range(len(docs_split))]
    
    collection.add(
        documents=text,
        metadatas=metadatas,
        ids=ids,
    )

In [7]:
results = collection.query(
    query_texts=["How do I get activity logs from the Twitter API?"],
    n_results=5,
)

In [8]:
pprint(results["metadatas"])
results["documents"][0][0]

[[{'source': '../data/platform-docs-versions/X_Twitter-API-V1/Tweets.md'},
  {'source': '../data/platform-docs-versions/X_Twitter-API-V2/Tweets.md'},
  {'source': '../data/platform-docs-versions/X_Twitter-API-V1/Tweets.md'},
  {'source': '../data/platform-docs-versions/X_Twitter-API-V2/Users.md'},
  {'source': '../data/platform-docs-versions/X_Twitter-API-V2/Tweets.md'}]]


'Working with timelines  \n\n-------------------------\n\nThe Twitter API has several methods, such as\xa0[GET statuses / user\\_timeline](https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html)\xa0and\xa0[GET statuses / home\\_timeline](https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-home_timeline.html), which return a timeline of Tweet data. Such timelines can grow very large, so there are limits to how much of a timeline a client application may fetch in a single request. Applications must therefore iterate through timeline results in order to build a more complete list.\n\nBecause of Twitter’s realtime nature and the volume of data which is constantly being added to timelines, standard paging approaches are not always effective. The goal of this page is to demonstrate the issues Twitter developers may face when paging through result sets and to give best practices for processing a timeline.\n\n### The p

# Load Generator

### Quantized Mistral 7B, finetuned on code instructions
- https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF#provided-files

In [9]:
from ctransformers import AutoModelForCausalLM, AutoConfig
from transformers import AutoTokenizer
from typing import List, Dict

gpu_layers = 0

model_file = "openhermes-2.5-mistral-7b-16k.Q4_K_M.gguf"

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF",
    model_file="openhermes-2.5-mistral-7b-16k.Q4_K_M.gguf",
    model_type="mistral", 
    gpu_layers=gpu_layers,
    max_new_tokens=4000,
    context_length=16_000,
)

tokenizer = AutoTokenizer.from_pretrained("NurtureAI/OpenHermes-2.5-Mistral-7B-16k")

def format_prompt(messages: List[Dict[str, str]]) -> str:
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# RAG Query

In [18]:
system_message = '''
You are a QA assistant that answers questions based only on the context you are given.
- Give an  answer only based on the context and with no prior knowledge.
- You can only give answers to subjects that are provided in the CONTEXT section.
- If you cannot come up with an answer to the user question, answer that you do not know the answer.
- Your answers are short, complete and easy to follow. Include code examples if neccessary.

Example Questions:
Q: I have access to a set of tweets URLs that I consider to be hateful. How can I use Twitter's API to monitor the average duration between the tweet's creation and its moderation?
A: """In order to monitor the average duration between a tweet's publication and its moderation using Twitter's API, you could do the following:
- Gather the unsafe tweet's IDs;
- Post them on Twitter's batch compliance API;
- Once the result is ready, you can see whether the tweets have been deleted.
In order to monitor the delay of moderation, you could repeat the previous steps several times, and check when the prohibited tweets have been acted upon."""

Q: I am a coding expert. How could I get more info about a Facebook post whose url slug ends with 123456789_123456789? I have a crowdtangle API token (TOKEN). Your output is a bash code snippet.
A: ```bash
curl -L https://api.crowdtangle.com/post/123456789_123456789?token=<CROWDTANGLE_API_TOKEN>
```

------------------
CONTEXT
{context}
'''

In [19]:
def init_chat_log(context: str):
    return [
        {"role": "system", "content": system_message.format(context=context)},
    ]

def generate_answer(chat_log: List[Dict[str, str]], query: str):
    if query:
        chat_log.append({"role": "user", "content": query})
    if chat_log[-1]["role"] != "user":
        raise ValueError("query required")
            
    prompt = format_prompt(chat)
    output = llm(prompt, stop=["<|im_end|>"])
    chat_log.append({"role": "assistant", "content": output})
    return chat_log

from IPython.display import display, Markdown

def print_chat(chat_log):
    for entry in chat_log:
        if entry['role'] != 'system':
            display(Markdown(f"**{entry['role'].capitalize()}:** \n{entry['content']}\n"))

In [20]:
k = 5

query = "Tell me how to work with WhatsApp business accounts using the Facebook Graph API"
results = collection.query(
    query_texts=[query],
    n_results=k,
)

contexts = []
for i in range(k):
    source_i = results["metadatas"][0][i]["source"]
    doc_i = results["documents"][0][i]
    contexts.append(f"CONTEXT {i}\nFILENAME: {source_i}\n{doc_i}")

context = "\n\n".join(contexts)
print(context)

CONTEXT 0
FILENAME: ../data/platform-docs-versions/Facebook_Graph-API/Graph API Overview.md
#### X-Ad-Account-Usage Header Contents

| Key | Value Description |
| --- | --- |
| `acc_id_util_pct` | The percentage of calls made for this ad account before the rate limit is reached. |
| `reset_time_duration` | Time duration (in seconds) it takes to reset the current rate limit to 0. |
| `ads_api_access_tier` | Tiers allows your app to access the Marketing API. By default, apps are in the `development_access` tier. `Standard_access` enables lower rate limiting. To get a higher rate limit and get to the standard tier, you can apply for the "Advanced Access" to the [Ads Management Standard Access](https://developers.facebook.com/docs/marketing-api/overview/authorization#layer-2--access-levels--permissions--and-features) feature. |

#### Total CPU Time

The amount of CPU time the request takes to process. When `total_cputime` reaches 100, calls may be throttled.

#### Total Time

The length of

In [21]:
chat = init_chat_log(context)
chat = generate_answer(chat, query)
print_chat(chat)

**User:** 
Tell me how to work with WhatsApp business accounts using the Facebook Graph API


**Assistant:** 
To work with WhatsApp Business Accounts using the Facebook Graph API, you need to have the following permissions: `whatsapp_business_management`, `whatsapp_messaging`, and `public_profile`. You also need a WhatsApp Business Account (WABA) ID and a user access token.

To retrieve information about a WhatsApp Business Account, you can use the following cURL command:
```bash
curl -i -X GET "https://graph.facebook.com/LATEST-VERSION/WHATSAPP-BUSINESS-ACCOUNT-ID?access_token=USER-ACCESS-TOKEN"
```
or in SDK Objective-C:
```objective-c
GraphRequest request = GraphRequest.newGraphPathRequest(
  accessToken,
  "/WHATSAPP-BUSINESS-ACCOUNT-ID",
  new GraphRequest.Callback() {
    @Override
    public void onCompleted(GraphResponse response) {
      // Insert your code here
    }
});
request.executeAsync();
```


In [22]:
chat = generate_answer(chat, query="and how do I cook pancakes?")
print_chat(chat)

**User:** 
Tell me how to work with WhatsApp business accounts using the Facebook Graph API


**Assistant:** 
To work with WhatsApp Business Accounts using the Facebook Graph API, you need to have the following permissions: `whatsapp_business_management`, `whatsapp_messaging`, and `public_profile`. You also need a WhatsApp Business Account (WABA) ID and a user access token.

To retrieve information about a WhatsApp Business Account, you can use the following cURL command:
```bash
curl -i -X GET "https://graph.facebook.com/LATEST-VERSION/WHATSAPP-BUSINESS-ACCOUNT-ID?access_token=USER-ACCESS-TOKEN"
```
or in SDK Objective-C:
```objective-c
GraphRequest request = GraphRequest.newGraphPathRequest(
  accessToken,
  "/WHATSAPP-BUSINESS-ACCOUNT-ID",
  new GraphRequest.Callback() {
    @Override
    public void onCompleted(GraphResponse response) {
      // Insert your code here
    }
});
request.executeAsync();
```


**User:** 
and how do I cook pancakes?


**Assistant:** 
I'm not a cooking assistant, but here's a basic recipe to get you started with pancake cooking. Note that I might make the batter too runny or too thick. Please adjust accordingly:

Ingredients
----------

* 240 grams plain flour (can also be AP Flour)
* 375 ml milk
* 2 eggs
* 1 tablespoon sugar
* 2 tablespoons melted butter
* 1/2 teaspoon baking powder
* Pinch of salt
* Optional: 1/2 teaspoon vanilla extract or 1 mashed ripe banana (for a bit of flavour and sweetness)

Instructions
------------

1. Mix all the ingredients together in a bowl. The batter should be the consistency of heavy cream, i.e., thick enough to coat the back of a spoon but pourable. Adjust with more flour or milk as needed.
2. Heat a non-stick pan over medium heat and add some butter.
3. Pour 1/4 cup of batter per pancake into the pan. Wait for bubbles to appear on the surface, then flip and cook until both sides are golden brown.

This is a basic recipe and can be adjusted according to your taste. You may add chocolate chips or blueberries or cinnamon sugar or whatever you like. Enjoy!
