In [1]:
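# Dependencies -- uncomment to install:
# !pip install langchain chromadb ctransformers transformers sentence_transformers
# Apple silicon: reinstall ctransformers from source with Metal enabled
# (-y skips pip's confirmation prompt, which would otherwise block the cell):
# !pip uninstall -y ctransformers
# !CT_METAL=1 pip install ctransformers --no-binary ctransformers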

In [2]:
from pathlib import Path
from pprint import pprint

# Data locations, relative to the notebook's working directory.
DATA_DIR = Path("../data")
SNAPSHOTS_DIR = DATA_DIR.joinpath("platform-docs-snapshots")
VERSIONS_DIR = DATA_DIR.joinpath("platform-docs-versions")

# Hyper-parameters.
chunk_size = 1024
chunk_overlap = chunk_size // 10  # one tenth of the chunk size, rounded down
# Embedding model; MTEB leaderboard: https://huggingface.co/spaces/mteb/leaderboard
embedder_name = "BAAI/bge-small-en-v1.5"

# Load data

In [3]:
from data import get_training_data

# Fetch the training triples; only the queries are used in this cell.
train_queries, train_answers, train_context = get_training_data()

# Demo batch: the first and the last training query.
query = [train_queries[position] for position in (0, -1)]
query

['Écris-moi une requête curl. Cette requête doit me permettre d\'obtenir, auprès de Facebook, les publicités politiques contenant le mot "europe" et qui ont atteint la France et la Belgique. La réponse ne doit contenir que le code de la requête curl.',
 'How is content moderation carried out on X?']

In [4]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load, as plain text, every Markdown file matching `<dir>/<file>.md` under
# VERSIONS_DIR, where neither the directory nor the file name starts with a dot.
# The encoding is pinned to UTF-8: TextLoader otherwise falls back to the
# platform default, which can garble or reject the accented (French) documents
# on machines whose locale is not UTF-8.
docs = DirectoryLoader(
    VERSIONS_DIR,
    glob="[!.]*/[!.]*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
).load()
# Defensive filter: drop the top-level README should the loader ever pick it up.
docs = [d for d in docs if Path(d.metadata['source']) != VERSIONS_DIR / "README.md"]

# Load Retriever, build index

In [5]:
from retrievers.chroma_dpr import ChromaRetriever
import re

# Name under which the documents are indexed.
collection_name = "DSA"

# Create the retriever and build its index over the loaded documents.
retriever = ChromaRetriever()
retriever.build(docs, collection_name)

In [6]:
# Show the demo questions, then query the index with k=1 for the whole batch.
pprint(query)

retriever.query(query, k=1)

["Écris-moi une requête curl. Cette requête doit me permettre d'obtenir, "
 'auprès de Facebook, les publicités politiques contenant le mot "europe" et '
 'qui ont atteint la France et la Belgique. La réponse ne doit contenir que le '
 'code de la requête curl.',
 'How is content moderation carried out on X?']


{'ids': [['437eb619f8a8442184a6363ad1fc066f'],
  ['ae37b11903bf4d1e861340a5409fbe24']],
 'distances': [[1.6061259508132935], [1.5492799282073975]],
 'metadatas': [[{'source': '../data/platform-docs-versions/TikTok_Transparency/Community Guidelines Enforcement Reports.md'}],
  [{'source': '../data/platform-docs-versions/Facebook_Community-Report/Community Standards Enforcement Report.md'}]],
 'embeddings': None,
 'documents': [['**Définitions**\n\n* **Les réseaux ont opéré depuis :** indique l’emplacement géographique des opérations du réseau selon des preuves techniques et comportementales récoltées depuis des sources privées et ouvertes. Il est possible que TikTok ne soit pas en mesure de relier ces réseaux à des entités, des individus ou des groupes définis.\n* **Source de la détection :** considérée comme interne quand la présence de l’activité a été identifiée uniquement via une enquête menée en interne. La détection externe renvoie à des enquêtes qui ont débuté avec un rapport ext

In [7]:
# Ad-hoc probe of the index with a single English question (k=1).
results = retriever.query(query=["How do I get activity logs from the Twitter API?"], k=1)

In [8]:
# Display the raw result returned by the single-question query.
results

{'ids': [['1973f7cb76834c268c8a273b4cb4ede4']],
 'distances': [[1.625510334968567]],
 'metadatas': [[{'source': '../data/platform-docs-versions/X_Twitter-API-V2/Tweets.md'}]],
 'embeddings': None,
 'documents': [['|     |     |     |\n| --- | --- | --- |\n| **Description** | **Standard v1.1** | **Twitter API v2** |\n| Documentation | [API Reference](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline.html) | [API Reference](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets.html) |\n| HTTP methods supported | GET | GET |\n| Host domain | https://api.twitter.com | https://api.twitter.com |\n| Endpoint paths | /1.1/statuses/user\\_timeline.json | /2/users/:id/tweets |\n| Required parameters | user\\_id or screen\\_name | User ID set as path parameter :id |\n| Authentication | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only<br><

In [9]:
# Source metadata of the retrieved hit(s).
pprint(results["metadatas"])
# Text of the first passage returned for the first question.
results["documents"][0][0]

[[{'source': '../data/platform-docs-versions/X_Twitter-API-V2/Tweets.md'}]]


'|     |     |     |\n| --- | --- | --- |\n| **Description** | **Standard v1.1** | **Twitter API v2** |\n| Documentation | [API Reference](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline.html) | [API Reference](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets.html) |\n| HTTP methods supported | GET | GET |\n| Host domain | https://api.twitter.com | https://api.twitter.com |\n| Endpoint paths | /1.1/statuses/user\\_timeline.json | /2/users/:id/tweets |\n| Required parameters | user\\_id or screen\\_name | User ID set as path parameter :id |\n| Authentication | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only<br><br>OAuth 2.0 Authorization Code with PKCE |'

# Load Generator

### Quantized Mistral 7B, finetuned on code instructions
- https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF#provided-files

In [10]:
from IPython.display import display, Markdown

def print_chat(chat_log):
    """Render a chat transcript as Markdown, omitting system messages.

    Args:
        chat_log: Iterable of message mappings, each providing a 'role' and a
            'content' entry.

    Every message whose role is not 'system' is displayed as its capitalized
    role in bold, followed by the message content on the next line.
    """
    visible_messages = (msg for msg in chat_log if msg['role'] != 'system')
    for msg in visible_messages:
        speaker = msg['role'].capitalize()
        display(Markdown(f"**{speaker}:** \n{msg['content']}\n"))

In [11]:
from generators.mistral import MistralRAGGenerator

# Instantiate the RAG generator.
# NOTE(review): presumably this loads the quantized OpenHermes-2.5 Mistral 7B
# weights linked in the section header -- confirm in generators/mistral.py.
rag_generator = MistralRAGGenerator()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Query the index with k=5 for the whole batch of demo questions.
k = 5
results = retriever.query(query, k)

# Keep only the retrieved passage texts; they are handed to the generator as context.
context = results["documents"]

In [13]:
# Run the generator over the batch of questions and their retrieved passages.
# NOTE(review): assumes generate_batch pairs context[i] with query[i] and returns
# one chat log per question -- confirm against the generator implementation.
chat_logs = rag_generator.generate_batch(context, query)

  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# For evaluation: take the content of the last message of each chat log as the
# answer (presumably the assistant's reply -- verify against the generator).
answer = [c[-1]["content"] for c in chat_logs]

# Print every question with its answer, using consistent "Q:" / "A:" labels.
for q, a in zip(query, answer):
    print("Q:", q)
    print("A:", a)

Q: Écris-moi une requête curl. Cette requête doit me permettre d'obtenir, auprès de Facebook, les publicités politiques contenant le mot "europe" et qui ont atteint la France et la Belgique. La réponse ne doit contenir que le code de la requête curl.
A ```bash
curl -G -d "q=Europe&place=FR,BE&type=political_ads" https://graph.facebook.com/v12.0/adstories
```
Q: How is content moderation carried out on X?
A 


In [15]:
# Inspect the complete message list (system prompt included) of the last chat log.
chat_logs[-1]

[{'role': 'system',
  'content': '\nYou are a QA assistant that answers questions based only on the context you are given.\n\nExamples:\nQ: I am a coding expert. How could I get more info about a Facebook post whose url slug ends with 123456789_123456789? I have a crowdtangle API token (TOKEN). Your output is a bash code snippet.\nA: ```bash\ncurl -L https://api.crowdtangle.com/post/123456789_123456789?token=<CROWDTANGLE_API_TOKEN>\n```\n\nNext section will contain the context. You can use it to formulate the answer to the user question.\n------------------\nCONTEXT 0\nTerrorism\n\nOrganized Hate\n\nHow we calculate it\n\nContent actioned is the total number of pieces of content that Facebook took action on for terrorism and organized hate. It includes both content we actioned after someone reported it, and content that we found proactively.\n\n[Read about this data](https://transparency.fb.com/policies/improving/content-actioned-metric/)\n\nproactive rate\n--------------\n\nOf the vio