In [1]:
# !pip install langchain chromadb ctransformers transformers sentence_transformers
# Apple silicon:
# !pip uninstall ctransformers
# !CT_METAL=1 pip install ctransformers --no-binary ctransformers

In [2]:
from pathlib import Path
from pprint import pprint

DATA_DIR = Path("../data")
SNAPSHOTS_DIR = DATA_DIR / "platform-docs-snapshots"
VERSIONS_DIR = DATA_DIR / "platform-docs-versions"

# Load data

In [3]:
from data import get_training_data

train_queries, train_answers, train_context = get_training_data()

In [4]:
from translation import Translator

translator = Translator()

train_queries_translated = [translator.translate(q, 'en') for q in train_queries]
train_queries, source_languages = zip(*train_queries_translated)

In [7]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

docs = DirectoryLoader(VERSIONS_DIR, glob="[!.]*/[!.]*.md", loader_cls=TextLoader).load()
docs = [d for d in docs if Path(d.metadata['source']) != VERSIONS_DIR / "README.md"]

In [8]:
import langdetect
from collections import Counter

languages = [langdetect.detect(d.page_content) for d in docs]

for doc, language in zip(docs, languages):
    doc.metadata["language"] = language

print(Counter(languages))



Counter({'en': 131, 'fr': 8})


# Load Retriever, build index

In [10]:
from retrievers.chroma_dpr import ChromaRetriever
from retrievers.colbert import ColbertRetriever
import re

retriever = ColbertRetriever()
collection_name = "DSA"
retriever.build(docs, collection_name)

Loading index from .ragatouille/colbert/indexes/DSA...


# Load Generator

### Quantized Mistral 7B, finetuned on code instructions
- https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF#provided-files

In [11]:
from IPython.display import display, Markdown

def print_chat(chat_log):
    for entry in chat_log:
        if entry['role'] != 'system':
            display(Markdown(f"**{entry['role'].capitalize()}:** \n{entry['content']}\n"))

In [12]:
from generators.mistral import MistralRAGGenerator

rag_generator = MistralRAGGenerator()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
k = 5

queries = train_queries[:1]
source_languages = source_languages[:1]

results = retriever.query(queries, k)

context = [r['content'] for r in results]

Loading searcher for index DSA for the first time... This may take a few seconds
[Feb 02, 13:15:04] #> Loading codec...
[Feb 02, 13:15:04] #> Loading IVF...
[Feb 02, 13:15:04] Loading segmented_lookup_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Feb 02, 13:15:04] #> Loading doclens...


100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2661.36it/s]

[Feb 02, 13:15:04] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.42it/s]

[Feb 02, 13:15:04] Loading filter_pids_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...





[Feb 02, 13:15:04] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . I am writing a curl request. This request is intended to enable me to obtain from Facebook the political advertisements containing the word 'europe' which have reached France and Belgium. The answer must contain only the curl request code., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  1045,  2572,  3015,  1037, 15390,  5227,  1012,  2023,
         5227,  2003,  3832,  2000,  9585,  2033,  2000,  6855,  2013,  9130,
         1996,  2576, 14389,  4820,  1996,  2773,  1005,  2885,  1005,  2029,
         2031,   102])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])



1it [00:00, 110.58it/s]


In [17]:
chat_logs = rag_generator.generate_batch(context, queries)

  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
answers = [c[-1]["content"] for c in chat_logs]



In [21]:
answers = [translator.translate(ans, lang)[0] for ans, lang in zip(answers, source_languages)]

In [22]:
# For evaluation

for q, a in zip(queries, answers):
    print("Q:", q)
    print("A", a)

Q: I am writing a curl request. This request is intended to enable me to obtain from Facebook the political advertisements containing the word 'europe' which have reached France and Belgium. The answer must contain only the curl request code.
A Pour obtenir une liste des annonces politiques sur Facebook qui contiennent le mot « Europe » et qui ont atteint tant la France que la Belgique, vous pouvez utiliser le commandement curl suivant :
` `battre
curl -X GET -G 'https://graph.facebook.com/v13.0/act_<AD_ACCOUNT_ID>/adsinsights?fields=name,adtype,reach_locations&location=FR-fr,BE-nl,location_search_text=europe' --header "Access_Token: <ACCESS_TOKEN"
` ` ` ` ` ` ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́