In [1]:
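# Dependencies - uncomment to install into the kernel's environment
# (langchain-community and langdetect are imported further down as well):
# %pip install langchain langchain-community chromadb ctransformers transformers sentence_transformers langdetect
# Apple silicon (rebuild ctransformers with CT_METAL=1):
# %pip uninstall -y ctransformers
# !CT_METAL=1 pip install ctransformers --no-binary ctransformers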

In [2]:
from pathlib import Path
from pprint import pprint

# Root of the data checkout, relative to the notebook's working directory.
DATA_DIR = Path("../data")
# Sub-directories holding the platform documentation snapshots and versions.
SNAPSHOTS_DIR = DATA_DIR.joinpath("platform-docs-snapshots")
VERSIONS_DIR = DATA_DIR.joinpath("platform-docs-versions")

# Load data

In [3]:
from data import get_training_data

# Unpack the three collections returned by the project's data helper.
train_queries, train_answers, train_context = get_training_data()

# Work with a one-element batch: the third query from the end of the training set.
query = [train_queries[-3]]
query

["How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used."]

In [4]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read Markdown files as plain text. The glob selects `<dir>/<file>.md`
# entries under VERSIONS_DIR whose directory and file names do not start with a dot.
versions_loader = DirectoryLoader(VERSIONS_DIR, glob="[!.]*/[!.]*.md", loader_cls=TextLoader)
readme_path = VERSIONS_DIR / "README.md"

# Drop the README that sits directly in VERSIONS_DIR, should it have been loaded.
docs = [
    loaded
    for loaded in versions_loader.load()
    if Path(loaded.metadata['source']) != readme_path
]

In [5]:
import langdetect
from collections import Counter

# langdetect's detection is randomized by default, so the same text can be
# labelled differently between runs. Fixing the seed keeps the per-document
# metadata and the printed counts reproducible after a kernel restart.
langdetect.DetectorFactory.seed = 0

# Detect one language code per loaded document, from its full text.
languages = [langdetect.detect(d.page_content) for d in docs]

# Record the detected language on each document's metadata.
for doc, language in zip(docs, languages):
    doc.metadata["language"] = language

# Summary of how many documents were detected per language.
print(Counter(languages))

Counter({'en': 131, 'fr': 8})


# Load Retriever, build index

In [6]:
from retrievers.chroma_dpr import ChromaRetriever
from retrievers.colbert import ColbertRetriever
import re

# Use the ColBERT-based retriever (ChromaRetriever is imported above but not used in this cell).
retriever = ColbertRetriever()
# Name passed to build(); the log printed by this cell shows an index being
# loaded from .ragatouille/colbert/indexes/DSA.
collection_name = "DSA"
retriever.build(docs, collection_name)

Loading index from .ragatouille/colbert/indexes/DSA...
[Jan 30, 23:53:54] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [9]:
# Ad-hoc retrieval check with a hard-coded Twitter question (does not use `query`).
results = retriever.query(
    query=["How do I get activity logs from the Twitter API?"],
    k=5,  # presumably the number of passages to return - confirm against the retriever
)

1it [00:00, 133.97it/s]


In [10]:
# Display the raw retrieval output for inspection.
results

[{'content': "When you apply for Account Activity API access, you'll need this app ID.  \n    \xa0\n\n### 2\\. Get Account Activity API access\n\nAfter creating a Twitter app, the next step is applying for Account Activity API access.\xa0\n\nThose needing enterprise level access to more than 250 account subscriptions and 3+ webhooks will need to submit an application at the following link. If you can satisfy your use case with less access than this, you may want to check out [Account Activity API premium](https://developer.twitter.com/en/docs/twitter-api/premium/account-activity-api/overview).\xa0\n\n[Apply for Enterprise access](https://developer.twitter.com/en/enterprise-application.html)\n\n### 3\\. Develop webhook consumer app\n\nOnce you have received Account Activity API access, you need to develop, deploy and host a web app that will receive Twitter webhook events.\xa0\n\n* Create a web app with a URL to use as your webhook to receive events. This is the endpoint deployed on you

In [9]:
# NOTE(review): the output captured for the preceding `results` cell starts with
# `[{'content': ...`, i.e. a list, whereas this cell indexes `results` by string
# key. The execution counters are also out of order, so this cell may date from
# a run with a different retriever - confirm it still works on a fresh kernel.
pprint(results["metadatas"])
results["documents"][0][0]

[[{'source': '../data/platform-docs-versions/X_Twitter-API-V2/Tweets.md'}]]


'|     |     |     |\n| --- | --- | --- |\n| **Description** | **Standard v1.1** | **Twitter API v2** |\n| Documentation | [API Reference](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline.html) | [API Reference](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets.html) |\n| HTTP methods supported | GET | GET |\n| Host domain | https://api.twitter.com | https://api.twitter.com |\n| Endpoint paths | /1.1/statuses/user\\_timeline.json | /2/users/:id/tweets |\n| Required parameters | user\\_id or screen\\_name | User ID set as path parameter :id |\n| Authentication | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only | OAuth 1.0a User Context<br><br>OAuth 2.0 App-Only<br><br>OAuth 2.0 Authorization Code with PKCE |'

# Load Generator

### Quantized Mistral 7B, fine-tuned on code instructions
- https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF#provided-files

In [9]:
from IPython.display import display, Markdown

def print_chat(chat_log):
    """Render a chat transcript as Markdown, leaving out system messages.

    Args:
        chat_log: iterable of message mappings; each one is read through its
            ``'role'`` and ``'content'`` keys.

    Returns:
        None. Every non-system message is shown via ``display(Markdown(...))``
        with the capitalized role in bold, followed by the content.
    """
    for message in chat_log:
        speaker = message['role']
        if speaker == 'system':
            # System entries are not shown.
            continue
        display(Markdown(f"**{speaker.capitalize()}:** \n{message['content']}\n"))

In [10]:
from generators.mistral import MistralRAGGenerator

# Instantiate the project's Mistral RAG generator; the captured output of this
# cell shows file-fetch progress bars and tokenizer warnings at this point.
rag_generator = MistralRAGGenerator()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Second argument for retriever.query (presumably the number of passages per query - confirm).
k = 5

# Rebinds `results`, replacing the value from the earlier ad-hoc Twitter query.
results = retriever.query(query, k)

# The 'documents' entry of the result is what gets handed to the generator as context.
context = results['documents']

In [12]:
# Run generation over the retrieved context and the queries; the result is
# indexed and iterated as a sequence of chat logs in the cells that follow.
chat_logs = rag_generator.generate_batch(context, query)

  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Render the last chat log as Markdown (print_chat leaves out system entries).
print_chat(chat_logs[-1])

**User:** 
How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used.


**Assistant:** 
I do not know how to get the number of likes of a TikTok post based on the given context. The provided context is related to Twitter API and does not contain information about TikTok. For TikTok, you would need to use the TikTok Developer Documentation or an unofficial TikTok API documentation. However, in general, to get the number of likes for a TikTok post, you would typically make a request to the TikTok API endpoint that provides information about a specific post and retrieve the "likes" count from the response data. The exact parameters and methods may vary depending on the API documentation or unofficial API you are using. Please provide me with a more suitable context, specific to the API or service you would like to work with (like "TikTok", "Facebook", "Instagram", etc.).


In [13]:
# For evaluation: keep the content of the last entry of every chat log.
answer = [c[-1]["content"] for c in chat_logs]

# Print each question next to its answer. The answer label now carries the
# same trailing colon as the question label, so the output reads "Q: ..." / "A: ...".
for q, a in zip(query, answer):
    print("Q:", q)
    print("A:", a)

Q: Écris-moi une requête curl. Cette requête doit me permettre d'obtenir, auprès de Facebook, les publicités politiques contenant le mot "europe" et qui ont atteint la France et la Belgique. La réponse ne doit contenir que le code de la requête curl.
A ```bash
curl -G -d 'q=Europe&place=FR_BE&type=political_ads' https://graph.facebook.com/v13.0/act_<AD_ACCOUNT_ID>/adinsights
```
Q: How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used.
A I do not know how to get the number of likes of a TikTok post based on the given context. The provided context is related to Twitter API and does not contain information about TikTok. For TikTok, you would need to use the TikTok Developer Documentation or an unofficial TikTok API documentation. However, in general, to get the number of likes for a TikTok post, you would typically make a request to the TikTok API endpoint that provides information about a specific post and retrieve the "like