In [1]:
# !pip install langchain chromadb ctransformers transformers sentence_transformers
# Apple silicon:
# !pip uninstall ctransformers
# !CT_METAL=1 pip install ctransformers --no-binary ctransformers

In [2]:
from pathlib import Path
from pprint import pprint

# Data locations, expressed relative to the directory the notebook runs from
DATA_DIR = Path("../data")
SNAPSHOTS_DIR = DATA_DIR.joinpath("platform-docs-snapshots")
VERSIONS_DIR = DATA_DIR.joinpath("platform-docs-versions")

# Hyper-parameters: the overlap is a tenth of the chunk size, rounded down
chunk_size = 1024
chunk_overlap = chunk_size // 10
# Embedding model identifier; candidates are ranked on https://huggingface.co/spaces/mteb/leaderboard
embedder_name = "BAAI/bge-small-en-v1.5"

# Load data

In [29]:
from data import get_training_data

# Labelled examples: questions, reference answers and reference source URLs.
train_queries, train_answers, train_context = get_training_data()

# Work on the third question from the end; it is wrapped in a list because
# the retriever and generator calls below are given a list of questions.
selected_question = train_queries[-3]
query = [selected_question]
query

["How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used."]

In [41]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the Markdown files matched by the glob under VERSIONS_DIR as plain text;
# the pattern excludes directories and files whose name starts with a dot.
markdown_loader = DirectoryLoader(VERSIONS_DIR, glob="[!.]*/[!.]*.md", loader_cls=TextLoader)

# Leave out the top-level README, should it ever be picked up.
readme_path = VERSIONS_DIR / "README.md"
docs = [doc for doc in markdown_loader.load() if Path(doc.metadata['source']) != readme_path]

In [43]:
import langdetect
from collections import Counter

# langdetect is non-deterministic by default, so short or mixed-language
# documents may be labelled differently from one run to the next. Fixing the
# seed before the first detection keeps the tags and the counts reproducible.
langdetect.DetectorFactory.seed = 0

languages = [langdetect.detect(d.page_content) for d in docs]

# Store the detected language code in each document's metadata.
for doc, language in zip(docs, languages):
    doc.metadata["language"] = language

print(Counter(languages))

Counter({'en': 131, 'fr': 8})


# Load Retriever, build index

In [31]:
from retrievers.chroma_dpr import ChromaRetriever
import re

# Index the loaded documents into a collection named "DSA".
collection_name = "DSA"
retriever = ChromaRetriever()
retriever.build(docs, collection_name)

In [6]:
# Show the current batch of questions, then query the retriever with k=1
# (the output below holds one hit per question).
pprint(query)

retriever.query(query, k=1)

["Écris-moi une requête curl. Cette requête doit me permettre d'obtenir, "
 'auprès de Facebook, les publicités politiques contenant le mot "europe" et '
 'qui ont atteint la France et la Belgique. La réponse ne doit contenir que le '
 'code de la requête curl.',
 'How is content moderation carried out on X?']


{'ids': [['01ac1023d09d4e2ab61d45e94ce4c06c'],
  ['52d8347e2fd74663ab331cea0b6d02c9']],
 'distances': [[1.606126070022583], [1.5231363773345947]],
 'metadatas': [[{'source': '../data/platform-docs-versions/TikTok_Transparency/Community Guidelines Enforcement Reports.md'}],
  [{'source': '../data/platform-docs-versions/X_DSA-Transparency-Report/DSA Transparency Report.md'}]],
 'embeddings': None,
 'documents': [['**Définitions**\n\n* **Les réseaux ont opéré depuis :** indique l’emplacement géographique des opérations du réseau selon des preuves techniques et comportementales récoltées depuis des sources privées et ouvertes. Il est possible que TikTok ne soit pas en mesure de relier ces réseaux à des entités, des individus ou des groupes définis.\n* **Source de la détection :** considérée comme interne quand la présence de l’activité a été identifiée uniquement via une enquête menée en interne. La détection externe renvoie à des enquêtes qui ont débuté avec un rapport externe, ayant ensu

In [7]:
# Retrieve the best match for a single hand-written question.
twitter_question = "How do I get activity logs from the Twitter API?"
results = retriever.query(query=[twitter_question], k=1)

In [8]:
# Raw retriever response: a dict with 'ids', 'distances', 'metadatas', 'embeddings' and 'documents' entries.
results

{'ids': [['59d914610861410dbfbab1452c3a8519']],
 'distances': [[1.5914931297302246]],
 'metadatas': [[{'source': '../data/platform-docs-versions/X_Twitter-API-Enterprise/Account Activity API.md'}]],
 'embeddings': None,
 'documents': [['####\n\n# Resource URL: https://developer.twitter.com/en/docs/twitter-api/enterprise/account-activity-api/guides/authenticating-with-the-account-activity-api\nAuthenticating with the Account Activity API\n\n**Please note**:\xa0\n\nTwitter needs to enable access to the Account Activity API for your developer App before you can start using the API. To this end, make sure to share the App ID that you intend to use for authentication purposes with your account manager or technical support team.\n\nThe [Account Activity API](https://developer.twitter.com/en/docs/twitter-api/enterprise/account-activity-api/overview) consists of a set of endpoints that allow you to create and manage user subscriptions to receive real-time account activities for all of your sub

In [9]:
# Metadata (source file) of the hits, then the text of the first retrieved chunk for the first question.
pprint(results["metadatas"])
results["documents"][0][0]

[[{'source': '../data/platform-docs-versions/X_Twitter-API-Enterprise/Account '
             'Activity API.md'}]]


'####\n\n# Resource URL: https://developer.twitter.com/en/docs/twitter-api/enterprise/account-activity-api/guides/authenticating-with-the-account-activity-api\nAuthenticating with the Account Activity API\n\n**Please note**:\xa0\n\nTwitter needs to enable access to the Account Activity API for your developer App before you can start using the API. To this end, make sure to share the App ID that you intend to use for authentication purposes with your account manager or technical support team.\n\nThe [Account Activity API](https://developer.twitter.com/en/docs/twitter-api/enterprise/account-activity-api/overview) consists of a set of endpoints that allow you to create and manage user subscriptions to receive real-time account activities for all of your subscribed accounts through a single connection.'

# Load Generator

### Quantized Mistral 7B, finetuned on code instructions
- https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF#provided-files

In [9]:
from IPython.display import display, Markdown

def print_chat(chat_log):
    """Render a chat log as Markdown, one block per message.

    Args:
        chat_log: iterable of dicts carrying a 'role' and a 'content' key.
            Messages whose role is 'system' are not displayed.
    """
    visible_turns = (turn for turn in chat_log if turn['role'] != 'system')
    for turn in visible_turns:
        # The role, capitalized, is shown in bold as the speaker label.
        speaker = turn['role'].capitalize()
        display(Markdown(f"**{speaker}:** \n{turn['content']}\n"))

In [10]:
from generators.mistral import MistralRAGGenerator

# Build the generator imported from generators.mistral.
# NOTE(review): the cell output suggests this fetches a model file when needed — confirm.
rag_generator = MistralRAGGenerator()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
k = 5  # passed as the second argument of retriever.query, here and in the answer-generation loop further down

results = retriever.query(query, k)

# Retrieved document texts; judging by the retriever output above, one inner list per question in `query`.
context = results['documents']

In [12]:
# Generate the chat logs for the batch of questions from the retrieved context.
chat_logs = rag_generator.generate_batch(context, query)

  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Render the last chat log of the batch (print_chat leaves out the system message).
print_chat(chat_logs[-1])

**User:** 
How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used.


**Assistant:** 
I do not know how to get the number of likes of a TikTok post based on the given context. The provided context is related to Twitter API and does not contain information about TikTok. For TikTok, you would need to use the TikTok Developer Documentation or an unofficial TikTok API documentation. However, in general, to get the number of likes for a TikTok post, you would typically make a request to the TikTok API endpoint that provides information about a specific post and retrieve the "likes" count from the response data. The exact parameters and methods may vary depending on the API documentation or unofficial API you are using. Please provide me with a more suitable context, specific to the API or service you would like to work with (like "TikTok", "Facebook", "Instagram", etc.).


In [13]:
# For evaluation: keep only the content of the final message of every chat log.
answer = [log[-1]["content"] for log in chat_logs]

# Print each question next to the answer generated for it.
for question, reply in zip(query, answer):
    print("Q:", question)
    print("A", reply)

Q: Écris-moi une requête curl. Cette requête doit me permettre d'obtenir, auprès de Facebook, les publicités politiques contenant le mot "europe" et qui ont atteint la France et la Belgique. La réponse ne doit contenir que le code de la requête curl.
A ```bash
curl -G -d "q=political%20ads%20euurope&country_codes=FR,BE" https://graph.facebook.com/v13.0/act_4581848968367609/adinsights
```
Q: How to get the number of likes of a TikTok post. I'd like an example query and an explanation of the parameters used.
A I do not know how to get the number of likes of a TikTok post based on the given context. The provided context is related to Twitter API and does not contain information about TikTok. For TikTok, you would need to use the TikTok Developer Documentation or an unofficial TikTok API documentation. However, in general, to get the number of likes for a TikTok post, you would typically make a request to the TikTok API endpoint that provides information about a specific post and retrieve 

In [15]:
# Inspect the full last chat log, including the system message that print_chat hides.
chat_logs[-1]

[{'role': 'system',
  'content': '\nYou are a QA assistant that answers questions based only on the context you are given.\n\nExamples:\nQ: I am a coding expert. How could I get more info about a Facebook post whose url slug ends with 123456789_123456789? I have a crowdtangle API token (TOKEN). Your output is a bash code snippet.\nA: ```bash\ncurl -L https://api.crowdtangle.com/post/123456789_123456789?token=<CROWDTANGLE_API_TOKEN>\n```\n\nNext section will contain the context. You can use it to formulate the answer to the user question.\n------------------\nCONTEXT 0\nCLOSING STATEMENT ON CONTENT MODERATION ACTIVITIES\n\nOur content moderation systems are designed and tailored to mitigate systematic risks without unnecessarily restricting the use of our service and fundamental rights, especially freedom of expression. Content moderation activities are implemented and anchored on principled policies and leverage a diverse set of interventions to ensure that our actions are reasonable, 

In [17]:
# All training questions (a mix of French and English).
train_queries

['Écris-moi une requête curl. Cette requête doit me permettre d\'obtenir, auprès de Facebook, les publicités politiques contenant le mot "europe" et qui ont atteint la France et la Belgique. La réponse ne doit contenir que le code de la requête curl.',
 'Comment savoir si une campagne publicitaire sur Facebook a dévié de sa cible prévue ? La réponse doit contenir un lien vers la documentation des API de Facebook au sujet des publicités. La réponse doit nommer les champs les plus pertinents disponibles via cette API.',
 'Où puis-je trouver le centre de transparence de Linkedin ? Veuillez indiquer une seule URL.',
 'How many Italian government data requests did LinkedIn receive in 2022? Please provide the URL of the source.',
 'How many languages spoken in the EU are spoken by the Pinterest team ? Can you specify the link where this information is mentioned ?',
 'I am a coding expert. How could I get more info about a Facebook post whose url slug ends with 123456789_123456789? I have a c

In [18]:
# Reference answers; each entry is itself a list of strings.
train_answers

[['```curl -G -d "search_terms=\\\'europe\\\'" -d "ad_type=POLITICAL_AND_ISSUE_ADS" -d "ad_reached_countries=[\\\'FR\\\',\\\'BE\\\']" -d "access_token=<ACCESS_TOKEN>" "https://graph.facebook.com/<API_VERSION>/ads_archive"```'],
 ['L\'API "Ad Library" de Meta permet de connaître la cible prévue pour une campagne publicitaire, et permet d\'obtenir des informations démographiques sur les personnes ayant été effectivement touchées par cette campagne. La documentation de cette API se trouve à l\'adresse suivante : https://www.facebook.com/ads/library/api/. Voici quelques champs pertinents :\n- \'target_ages\', \'target_gender\', \'target_locations\' : ces champs permettent de comprendre la cible prévue pour la campagne.\n- \'impressions\', \'eu_total_reach\', \'estimated_audience_size\', \'demographic_distribution\', \'age_country_gender_reach_breakdown\' : ces champs permettent d\'évaluer la performance de la campagne et le profil socio-démographique des personnes ayant interagi avec cette

In [19]:
# Reference source URLs — presumably aligned by index with train_queries; verify in data.get_training_data.
train_context

['https://developers.facebook.com/docs/graph-api/reference/ads_archive/',
 'https://www.facebook.com/ads/library/api/',
 'https://about.linkedin.com/transparency',
 'https://about.linkedin.com/transparency/government-requests-report',
 'https://policy.pinterest.com/en/digital-services-act-transparency-report',
 'https://help.crowdtangle.com/en/articles/3443476-api-cheat-sheet',
 'https://developer.twitter.com/en/docs/twitter-api/compliance/batch-compliance/introduction',
 'https://developers.tiktok.com/doc/research-api-specs-query-video-comments',
 'https://values.snap.com/fr-FR/privacy/transparency/european-union',
 'https://transparency.twitter.com/dsa-transparency-report.html']

In [49]:
# Answer a slice of the training questions, keeping the metadata of the
# retrieved documents next to each generated answer.
#
# The loop uses its own local names on purpose: reusing `query`, `answer`,
# `results` or `context` here would overwrite the notebook-level variables of
# the same name (for instance `query` would turn from a list of questions into
# a single string), and re-running an earlier cell afterwards would then
# silently operate on the wrong data.
answers = []
for train_query in train_queries[5:6]:
    retrieved = retriever.query(train_query, k)
    # Index 0: a single question was sent, so only the first result list is used.
    chat_log = rag_generator.generate_answer(retrieved['documents'][0], train_query)
    answers.append({
        'source_documents': retrieved['metadatas'][0],
        # The final message of the chat log is taken as the generated answer.
        'content': chat_log[-1]["content"],
    })

In [50]:
# Generated answers together with the metadata of the documents retrieved for them.
answers

[{'source_documents': [{'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'},
   {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'},
   {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'},
   {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'},
   {'source': '../data/platform-docs-versions/Facebook_Graph-API/Graph API Overview.md'}],
  'content': '```bash\ncurl -L https://api.crowdtangle.com/post/123456789_123456789?token=TOKEN\n```'}]

In [51]:
from rouge_score import rouge_scorer
# Evaluate the generated answers against the labelled training examples:
#   * ref_score counts the answers whose top-ranked source document carries
#     the reference URL;
#   * total_rouge_score sums, per ROUGE variant, the best F-measure obtained
#     over the reference answers of each example.
rouge_metrics = ['rouge1', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

# `answers` was generated from train_queries[5:6], so answers[0] belongs to
# training example 5. Keep this offset in sync with that slice, otherwise the
# answers are compared with the references of other questions.
first_train_idx = 5

# Header expected on the first line of each source file; the retrieved chunks
# shown earlier contain lines of the form "# Resource URL: <url>".
url_prefix = "# Resource URL: "

ref_score = 0
# One running total per metric: scorer.score returns a dict of Score tuples,
# which cannot be added to a plain number.
total_rouge_score = {metric: 0.0 for metric in rouge_metrics}
for idx, answer in enumerate(answers, start=first_train_idx):
    ref_answer = train_answers[idx]
    ref_context = train_context[idx]

    context_links = []
    print(answer['source_documents'])
    for context_source in answer['source_documents']:
        context_file = context_source['source']
        print(context_file)
        with open(context_file, encoding="utf-8") as f:
            first_line = f.readline().strip()
        # Strip the header prefix when present rather than slicing a fixed
        # number of characters, which breaks on a missing newline or prefix.
        if first_line.startswith(url_prefix):
            first_line = first_line[len(url_prefix):]
        context_links.append(first_line)

    # Retrieval hit when the best-ranked document points at the reference URL.
    if context_links and context_links[0] == ref_context:
        ref_score += 1

    # A training answer may hold several reference strings; accept a bare string too.
    references = [ref_answer] if isinstance(ref_answer, str) else list(ref_answer)
    # RougeScorer.score expects (target, prediction): the reference comes first.
    reference_scores = [scorer.score(reference, answer['content']) for reference in references]
    for metric in rouge_metrics:
        total_rouge_score[metric] += max(
            (scores[metric].fmeasure for scores in reference_scores),
            default=0.0,
        )

    print(context_links)
    print(ref_context)

[{'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'}, {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'}, {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'}, {'source': '../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md'}, {'source': '../data/platform-docs-versions/Facebook_Graph-API/Graph API Overview.md'}]
../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md
../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md
../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md
../data/platform-docs-versions/Facebook_Graph-API/Canvas Endpoint.md
../data/platform-docs-versions/Facebook_Graph-API/Graph API Overview.md
['https://developers.facebook.com/docs/graph-api/reference/canvas/', 'https://developers.facebook.com/docs/graph-api/reference/canvas/', 'https://developers.facebook.com/docs/graph-api/reference/canvas/', 'https://developers.fac

In [55]:
# Text of the first generated answer.
answers[0]['content']

'```bash\ncurl -L https://api.crowdtangle.com/post/123456789_123456789?token=TOKEN\n```'