# Data Ingestion Pipeline
The first step in building our data ingestion pipeline is to create a single source of truth for our training data. To do so, we will use the set of 120 question-answer pairs covering common asylum-related questions, produced by RSN and its partner Refugee Rights Turkey (RRT) for the [Informational Portal for Refugees in Turkey](https://refugeerights.info). This data set is available in a total of 10 languages: English, Turkish, Arabic, Farsi, Sorani (Kurdish), Somali, Russian, Urdu, Pashto and French.

To simplify the ingestion of this data, and to enable enrichment of these Q&A pairs with additional metadata, the text pairs will be saved in an Airtable base.

In [7]:
# Turkish-language Q&A pages addressed to Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
tr_syr = [
    'https://multecihaklari.info/tr/services/' + slug + '/'
    for slug in (
        'kayit-ve-status',
        'adli-yardim',
        'konut-ev-sahibi-ve-mulk',
        'evlenme-ve-bosanma',
        'isgucu-piyasasina-erisim',
        'saglik-hizmetleri',
        'egitim',
        'idari-gozetim-altindaki-kisiler-icin-haklar-ve-prosedurler',
        'refakatsiz-cocuklar-icin-haklar-ve-prosedurler',
        'iletisim',
    )
]

# Turkish-language Q&A pages addressed to non-Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
tr_nsyr = [
    'https://multecihaklari.info/tr/services/' + slug + '/'
    for slug in (
        'kayit-ve-status-2',
        'adli-yardim-2',
        'konut-ev-sahibi-ve-mulk-2',
        'evlenme-ve-bosanma-2',
        'isgucu-piyasasina-erisim-2',
        'saglik-hizmetleri-2',
        'education-2-2',
        'idari-gozetim-altindaki-kisiler-icin-haklar-ve-prosedurler-2',
        'refakatsiz-cocuklar-icin-haklar-ve-prosedurler-2',
        'iletisim-2',
    )
]

# Arabic-language Q&A pages addressed to Syrians.
# Slugs are kept percent-encoded exactly as they appear in the page addresses.
ar_syr = [
    'https://multecihaklari.info/ar/services/' + slug + '/'
    for slug in (
        '%d8%a7%d9%84%d8%aa%d8%b3%d8%ac%d9%8a%d9%84-%d9%88-%d8%a7%d9%84%d8%ad%d8%a7%d9%84%d8%a9-%d8%a7%d9%84%d9%82%d8%a7%d9%86%d9%88%d9%86%d9%8a%d8%a9',
        '%d8%ae%d8%af%d9%85%d8%a7%d8%aa-%d9%82%d8%a7%d9%86%d9%88%d9%86%d9%8a%d8%a9',
        '%d8%a7%d9%84%d8%b3%d9%83%d9%86%d8%8c-%d8%a7%d9%84%d9%85%d8%a7%d9%84%d9%83-%d9%88%d8%a7%d9%84%d8%b9%d9%82%d8%a7%d8%b1%d8%a7%d8%aa',
        '%d8%a7%d9%84%d8%b2%d9%88%d8%a7%d8%ac-%d9%88-%d8%a7%d9%84%d8%b7%d9%84%d8%a7%d9%82',
        '%d8%b3%d9%88%d9%82-%d8%a7%d9%84%d8%b9%d9%85%d9%84',
        '%d8%a7%d9%84%d8%b5%d8%ad%d8%a9',
        '%d8%a7%d9%84%d8%aa%d8%b9%d9%84%d9%8a%d9%85',
        '%d8%ad%d9%82%d9%88%d9%82-%d9%88%d8%a5%d8%ac%d8%b1%d8%a7%d8%a1%d8%a7%d8%aa-%d8%a7%d9%84%d8%a3%d8%b4%d8%ae%d8%a7%d8%b5-%d8%a7%d9%84%d9%85%d8%ad%d8%aa%d8%ac%d8%b2%d9%8a%d9%86-%d8%a5%d8%af%d8%a7%d8%b1',
        '%d9%85%d8%ad%d8%aa%d9%88%d9%89-%d8%a7%d9%84%d9%85%d8%b9%d9%84%d9%88%d9%85%d8%a7%d8%aa-%d9%84%d9%84%d8%a3%d8%b7%d9%81%d8%a7%d9%84-%d8%ba%d9%8a%d8%b1-%d8%a7%d9%84%d9%85%d8%b5%d8%ad%d9%88%d8%a8%d9%8a',
        'rrt-services-ar',
    )
]

# Arabic-language Q&A pages addressed to non-Syrians.
# Slugs are kept percent-encoded exactly as they appear in the page addresses.
ar_nsyr = [
    'https://multecihaklari.info/ar/services/' + slug + '/'
    for slug in (
        '%d8%a7%d9%84%d8%aa%d8%b3%d8%ac%d9%8a%d9%84-%d9%88-%d8%a7%d9%84%d8%ad%d8%a7%d9%84%d8%a9-%d8%a7%d9%84%d9%82%d8%a7%d9%86%d9%88%d9%86%d9%8a%d8%a9-2',
        '%d8%ae%d8%af%d9%85%d8%a7%d8%aa-%d9%82%d8%a7%d9%86%d9%88%d9%86%d9%8a%d8%a9-2',
        '%d8%a7%d9%84%d8%b3%d9%83%d9%86%d8%8c-%d8%a7%d9%84%d9%85%d8%a7%d9%84%d9%83-%d9%88%d8%a7%d9%84%d8%b9%d9%82%d8%a7%d8%b1%d8%a7%d8%aa-2',
        '%d8%a7%d9%84%d8%b2%d9%88%d8%a7%d8%ac-%d9%88-%d8%a7%d9%84%d8%b7%d9%84%d8%a7%d9%82-2',
        '%d8%b3%d9%88%d9%82-%d8%a7%d9%84%d8%b9%d9%85%d9%84-2',
        '%d8%a7%d9%84%d8%b5%d8%ad%d8%a9-%d9%84%d9%84%d8%a7%d8%ac%d8%a6%d9%8a%d9%86-2',
        '%d8%a7%d9%84%d8%aa%d8%b9%d9%84%d9%8a%d9%85-%d9%84%d9%84%d8%a7%d8%ac%d8%a6%d9%8a%d9%86-2',
        '%d8%ad%d9%82%d9%88%d9%82-%d9%88%d8%a5%d8%ac%d8%b1%d8%a7%d8%a1%d8%a7%d8%aa-%d8%a7%d9%84%d8%a3%d8%b4%d8%ae%d8%a7%d8%b5-%d8%a7%d9%84%d9%85%d8%ad%d8%aa%d8%ac%d8%b2%d9%8a%d9%86-%d8%a5%d8%af%d8%a7%d8%b1-2',
        '%d9%85%d8%ad%d8%aa%d9%88%d9%89-%d8%a7%d9%84%d9%85%d8%b9%d9%84%d9%88%d9%85%d8%a7%d8%aa-%d9%84%d9%84%d8%a3%d8%b7%d9%81%d8%a7%d9%84-%d8%ba%d9%8a%d8%b1-%d8%a7%d9%84%d9%85%d8%b5%d8%ad%d9%88%d8%a8%d9%8a-2',
        'rrt-services-ar-2-2',
    )
]

# Farsi-language Q&A pages addressed to non-Syrians.
# Slugs are kept percent-encoded exactly as they appear in the page addresses.
fa_nsyr = [
    'https://multecihaklari.info/fa/services/' + slug + '/'
    for slug in (
        '%d8%ab%d8%a8%d8%aa-%d9%86%d8%a7%d9%85-%d9%88%d8%b6%d8%b9%db%8c%d8%aa-2',
        '%da%a9%d9%85%da%a9-%d9%82%d8%a7%d9%86%d9%88%d9%86%db%8c-2',
        '%d9%85%d8%b3%da%a9%d9%86%d8%8c-%d8%b5%d8%a7%d8%ad%d8%a8%d8%ae%d8%a7%d9%86%d9%87-%d9%88-%d9%85%d9%84%da%a9-3',
        '%d8%a7%d8%b2%d8%af%d9%88%d8%a7%d8%ac-%d9%88-%d8%b7%d9%84%d8%a7%d9%82-2',
        '%da%a9%d8%a7%d8%b1-2',
        '%d8%ae%d8%af%d9%85%d8%a7%d8%aa-%d9%be%d8%b2%d8%b4%da%a9%db%8c-2',
        '%d8%aa%d8%ad%d8%b5%db%8c%d9%84%d8%a7%d8%aa-2',
        '%d8%ad%d9%82%d9%88%d9%82-%d9%88-%d8%b1%d9%88%d9%86%d8%af-%d8%a8%d8%b1%d8%a7%db%8c-%d8%a7%d9%81%d8%b1%d8%a7%d8%af-%d8%af%d8%b1-%d8%a8%d8%a7%d8%b2%d8%af%d8%a7%d8%b4%d8%aa-%d8%a7%d8%af%d8%a7%d8%b1%db%8c',
        '%d8%ac%d8%b2%d9%88%d9%87-%d8%a7%d8%b7%d9%84%d8%a7%d8%b9%d8%a7%d8%aa%db%8c-%d8%a8%d8%b1%d8%a7%db%8c-%d8%a7%d9%81%d8%b1%d8%a7%d8%af-%d8%ae%d8%b1%d8%af%d8%b3%d8%a7%d9%84-%d8%a8%d8%af%d9%88%d9%86-%d9%87',
        'access-rrt-services',
    )
]

# Urdu-language Q&A pages addressed to non-Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
ur_nsyr = [
    'https://multecihaklari.info/ur/services/' + slug + '/'
    for slug in (
        'registration-and-status-6',
        'legal-assistance-8',
        'rent-and-property-5',
        'marriage-and-divorce-6',
        'labor-market-8',
        'healthcare-8',
        'education-8',
        'detention',
        'unaccompanied-minors',
    )
]

# Pashto-language Q&A pages addressed to non-Syrians.
# NOTE(review): unlike the other language lists, no labor-market page is
# listed here -- confirm whether that omission is intentional.
ps_nsyr = [
    'https://multecihaklari.info/ps/services/' + slug + '/'
    for slug in (
        'registration-and-status-4',
        'legal-assistance-7',
        'rent-and-property-6',
        'marriage-and-divorce-5',
        'healthcare-7',
        'education-7',
        'detention-2',
        '%d8%af-%d8%a8%db%90-%d8%b3%d8%b1%d9%be%d8%b1%d8%b3%d8%aa%d9%87-%da%a9%d9%88%da%86%d9%86%db%8c%d8%a7%d9%86%d9%88-%d9%84%d9%be%d8%a7%d8%b1%d9%87-%d8%ad%d9%82%d9%88%d9%82-%d8%a7%d9%88-%d9%be%d8%b1%d9%88',
    )
]

# Russian-language Q&A pages addressed to non-Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
ru_nsyr = [
    'https://multecihaklari.info/ru/services/' + slug + '/'
    for slug in (
        'registration-and-status-5',
        'legal-assistance-6',
        'rent-and-property-4',
        'marriage-and-divorce-4',
        'labor-market-6',
        'healthcare-6',
        'education-6',
        'detention-3',
        'unaccompanied-minors-5',
    )
]

# Somali-language Q&A pages addressed to non-Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
som_nsyr = [
    'https://multecihaklari.info/so/services/' + slug + '/'
    for slug in (
        'registration-and-status-3',
        'legal-assistance-5',
        'rent-and-property-3',
        'marriage-and-divorce-3',
        'labor-market-5',
        'healthcare-5',
        'education-5',
        'detention-5',
        'unaccompanied-minors-3',
    )
]

# Sorani-language Q&A pages addressed to non-Syrians.
# Every page lives under the same services prefix, so only the slugs are listed.
ku_nsyr = [
    'https://multecihaklari.info/ku/services/' + slug + '/'
    for slug in (
        'registration-and-status-2',
        'legal-assistance-4',
        'rent-and-property-2',
        'marriage-and-divorce-2',
        'labor-market-4',
        'healthcare-4',
        'education-4',
        'detention-4',
        'unaccompanied-minors-4',
    )
]

# Define a list of URLs containing Q&A Data for Non-Syrians in French.
# Each URL appears exactly once so that no page is scraped (and no Q&A pair
# stored) twice.
# TODO(review): the original list repeated the education page in the position
# where the other language lists have a detention page; the French detention
# URL is presumably missing -- confirm the correct address and add it.
fr_nsyr = ['https://multecihaklari.info/fr/services/registration-and-status/',
           'https://multecihaklari.info/fr/services/legal-assistance-3/',
           'https://multecihaklari.info/fr/services/rent-and-property/',
           'https://multecihaklari.info/fr/services/marriage-and-divorce/',
           'https://multecihaklari.info/fr/services/labor-market-3/',
           'https://multecihaklari.info/fr/services/healthcare-3/',
           'https://multecihaklari.info/fr/services/education-3/',
           'https://multecihaklari.info/fr/services/unaccompanied-minors-2/',
           'https://multecihaklari.info/fr/services/les-services-proposes-par-rrt/'
           ]


## Define a Function to Scrape Q&A Data from RefugeeRights.info
To upload our Q&A data to Airtable, we'll first need to gather the text from the website. The following Python function `save_faqs_to_file` accepts three arguments:
1. `filename`, a string which will be used to name the .json file written to disk by the function.
2. `urls`, a list of URL strings for the pages to be scraped.
3. `language`, which we will use in the final Airtable base to only retrieve texts in the required language.

In [8]:
import os
import json
import requests
from bs4 import BeautifulSoup


def stringify_children(node):
    """
    Convert the children of a node into a single string, maintaining order.

    Only the children are rendered: each child (text or nested tag) is
    converted with ``str`` and the pieces are concatenated in document order.
    The node's own enclosing tag is never part of the result, so a node
    without children yields an empty string.

    Args:
        node: An object exposing an iterable ``children`` attribute
            (here, a BeautifulSoup tag).

    Returns:
        The concatenated string form of all children.
    """
    # The node itself must not be prepended: its string form already contains
    # every child, so adding it would emit the wrapper tag plus the content
    # twice whenever the node has a single text child.
    return ''.join(str(child) for child in node.children)


def extract_faq_data(url, language, timeout=30):
    """
    Extract FAQ data from a given URL.

    The page is downloaded and parsed; the section name is taken from the
    page title (the part before the first " - "), and one entry is produced
    for every "card" found inside the element with id "accordion".

    Args:
        url: Address of the page to scrape.
        language: Label stored unchanged in every returned entry.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        A list of dicts with the keys "section", "question", "answer" (the
        inner HTML of the card body) and "language". Cards that lack either
        a question link or a body are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no accordion element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page as FAQ content
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the page title up to the specified delimiter; a missing or
    # non-textual <title> yields an empty section name rather than a crash
    title_text = soup.title.string if soup.title is not None else None
    page_title = (title_text or "").split(" - ")[0]

    # Find the accordion element that contains the FAQ cards
    accordion = soup.find("div", id="accordion")
    if accordion is None:
        raise ValueError(f"No FAQ accordion found at {url}")

    faq_data = []

    for card in accordion.find_all("div", class_="card"):
        # The question is the text of the card's anchor element, the answer
        # lives in the card's body element
        question_element = card.find("a")
        answer_element = card.find("div", class_="card-body")

        # A card without both parts is not a usable Q&A pair
        if question_element is None or answer_element is None:
            continue

        # Append the FAQ entry with the specified language
        faq_data.append(
            {
                "section": page_title,
                "question": question_element.get_text(strip=True),
                "answer": stringify_children(answer_element),
                "language": language
            })

    return faq_data


def save_faqs_to_file(filename, urls, language):
    """
    Extract and save FAQ data from a list of URLs to a JSON file.

    Every URL is passed to ``extract_faq_data`` and the resulting entries are
    collected, in URL order, into one list that is written to
    ``faq_data/<filename>.json``.

    Args:
        filename: Name of the output file, without the ".json" extension.
        urls: Iterable of page addresses to scrape.
        language: Language label forwarded to ``extract_faq_data``.

    Returns:
        None. The result is the JSON file written to disk.
    """
    all_faq_data = []

    for url in urls:
        all_faq_data.extend(extract_faq_data(url, language))

    # Ensure the 'faq_data' subdirectory exists, or create it if not
    os.makedirs("faq_data", exist_ok=True)

    # ensure_ascii=False writes non-Latin text verbatim, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may be unable
    # to represent it.
    output_path = os.path.join("faq_data", f"{filename}.json")
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(all_faq_data, outfile, indent=4, ensure_ascii=False)


In [9]:
# Scrape every language/audience combination and write one JSON file per set.
# Each row is (output file name, list of page URLs, language label).
faq_jobs = [
    ('TR_SYR', tr_syr, 'Turkish'),
    ('TR_NSYR', tr_nsyr, 'Turkish'),
    ('AR_SYR', ar_syr, 'Arabic'),
    ('AR_NSYR', ar_nsyr, 'Arabic'),
    ('FA_NSYR', fa_nsyr, 'Farsi'),
    ('UR_NSYR', ur_nsyr, 'Urdu'),
    ('PS_NSYR', ps_nsyr, 'Pashto'),
    ('RU_NSYR', ru_nsyr, 'Russian'),
    ('SOM_NSYR', som_nsyr, 'Somali'),
    ('KU_NSYR', ku_nsyr, 'Sorani Kurdish'),
    ('FR_NSYR', fr_nsyr, 'French'),
]

for job_filename, job_urls, job_language in faq_jobs:
    save_faqs_to_file(filename=job_filename, urls=job_urls, language=job_language)