<a href="https://colab.research.google.com/github/OlenaBugaiova/collecting-data-about-norwegian-agriculture/blob/main/NIBIO_Web_Scraping_of_Agriculture_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task

Scrape text data from NIBIO, a Norwegian website on agriculture.






In [None]:
# Base URL of the NIBIO website; subject URLs from the hierarchy are appended to it.
NIBIO_URL = 'https://www.nibio.no'

# Import Libraries

In [None]:
pip install googletrans==3.1.0a0 --root-user-action=ignore

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.7.1-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
from bs4 import BeautifulSoup
import requests
import json

from google.colab import files
from googletrans import Translator

# Home Page

The page text is in Norwegian, so we set the response encoding explicitly to make sure the Norwegian letters (æ, ø, å) are decoded correctly.

In [None]:
# Fetch the home page. Fail fast on a hung connection or an HTTP error status,
# because every later cell parses this response.
page = requests.get(NIBIO_URL, timeout=30)
page.raise_for_status()
# Use the encoding detected from the body so Norwegian characters decode correctly.
page.encoding = page.apparent_encoding
home_webpage = BeautifulSoup(page.text, 'html')

In [None]:
# Show the parsed home page markup for inspection.
home_webpage

<!DOCTYPE html>
<html class="no-js" lang="no">
<head><!-- Google Tag Manager -->
<!-- <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push(

{'gtm.start': new Date().getTime(),event:'gtm.js'}
);var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-M2BQN2S');</script> -->
<!-- End Google Tag Manager -->
<!-- Matomo -->
<!-- <script>
  var _paq = window._paq = window._paq || [];
  /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
  _paq.push(['trackPageView']);
  _paq.push(['enableLinkTracking']);
  (function() {
    var u="https://nibio.matomo.cloud/";
    _paq.push(['setTrackerUrl', u+'matomo.php']);
    _paq.push(['setSiteId', '1']);
    _paq.push(['enableHeartBeatTimer']);
    var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
    g.asyn

# Topics Hierarchy

From the home webpage, we can download the main topics, their descriptions and the URLs of the narrower, more detailed webpages. This text data has a hierarchical structure and is stored in a script, in the variable called


```
var subjectHierarchy
```

In [None]:
# Name of the script variable searched for in the home page's <script> tags.
SCRIPT_VARIABLE_NAME = 'subjectHierarchy'

We find all the scripts containing the required variable

In [None]:
# Keep every <script> tag whose markup mentions the hierarchy variable.
scripts = home_webpage.find_all('script')
subject_hierarchy_scripts = list(
    filter(lambda tag: SCRIPT_VARIABLE_NAME in str(tag), scripts)
)
# Display how many matching scripts were found.
len(subject_hierarchy_scripts)

There is only one script on the webpage that contains the variable *'subjectHierarchy'*, so we can retrieve it

In [None]:
# Take the first matching script tag as a string, then display it.
subject_hierarchy_script = str(subject_hierarchy_scripts[0])
subject_hierarchy_script

Let's extract the content of the subjectHierarchy variable

In [None]:
# Cut the value assigned to the hierarchy variable out of the script tag.
# str.lstrip/str.rstrip remove any characters from a *set*, not a prefix or a
# suffix, so they only work while the value happens to start and end with
# characters that are not in those sets. Slice between explicit markers instead.
_declaration_at = subject_hierarchy_script.index(SCRIPT_VARIABLE_NAME)
_value_start = subject_hierarchy_script.index('=', _declaration_at) + 1
_value_end = subject_hierarchy_script.rindex('</script>')

subjects_hierarchy_content = subject_hierarchy_script[_value_start:_value_end].strip()
# Drop the statement terminator that follows the value, if present.
subjects_hierarchy_content = subjects_hierarchy_content.rstrip(';').rstrip()

In [None]:
# Display the extracted text that is parsed as JSON next.
subjects_hierarchy_content

Convert text into json format

In [None]:
# Parse the extracted text into Python objects.
subjects_json = json.loads(subjects_hierarchy_content)

In [None]:
# Pretty-print the parsed hierarchy; ensure_ascii=False keeps non-ASCII characters unescaped.
print(json.dumps(subjects_json, ensure_ascii = False, indent = 4))

# Topics Description

From the subject JSON, we can retrieve the title and description of each high-level topic.

In [None]:
# Map each top-level subject title to its description.
subjects_description = {
    entry['title']: entry['description']
    for entry in map(dict, subjects_json)
}

In [None]:
# Display the title -> description mapping.
subjects_description

# Methods for Extracting Hierarchical Data

We can retrieve webpages text by following the links from the subject json file. We can use recursion to cover all links hierarchically

In [None]:
def extract_subject_info(
    content, info_type, extraction_method, children_extraction_method
    ):

    """ Builds the info record of one subject, including its children

    Parameters
    ----------
    content:
          Subject data, handed unchanged to both extraction methods
    info_type: str
          Key under which the extracted information is stored
    extraction_method: function
          Called as extraction_method(content);
          must return a (title, info) pair
    children_extraction_method: function
          Called as children_extraction_method(content, info_type,
          extraction_method); returns the children records

    Output
    ----------
    subjects_info: dict
          Holds 'title', the info_type key and, only when the children
          result is truthy, 'children'
    """

    # information of the subject itself
    title, info = extraction_method(content)
    subjects_info = {'title': title, info_type: info}

    # information of the sub subjects
    nested_info = children_extraction_method(
        content, info_type, extraction_method
        )
    if nested_info:
        subjects_info['children'] = nested_info

    return subjects_info

In [None]:
def extract_children_info(
    content, info_type, extraction_method
    ):

    """ Collects the info records of every child of the given content

    Each child is passed to extract_subject_info with this function as the
    children extraction method, so the whole subtree is visited recursively
    (Depth First Search approach)

    Parameters
    ----------
    content:
          Subject data; converted with dict() and read through its
          'children' key
    info_type: str
          Type of information to extract
    extraction_method: function
          Method used to extract information of each child

    Output
    ----------
    children_info: list
          One record per child; empty when 'children' is missing, None
          or empty
    """

    children = dict(content).get('children')

    # nothing to descend into
    if children is None or len(children) == 0:
        return []

    return [
        extract_subject_info(
            child, info_type, extraction_method, extract_children_info
            )
        for child in children
    ]

# URLs

For each subject, we extract title and url. If a subject has multiple subtopics, we extract their titles and urls recursively

In [None]:
def extract_subjects_url_info(content):

    """ Returns the (title, url) pair of a subject entry

    The content is converted with dict(), so a mapping or an iterable of
    key/value pairs is accepted. A missing 'title' or 'url' raises KeyError
    """

    fields = dict(content)

    return fields['title'], fields['url']

In [None]:
# Build the title/url tree for every top-level subject.
subjects_data = []

for subject in subjects_json:
    subjects_data.append(
        extract_subject_info(
            subject, 'url', extract_subjects_url_info, extract_children_info
            )
    )

In [None]:
# Display the extracted title/url tree.
subjects_data

Extracting webpages takes time and we don't want anything to break during that process. So we perform two steps:
1. Extracting webpages
2. Extracting text from the webpages

# Webpages

In [None]:
def extract_subjects_webpage(content):

    """ Downloads and parses the webpage of one subject

    Parameters
    ----------
    content:
          Subject entry providing 'title' and 'url'

    Output
    ----------
    title:
          Title taken from the subject entry
    soup: BeautifulSoup
          Parsed page found at NIBIO_URL joined with the subject url
    """

    title, url = extract_subjects_url_info(content)
    # join with exactly one slash, whichever side already carries one
    subject_url = NIBIO_URL.rstrip('/') + '/' + url.lstrip('/')

    # a timeout keeps one unresponsive page from stalling the whole crawl
    page = requests.get(subject_url, timeout=30)
    # use the encoding detected from the body so Norwegian characters decode correctly
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, 'html')

    return title, soup

For each topic, extract topic webpage and webpages of subtopics recursively

In [None]:
# Download the page of every subject and its subtopics. Results are appended
# one by one, so whatever was fetched so far stays available if a request fails.
subjects_webpage_data = []

for subject in subjects_data:
    subjects_webpage_data.append(
        extract_subject_info(
            subject, 'webpage', extract_subjects_webpage, extract_children_info
            )
    )

# Text

To retrieve text, we need to parse the webpages downloaded from the links

In [None]:
def extract_subject_info(
    content, info_type, extraction_method, children_extraction_method
    ):

    """ Builds the text record of one subject, including its children

    NOTE: this definition replaces the extract_subject_info defined earlier
    in the notebook. extract_children_info looks the name up when it runs,
    so from this cell on it uses this version.

    Parameters
    ----------
    content:
          Subject data; must be usable by extraction_method and by
          extract_detailed_subjects_text
    info_type: str
          Key under which the extracted text is stored
    extraction_method: function
          Called as extraction_method(content);
          must return a (title, info) pair
    children_extraction_method: function
          Called as children_extraction_method(content, info_type,
          extraction_method); returns the children records

    Output
    ----------
    subjects_info: dict
          'title' and the info_type key are set only when info is truthy;
          detailed text, when present, is appended to info after '\\n '.
          'children' is set only when the children result is truthy
    """

    # extract subject information
    title, info = extraction_method(content)
    detailed_info = extract_detailed_subjects_text(content)

    subjects_info = {}
    if info:
        combined_info = info + '\n ' + detailed_info if detailed_info else info
        subjects_info['title'] = title
        subjects_info[info_type] = combined_info

    # extract sub subjects information
    nested_info = children_extraction_method(
        content, info_type, extraction_method
        )
    if nested_info:
        subjects_info['children'] = nested_info

    return subjects_info

On each of the children webpages we have data of the structure shown in an example:
```
<section class="intro">
            <div class="container">
                <div class="row heading">
                    <div class="locale-switch-only text-right hidden-sm hidden-md hidden-lg">
                        
                    </div>
                    <div class="col-sm-5">
                        <h1>Bruksområder for frø- og skogplanter</h1>
                    </div>
                    <div class="col-sm-7">
                        <p>For å få en klimatilpasset skog er det viktig å bruke riktig plantemateriale til lokaliteten. I dag brukes det stort sett foredla materialer til foryngelse av granskogen. Det er norsk foredlet granfrø tilgjengelig for alle områder på Østlandet, Vestlandet og deler av Trøndelag</p>
                    </div>
                    <div class="col-sm-12 col-md-7 col-md-offset-5">
                        
                        <a class="btn btn-default jump-down" href="#section-summary">
                            <span>Les mer</span>
                        </a>
                    </div>
                </div>
            </div>
        </section>
```
From this section we can retrieve the title and the description


In [None]:
def extract_subjects_text(content):

    """ Reads the title and description of a subject from its intro section

    Parameters
    ----------
    content:
          Subject entry whose 'webpage' value is the parsed page

    Output
    ----------
    title: str
          Stripped text of the first h1 in the intro section; falls back to
          the entry's own 'title' (or '') when the section or h1 is missing
    description: str
          Stripped text of the first p in the intro section, or '' when the
          section or p is missing
    """

    content_dict = dict(content)
    fallback_title = content_dict.get('title', '')

    webpage = content_dict['webpage']
    intro_section = webpage.find(attrs = {'class' : 'intro'})
    if intro_section is None:
        # page without an intro section: nothing to describe
        return fallback_title, ''

    heading = intro_section.find('h1')
    paragraph = intro_section.find('p')

    # getText() already returns the text without the tags. Calling
    # lstrip('<h1>') / rstrip('</h1>') on it would strip the characters
    # '<', 'h', '1', '/', '>' from real titles (e.g. 'hvete 1' -> 'vete ')
    title = heading.getText().strip() if heading is not None else fallback_title
    description = paragraph.getText().strip() if paragraph is not None else ''

    return title, description

Each webpage contains text in the following sections:

```
<section data-portal-component-type="text">
<h3>Nasjonale anbefalinger&nbsp;</h3>

<p>Det er Skogfrøverket som har ansvaret for å utarbeide de nasjonale anbefalingene for bruk av skoglig formeringsmaterialer. Skogfrøverket baserer sine anbefalinger på en rekke avkomforsøk, proveniensforsøk, vitenskapelige publikasjoner og gjeldende forskrifter og regler. De nasjonale anbefalinger skal sikre en best mulig klimatilpasset skog, genetisk- gevinst og variasjon.&nbsp;</p>

<p>Skogfrøverket lanserte januar 2022 «Proveniensvelgeren» et nytt kartbasert verktøy for å finne de beste alternativene for foryngelse av skogen. Lenken til «proveniensvelgeren» finner du under "Lenker" lenger på siden.</p>

<h3>Forskrift og OECD regelverk</h3>

<p>Forskrift om skogfrø og skogplanter er hjemlet i skogbruksloven. Dagens forskrift er fra 1996 og er under revidering. Forskriften skal sikre at det brukes frø og planter av god kvalitet ved foryngelse av skog og at hensynet til det genetiske mangfoldet i skogen ivaretas. Forskriften gjelder for alle treslag som anvendes til skogbruksformål og omfatter produksjon, omsetning og bruk av skoglig formeringsmateriale.</p>

<p>Alt formeringsmateriale som omsettes i Norge må være godkjent og sertifisert. Skoglig formeringsmateriale deles inn i og sertifiseres under fire forskjellige kategorier som følger OECD regelverket:</p>

<ul>
	<li>Lokalitetsbestemt (source identified)</li>
	<li>Utvalgt (selected)</li>
	<li>Kvalifisert (qualified)</li>
	<li>Testet (tested).</li>
</ul>

<p>Alle krav knyttet til sertifiseringen er beskrevet i «OECD forest seed and plant scheme, rules and regulations. Dette er et internasjonalt sertifiseringssystem for handel med skoglig formeringsmateriale. Sertifiseringssystemet skal sikre at frø og planter i handelen har blitt samlet inn og produsert på en slik måte at identiteten til materialene er sikret. Et OECD sertifikat inneholder informasjon om den genetiske kvaliteten til materialene.</p>

<p>Regelverket ble etablert i 1967, men revisjoner oppgjennom årene. Norge har vært medlem siden tidlig på 70-tallet.</p>

</section>
```

A header may be followed by paragraphs and ordered/unordered lists. We can parse them



In [None]:
def extract_detailed_subjects_text(content):

    """ Collects the header/text blocks of the text sections of a webpage

    For every h3 inside an element with data-portal-component-type="text",
    the text of the following sibling paragraphs and lists is gathered until
    the next h3 or any other non-text element. List items are joined with
    ', '. Headers with no text are skipped

    Parameters
    ----------
    content:
          Subject entry whose 'webpage' value is the parsed page

    Output
    ----------
    subsubjects_info: str
          Header titles and their texts joined with '\\n ';
          '' when nothing was found
    """

    text_tags = ('p', 'ul', 'ol', 'li')

    content_dict = dict(content)
    webpage = content_dict['webpage']

    subsubjects_info = []

    content_sections = webpage.find_all(attrs = {
        'data-portal-component-type' : 'text'
        })
    for content_section in content_sections:
        for header in content_section.find_all(['h3']):

            title = header.get_text()

            paragraphs = []
            text_started = False

            for elem in header.find_next_siblings():

                # the next header starts a new block; without this check a
                # header that has no paragraph of its own would pick up the
                # text that belongs to the following header
                if elem.name == 'h3':
                    break

                if text_started and elem.name not in text_tags:
                    break

                # text of a block begins at its first paragraph
                if not text_started and elem.name == 'p':
                    text_started = True

                if text_started:
                    if elem.name in ('ul', 'ol'):
                        new_text = ', '.join(
                            li.get_text() for li in elem.find_all('li')
                            )
                    else:
                        new_text = elem.get_text()

                    paragraphs.append(new_text)

            text = '\n '.join(paragraphs)
            if text:
                subsubjects_info.append(title)
                subsubjects_info.append(text)

    return '\n '.join(subsubjects_info)

We will use
- topics description extracted previously
- information obtained from parsing webpages of subtopics recursively

In [None]:
# Assemble the text tree: each top-level subject takes its description from
# the hierarchy, and its subtopics take theirs from the downloaded webpages.
subjects_text_data = []

for subject in subjects_webpage_data:

    title = subject['title']

    subject_text = {
        'title': title,
        'text': subjects_description[title],
    }

    subsubjects_info = extract_children_info(
        subject, 'text', extract_subjects_text
        )

    if subsubjects_info:
        subject_text['children'] = subsubjects_info

    subjects_text_data.append(subject_text)

In [None]:
# Alias of the assembled text tree; both names refer to the same list object.
agriculture_text_from_nibio = subjects_text_data

# Output

In [None]:
# Pretty-print the collected text; ensure_ascii=False keeps non-ASCII characters unescaped.
print(json.dumps(agriculture_text_from_nibio, ensure_ascii = False, indent = 4))

In [None]:
output_text_data = json.dumps(agriculture_text_from_nibio, ensure_ascii = False, indent = 4)

# ensure_ascii=False leaves Norwegian characters unescaped, so the file must be
# written with an explicit encoding rather than the platform default.
with open('nibio_text_data.json', 'w', encoding = 'utf-8') as f:
  f.write(output_text_data)

files.download('nibio_text_data.json')

# Validation of Text Titles





In [None]:
# Titles gathered by retrieve_text_data, in visiting order
all_titles = []

def retrieve_text_data(topics):
    """ Appends the title of every topic and subtopic to all_titles

    Topics are visited depth first: a topic's title is recorded before the
    titles of its children. A topic without a 'title' key contributes None
    """
    for topic in topics:
        all_titles.append(topic.get('title'))

        subtopics = topic.get('children')
        # descend only when there are children
        if subtopics:
            retrieve_text_data(subtopics)

In [None]:
# Collect all titles into all_titles. The function appends to that list, so
# running this cell again without resetting all_titles adds every title twice.
retrieve_text_data(agriculture_text_from_nibio)

In [None]:
# Display the number of collected titles.
len(all_titles)

In [None]:
# Print every collected title, separated by a newline and a space.
print(*all_titles, sep = '\n ')

### Duplicates

In [None]:
# Every occurrence of a title after its first one counts as a duplicate.
seen = set()
duplicates_titles = []
for candidate_title in all_titles:
    if candidate_title in seen:
        duplicates_titles.append(candidate_title)
    else:
        seen.add(candidate_title)

In [None]:
# Print a heading, then each repeated title on its own line.
print('Duplicates in titles:\n')
print(*duplicates_titles, sep = '\n')