# About

The first step of the pipeline. 

Fetch the sources and texts of Baltasar Gracián.

In [70]:
import pandas as pd
import requests
import re
import polars as pl

from bs4 import BeautifulSoup

In [71]:
class Config:
    """URL templates for the two scraped editions of the book.

    Each template contains a literal ``{COUNT}`` placeholder which the scraper
    functions substitute via ``str.replace`` to build one URL per page.

    NOTE: ``WORLDY`` in the attribute names is a misspelling of ``WORLDLY``;
    the names are kept as they are because the scrapers reference them.
    """
    # German edition: the German scraper fills {COUNT} with a zero-padded, three-digit page number.
    WORLDY_WISDOM_BASE_URL_GER = 'https://www.projekt-gutenberg.org/gracian/orakel/chap{COUNT}.html'
    # English edition: the English scraper fills {COUNT} with a single digit that follows the fixed "aww1" prefix.
    WORLDY_WISDOM_BASE_URL_ENG = 'https://sacred-texts.com/eso/aww/aww1{COUNT}.htm'
    

# The Art of Worldly Wisdom

As a first source of Baltasar Gracián's wisdom, we choose *The Art of Worldly Wisdom* - in both English and German, taken from two different sources.

## German

In [72]:
def scrape_german_art_of_wisdom(timeout=30):
    """Scrape the German edition of the book, page by page.

    Pages 2 through 11 are fetched one after another. On each page, every
    tag that follows the ``div.anzeige-chap`` marker is visited in document
    order: an ``h4`` opens a new item, an ``h3`` supplies its heading and a
    ``p`` supplies its text.

    Pages that do not answer with HTTP 200, or that lack the marker div, are
    reported via ``print`` and skipped. Exceptions raised by ``requests.get``
    (including timeouts) are not caught here.

    Args:
        timeout: Seconds to wait for each HTTP request. Defaults to 30.

    Returns:
        A list of dicts, one per item, with the keys ``lang`` (always
        ``'ger'``), ``src`` (URL of the page), ``count`` (running number
        starting at 1), ``content`` and, if the page provided one, ``header``.
    """
    items = []
    counter = 1
    for i in range(2, 12): # Book actually starts at chapter 2
        cur_url = Config.WORLDY_WISDOM_BASE_URL_GER.replace('{COUNT}', f'{i:03}')
        print(f'Scraping current url {cur_url} for chapter {i}')

        # Without a timeout a stalled server would block the notebook indefinitely.
        response = requests.get(cur_url, timeout=timeout)

        if response.status_code != 200:
            print(f'Problem getting a request - got status_code: {response.status_code}. Maybe too many requests or page doesnt exist?')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # The page does not really nest its p and div tags - we are interested
        # in the content that follows one specific div.
        start_div = soup.find('div', class_='anzeige-chap')
        if not start_div:
            print('Couldnt find the start_div for this url - continuing...')
            continue

        # None until the first h4 of the page has been seen.
        cur_item = None
        next_tag = start_div.find_next_sibling()
        while next_tag:
            if next_tag.name == 'h4':
                # An h4 tag indicates a new wisdom, which gets its own object.
                cur_item = {'lang': 'ger', 'src': cur_url, 'count': counter}
                counter += 1
            elif cur_item is not None and next_tag.name == 'h3': # Heading
                cur_item['header'] = next_tag.get_text()
            elif cur_item is not None and next_tag.name == 'p': # Actual content
                text = next_tag.get_text()
                if 'content' in cur_item:
                    # A further paragraph of the same wisdom: extend the text
                    # instead of appending the same dict a second time.
                    cur_item['content'] += ' ' + text
                else:
                    cur_item['content'] = text
                    items.append(cur_item)
            next_tag = next_tag.find_next_sibling()

    return items

In [73]:
# Run the German scraper and print how many items it returned.
ger_items = scrape_german_art_of_wisdom()
print(len(ger_items))

Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap002.html for chapter 2
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap003.html for chapter 3
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap004.html for chapter 4
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap005.html for chapter 5
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap006.html for chapter 6
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap007.html for chapter 7
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap008.html for chapter 8
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap009.html for chapter 9
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap010.html for chapter 10
Scraping current url https://www.projekt-gutenberg.org/gracian/orakel/chap011.html for chapter 11
300


In [74]:
# Preview only the first few records; printing the whole list floods the cell output.
print(ger_items[:3])

[{'lang': 'ger', 'src': 'https://www.projekt-gutenberg.org/gracian/orakel/chap002.html', 'count': 1, 'header': 'Alles hat heut zu Tage seinen Gipfel erreicht,', 'content': 'aber die Kunst sich geltend zu machen, den höchsten. Mehr gehört jetzt zu Einem Weisen, als in alten Zeiten zu sieben: und mehr ist erfordert, um in diesen Zeiten mit einem einzigen Menschen fertig zu werden, als in vorigen mit einem ganzen Volke.'}, {'lang': 'ger', 'src': 'https://www.projekt-gutenberg.org/gracian/orakel/chap002.html', 'count': 2, 'header': 'Herz und Kopf:', 'content': 'die beiden Pole der Sonne unserer Fähigkeiten: eines ohne das andere, halbes Glück. Verstand reicht nicht hin; Gemüth ist erfordert. Ein Unglück der Thoren ist Verfehlung des Berufs im Stande, Amt, Lande, Umgang.'}, {'lang': 'ger', 'src': 'https://www.projekt-gutenberg.org/gracian/orakel/chap002.html', 'count': 3, 'header': 'Ueber sein Vorhaben in Ungewißheit lassen.', 'content': 'Die Verwunderung über das Neue ist schon eine Wertsc

## English

In [75]:
def scrape_english_art_of_wisdom(timeout=30):
    """Scrape the English edition of the book, page by page.

    Six pages are fetched (``{COUNT}`` values 0 through 5). On each page,
    every tag that follows the first ``<p align="CENTER">`` is visited in
    document order. Tags that contain an ``img`` or an ``a`` are ignored. A
    tag with ``align="center"`` starts a new item and supplies its heading;
    the text of every other tag is appended to the content of the item that
    is currently open.

    Pages that do not answer with HTTP 200, or that lack the start marker, are
    reported via ``print`` and skipped. Exceptions raised by ``requests.get``
    (including timeouts) are not caught here.

    Args:
        timeout: Seconds to wait for each HTTP request. Defaults to 30.

    Returns:
        A list of dicts, one per item, with the keys ``lang`` (always
        ``'eng'``), ``src`` (URL of the page), ``count`` (running number
        starting at 1), ``header`` and ``content``.
    """
    items = []
    counter = 1

    def _store(item):
        # Finalise an item and add it to the result list. Items that were never
        # opened by a heading have no 'content' key and are dropped; this also
        # keeps a page without any heading from raising a KeyError.
        if 'content' in item:
            # Strip surrounding whitespace, then drop anything in the text that
            # still looks like a markup tag.
            item['content'] = re.sub('<[^<]+?>', '', item['content'].strip())
            items.append(item)

    for i in range(0, 6):
        cur_url = Config.WORLDY_WISDOM_BASE_URL_ENG.replace('{COUNT}', str(i))
        print(f'Scraping current url {cur_url} for chapter {i}')

        # Without a timeout a stalled server would block the notebook indefinitely.
        response = requests.get(cur_url, timeout=timeout)

        if response.status_code != 200:
            print(f'Problem getting a request - got status_code: {response.status_code}. Maybe too many requests or page doesnt exist?')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # The page is a flat sequence of tags - we are interested in the
        # content that follows the first p tag with align="CENTER".
        start_div = soup.find('p', align='CENTER')
        if not start_div:
            print('Couldnt find the start_div for this url - continuing...')
            continue

        cur_item = {}
        next_tag = start_div.find_next_sibling()
        while next_tag:
            # Skip tags that contain images or links.
            if next_tag.find('img') is None and next_tag.find('a') is None:
                text = next_tag.get_text()
                # A tag with align="center" indicates the start of a new wisdom.
                if next_tag.has_attr('align') and next_tag['align'] == 'center':
                    _store(cur_item)
                    cur_item = {
                        'lang': 'eng',
                        'src': cur_url,
                        'count': counter,
                        'header': text,
                        'content': '',
                    }
                    counter += 1
                elif 'content' in cur_item:
                    cur_item['content'] += ' ' + text
            next_tag = next_tag.find_next_sibling()

        # The last wisdom of the page is not followed by another heading.
        _store(cur_item)

    return items

In [76]:
# Run the English scraper and print how many items it returned.
eng_items = scrape_english_art_of_wisdom()
print(len(eng_items))

Scraping current url https://sacred-texts.com/eso/aww/aww10.htm for chapter 0
Scraping current url https://sacred-texts.com/eso/aww/aww11.htm for chapter 1
Scraping current url https://sacred-texts.com/eso/aww/aww12.htm for chapter 2
Scraping current url https://sacred-texts.com/eso/aww/aww13.htm for chapter 3
Scraping current url https://sacred-texts.com/eso/aww/aww14.htm for chapter 4
Scraping current url https://sacred-texts.com/eso/aww/aww15.htm for chapter 5
300


In [77]:
# Preview only the first few records; printing the whole list floods the cell output.
print(eng_items[:3])



## Store 

We're going to store all scraped items in a single DataFrame... that should be fine.

In [79]:
# Merge the records of both editions into one list, German records first.
all_items = [*ger_items, *eng_items]

# Build a single polars DataFrame from the records and write it out as JSON.
df = pl.DataFrame(all_items)
df.write_json('the_art_of_worldly_wisdom.json')
print('Data stored to disc.')

Data stored to disc.
