## Aufgabe: Crawler


Jetzt haben wir den ArticleFetcher schon so angepasst, dass er die Daten aus allen Seiten extrahiert. Aber eigentlich wäre es cool, wenn er die Daten direkt als .csv-Datei speichern würde ;)

Aufgabe:

- Schaue dir das Modul csv an, und zwar die writer-Funktion: https://docs.python.org/3/library/csv.html#csv.writer
- Passe dann den Ausgabe-Code so an, dass statt der Ausgabe hier im Notebook eine .csv-Datei gespeichert wird. Verwende als Spaltentrenner (Separator) ein Semikolon und als "quotechar" die doppelten Anführungszeichen ('"'); dann können wir die Datei später noch mit Excel öffnen. :-)

In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import csv

In [32]:
class CrawledArticle:
    """Plain record for the values scraped from a single article card."""

    def __init__(self, title, emoji, content, image):
        # Keep every value exactly as passed in, under the attribute of the same name.
        self.title, self.emoji = title, emoji
        self.content, self.image = content, image
        
class ArticleFetcher():
    """Crawl a paginated article listing and store the articles in a CSV file."""

    # Defaults reproduce the start page and output file that used to be hard-coded.
    DEFAULT_START_URL = "http://python.beispiel.programmierenlernen.io/index.php?page=6"
    DEFAULT_OUTPUT_PATH = 'crawler_output2.csv'
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def fetch(self, start_url=DEFAULT_START_URL, output_path=DEFAULT_OUTPUT_PATH):
        """Crawl all pages reachable from *start_url* and write them to *output_path*.

        Each page is requested after a one-second pause, every ``.card``
        element is turned into a ``CrawledArticle``, and the link found under
        ``.navigation .btn`` is followed until a page has no such link.
        The result is written as a semicolon-separated CSV file with the
        columns title, content, image URL.

        Args:
            start_url: URL of the first page to crawl.
            output_path: Path of the CSV file to (over)write.

        Returns:
            None. The articles are only written to the CSV file.

        Raises:
            requests.HTTPError: If a page answers with an HTTP error status.
            requests.RequestException: On connection problems or timeouts.
        """
        url = start_url
        articles = []

        while url:
            print(url)
            # Be polite to the server: pause before every request.
            time.sleep(1)
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # Fail loudly instead of parsing an error page as if it were content.
            r.raise_for_status()
            doc = BeautifulSoup(r.text, "html.parser")

            for card in doc.select(".card"):
                emoji = card.select_one(".emoji").text
                content = card.select_one(".card-text").text
                # The title is the second <span> inside the card title.
                title = card.select(".card-title span")[1].text
                # Image sources may be relative; resolve them against the page URL.
                image = urljoin(url, card.select_one("img").attrs["src"])

                articles.append(CrawledArticle(title, emoji, content, image))

            # An empty URL ends the loop once there is no "next page" button.
            next_button = doc.select_one(".navigation .btn")
            url = urljoin(url, next_button.attrs["href"]) if next_button else ""

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            print("Opening file ....")
            writer = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # NOTE(review): the emoji is scraped but not exported; the column
            # layout is left unchanged on purpose.
            for article in articles:
                writer.writerow([article.title, article.content, article.image])
            print("Closing ...")


In [33]:
# Create the crawler and run it: fetch() prints every visited URL and writes
# the collected articles to 'crawler_output2.csv'.
fetcher = ArticleFetcher()
fetcher.fetch()

http://python.beispiel.programmierenlernen.io/index.php?page=6
Opening file ....
Closing ...


## MY CRAWLER

In [49]:
class CrawledApprenticeship:
    """Plain record for one apprenticeship offer scraped from the listing."""

    def __init__(self, profession, company, url, start, district, qualification, cident):
        # Values are stored unchanged; the grouping below is purely visual.
        self.profession, self.company, self.url = profession, company, url
        self.start, self.district = start, district
        self.qualification, self.cident = qualification, cident
        
class ApprFetcher():
    """Scrape apprenticeship offers from the lehrstelle-handwerk.de listing page."""

    LIST_URL = "https://www.lehrstelle-handwerk.de/ausbildung/lehrstellenboerse-praktikumsboerse/lehrstelle-suchen/list"
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    @staticmethod
    def _company_id(url):
        """Return the part of *url* between ``firma/`` and the query string.

        Yields an empty string when the URL has no ``firma/`` segment. A URL
        without ``?`` keeps its full tail instead of losing the last character.
        """
        _, marker, tail = url.partition("firma/")
        if not marker:
            return ""
        return tail.partition("?")[0].strip()

    def fetch(self):
        """Download the listing page and return its offers.

        Every ``<tbody>`` is treated as one profession (named by its
        ``.lehrstellen`` element). Rows without a ``.flush`` cell or without
        a link are skipped.

        Returns:
            list: One ``CrawledApprenticeship`` per offer row.

        Raises:
            requests.HTTPError: If the page answers with an HTTP error status.
            requests.RequestException: On connection problems or timeouts.
        """
        time.sleep(1)
        rh = requests.get(self.LIST_URL, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as if it were content.
        rh.raise_for_status()
        site = BeautifulSoup(rh.text, "html.parser")

        apprenticeships = []
        for table in site.find_all("tbody"):
            profession = table.select_one(".lehrstellen").text.strip()

            for tr in table.find_all("tr"):
                company_cell = tr.select_one(".flush")
                # select_one() takes a CSS selector, so "has an href attribute"
                # is written as a[href] rather than a find()-style keyword.
                anchor = tr.select_one("a[href]")
                if company_cell is None or anchor is None:
                    continue

                company = company_cell.text.strip()
                # urljoin resolves site-relative hrefs and leaves absolute ones intact.
                url = urljoin(self.LIST_URL, anchor["href"].strip())
                start = anchor.text.strip()
                cident = self._company_id(url)
                district = tr.select_one("td[data-label='Stadtteil']").text.strip()
                qualification = tr.select_one("td[data-label='Abschluss']").text.strip()

                apprenticeships.append(
                    CrawledApprenticeship(profession, company, url, start, district, qualification, cident)
                )

        return apprenticeships
    
fetcher = ApprFetcher()

# Crawl first: opening the file in 'w' mode truncates it immediately, so a
# failing request must happen before the file is touched. Otherwise an
# existing export would be replaced by an empty or header-only file.
found_apprs = fetcher.fetch()

# Write the CSV file (semicolon-separated, double quotes as quote character).
with open('found_apprs.csv', 'w', newline='', encoding='utf-8') as csvfile:
    print("Opening file ....")
    writer = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # NOTE(review): "disctrict" is a misspelling of "district"; it is kept
    # unchanged because readers of this file may rely on the header name.
    writer.writerow(["profession","company_ident","company_name","url","start_date","disctrict","qualification"])
    for appr in found_apprs:
        writer.writerow([appr.profession, appr.cident, appr.company, appr.url, appr.start, appr.district, appr.qualification])
    print("Closing ...")


Opening file ....
Closing ...
