In [655]:
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
from datetime import datetime

import os

import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import re
import requests

InteractiveShell.ast_node_interactivity = "all"
matplotlib.rcParams['svg.fonttype'] = 'none'


from bs4 import BeautifulSoup, NavigableString, Tag

In [315]:
def parse_kantonsblatt(year, lastIssue):
    result = {}
    parseYear = int(year)
    maxIssue = int(lastIssue)
    for i in range(1, maxIssue):
        issue = str(i).zfill(3)
        for lookup in range(0, 30):
            url = 'https://www.kantonsblatt.ch/archive/article/' + year + '/' + issue + '/' + year + issue + '01' + str(lookup).zfill(3) + '.html'
            r  = requests.get(url)
            data = r.text
            soup = BeautifulSoup(data, 'html.parser')
            if soup.find(content="Grundbuch"):
                result[i] = [soup]
                result[i].append(soup.find('font').next.next.text)
                result[i].append(issue)
    df = pd.DataFrame.from_dict(result, orient='index')
    df = df.reset_index()
    df = df.drop('index', 1)
    df.columns = ['html', 'datum', 'issue']
    return df

In [328]:
grundbuch_raw = parse_kantonsblatt('2017', '10')

In [329]:
grundbuch_raw

Unnamed: 0,html,datum,issue
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 T...",28.1.2017,8
1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 T...",7.1.2017,2
2,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 T...",14.1.2017,4
3,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 T...",21.1.2017,6


In [650]:
def parse_grundbuch(dataframe):
    result = {}
    keys = ('location', 'datum', 'issue', 'grundbuch_data')
    #entries = dict.fromkeys(keys)
    for index, row in dataframe.iterrows():
        myps = row['html'].findAll('p')
        for i, p in enumerate(myps):
            if (p.text.strip() == 'Basel'):
                selector = i + 1
                basel = myps[selector].findAll("span", { "class" : "gr8" })
                entries['grundbuch_data'] = extract_entries(basel)
                entries['location'] = 'Basel'
                
        entries['datum'] = row['datum']
        entries['issue'] = row['issue']
        print(entries['issue'])
        
        result[index] = entries
        
    return result

In [651]:
dict = parse_grundbuch(grundbuch_raw)

008
002
004
006


In [652]:
dict

{0: {'datum': '21.1.2017',
  'grundbuch_data': {1: [<span class="b">Rheingasse 11, </span>,
    'S 8 8/100 an P 108, 130 m2, Wohnhaus, Flügelgebäude. Eigentum\r\n  bisher: Niklaus Krattiger, in Basel. Eigentum nun: Tino Krattiger, in Basel. \n'],
   3: [<span class="b">Im Wasenboden, Neudorfstrasse, </span>,
    'S 1 708 m2 von P 1619, zu P 2588. Eigentum bisher: Wohlfahrtstiftung\r\n  der Euphalt AG, in Basel. Eigentum nun: Kanton Basel-Stadt, in Basel. \n'],
   5: [<span class="b">Hagenaustrasse, </span>,
    'S 1 221 m\r\n  2 von P 674, zu P 2588. Eigentum bisher: Einwohnergemeinde der Stadt Basel, in Basel.\r\n  Eigentum nun: Kanton Basel-Stadt, in Basel. \n'],
   7: [<span class="b">Hagenaustrasse, </span>,
    'S 1 108 m\r\n  2 von Allmend, zu P 2588. Eigentum bisher: Einwohnergemeinde der Stadt Basel, in Basel.\r\n  Eigentum nun: Kanton Basel-Stadt, in Basel. \n'],
   9: [<span class="b">Im Wasenboden, </span>,
    "S 1 1'665 m\r\n  2 von Allmend, zu P 2588. Eigentum bisher: Ein

In [653]:
df = pd.DataFrame.from_dict(dict, orient='index')

In [654]:
df

Unnamed: 0,datum,grundbuch_data,issue,location
0,21.1.2017,"{1: [<span class=""b"">Rheingasse 11, </span>, '...",6,Basel
1,21.1.2017,"{1: [<span class=""b"">Rheingasse 11, </span>, '...",6,Basel
2,21.1.2017,"{1: [<span class=""b"">Rheingasse 11, </span>, '...",6,Basel
3,21.1.2017,"{1: [<span class=""b"">Rheingasse 11, </span>, '...",6,Basel


# Weitere Funktionen & Experimente

In [354]:
    myps = dataframe['html'].findAll('p')
    for i, p in enumerate(myps):
        if (p.text.strip() == 'Basel'):
            selector = i + 1
            basel = myps[selector].findAll("span", { "class" : "gr8" })
            result['basel'] = basel
        if (p.text.strip() == 'Bettingen'):
            selector = i + 1
            bettingen = myps[selector].findAll("span", { "class" : "gr8" })
            result['bettingen'] = bettingen
        if (p.text.strip() == 'Riehen'):
            selector = i + 1
            riehen = myps[selector].findAll("span", { "class" : "gr8" })
            result['riehen'] = riehen

AttributeError: 'Series' object has no attribute 'findAll'

In [393]:
def extract_entries(location):
    result = {}
    for i, name in enumerate(location):
        nextIndex = 0
        if (nextIndex < len(location)):
            nextIndex = i + 1
            if(name.select('span.b')):
                result[i] = name.select('span.b')
                result[i].append(location[nextIndex].text)

    return result

entries = extract_entries(riehen)   

In [264]:
entries

{2: [<span class="b">Baiergasse 49,</span>,
  ' P 666, 258 m2, Wohnhaus,\r\n  Garagegebäude. Eigentum bisher: Joseph Silvan Ziltener, in Bettingen. Eigentum nun: Marco Anton\r\n  Hamann und Elli Hamann, beide in Grellingen BL. \n'],
 4: [<span class="b">Buchholzweg, </span>,
  "P 782, 1'843 m\r\n  2. Eigentum bisher: Eveline Maja Müller, in Bettingen. Eigentum nun: Patrick Gerber\r\n  und Heidi Gerber, beide in Bettingen. \n"]}

In [430]:
def create_dataframe(entries):
    df = pd.DataFrame.from_dict(entries, orient='index')
    df = df.reset_index()
    df = df.drop('index', 1)
    df.columns = ['street', 'entry']
    df['street'] = df['street'].astype('str')
    df['street'] = df['street'].map(lambda x: x.lstrip('<span class="b">').rstrip(', </span>'))
    df['entry'] = df['entry'].str.split('.')
    for i in range(0, len(df['entry'])):
        try:
            df.loc[i, 'Sektionskennung'] = df['entry'][i][0]
        except:
            df.loc[i, 'Sektionskennung'] = ''     
        try:
            df.loc[i, 'Eigentum bisher'] = df['entry'][i][1]
        except:
            df.loc[i, 'Eigentum bisher'] = ''
        try:
            df.loc[i, 'Eigentum nun'] = df['entry'][i][2]
        except:
            df.loc[i, 'Eigentum nun'] = ''
            
    df = df.replace({'street': {r'\r\n': ''}}, regex=True)
    df = df.replace({'Eigentum bisher': {r'\r\n': ''}}, regex=True)
    df = df.replace({'Eigentum nun': {r'\r\n': ''}}, regex=True)
    df = df.replace({'Sektionskennung': {r'\r\n': ''}}, regex=True)
    df = df.replace({'Eigentum bisher': {r'Eigentum bisher: ': ''}}, regex=True)
    df = df.replace({'Eigentum nun': {r'Eigentum nun: ': ''}}, regex=True)
    df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()
    df['street'] = df['Eigentum bisher'].str.strip()
    df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()
    df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()
    

In [138]:
df

Unnamed: 0,0,1
2,"<span class=""b"">Baiergasse 49,</span>","P 666, 258 m2, Wohnhaus,\r\n Garagegebäude. ..."
4,"<span class=""b"">Buchholzweg, </span>","P 782, 1'843 m\r\n 2. Eigentum bisher: Evelin..."


In [136]:
df = df.reset_index()

In [137]:
df = df.drop('index', 1)

In [63]:
url = 'https://www.kantonsblatt.ch/archive/article/2017/043/201704301008.html'

url2 = 'https://www.kantonsblatt.ch/archive/article/2017/010/201701001008.html'

#test-url, kein Grundbuch
url3 = 'https://www.kantonsblatt.ch/archive/article/2017/010/201701001007.html'

r  = requests.get(url2)

data = r.text

soup = BeautifulSoup(data, 'html.parser')

print(soup)

mydivs = soup.findAll("span", { "class" : "gr8" })

title = soup.find("meta",  property="content")


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">

<html>
<head>
<title>Kantonsblatt Basel-Stadt 010 2017 - Grundbuch</title>
<link href="https://www.kantonsblatt.ch/_Resources/Static/Packages/Kantonsblatt.Online/Styles/kb.css" rel="stylesheet" type="text/css">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<meta content="Kantonsblatt Basel-Stadt" name="creator">
<meta content="NOINDEX, NOFOLLOW, NOARCHIVE" name="ROBOTS">
<meta content="4" name="tag">
<meta content="2" name="monat">
<meta content="2017" name="jahr">
<meta content="010" name="kbnr">
<meta content="Grundbuch" name="kbTitel">
<meta content="Departemente" name="hrast">
<meta content="Bau- und Verkehrsdepartement" name="urast">
<meta content="8" name="kbartnr">
<meta content="1" name="kbteil">
</meta></meta></meta></meta></meta></meta></meta></meta></meta></meta></meta></meta></link></head>
<body alink="#000099" bgcolor="#FFFFFF" link="#0033CC" vlink="#0033CC">
<table border="0" cellpaddin

In [138]:
df.columns = ['street', 'entry']

In [139]:
df['street'] = df['street'].astype('str') 

In [140]:
df['street'] = df['street'].map(lambda x: x.lstrip('<span class="b">').rstrip(', </span>'))

In [141]:
df['entry'] = df['entry'].str.split('.')

In [142]:
for i in range(0, len(df['entry'])):
    try:
        df.loc[i, 'Sektionskennung'] = df['entry'][i][0]
    except:
        df.loc[i, 'Sektionskennung'] = ''     
    try:
        df.loc[i, 'Eigentum bisher'] = df['entry'][i][1]
    except:
        df.loc[i, 'Eigentum bisher'] = ''
    try:
        df.loc[i, 'Eigentum nun'] = df['entry'][i][2]
    except:
        df.loc[i, 'Eigentum nun'] = ''

In [143]:
df = df.replace({'street': {r'\r\n': ''}}, regex=True)
df = df.replace({'Eigentum bisher': {r'\r\n': ''}}, regex=True)
df = df.replace({'Eigentum nun': {r'\r\n': ''}}, regex=True)
df = df.replace({'Sektionskennung': {r'\r\n': ''}}, regex=True)

In [149]:
df = df.replace({'Eigentum bisher': {r'Eigentum bisher: ': ''}}, regex=True)
df = df.replace({'Eigentum nun': {r'Eigentum nun: ': ''}}, regex=True)

In [None]:
df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()
df['street'] = df['Eigentum bisher'].str.strip()
df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()
df['Eigentum bisher'] = df['Eigentum bisher'].str.strip()

In [153]:
df.to_csv('grundbuch.csv')

In [None]:
>>> d = {}
>>> for i in range(0,5):
...      d.setdefault('result', [])
...      d['result'].append(i)
>>> d
{'result': [0, 1, 2, 3, 4]}

In [45]:
for i in range(1, 10):
    year = '2017'
    issue = str(i).zfill(3)
    for lookup in range(0, 30):
        url = 'https://www.kantonsblatt.ch/archive/article/' + year + '/' + issue + '/' + year + issue + '01' + str(lookup).zfill(3) + '.html'
        print(url)
        r  = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        if soup.find(content="Grundbuch"):
            print('im Kantonsblatt, Nr.: ' + issue + ' check')
            


https://www.kantonsblatt.ch/archive/article/2017/001/201700101000.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101001.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101002.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101003.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101004.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101005.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101006.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101007.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101008.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101009.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101010.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101011.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101012.html
https://www.kantonsblatt.ch/archive/article/2017/001/201700101013.html
https:

KeyboardInterrupt: 

In [285]:
for key in grundbuch_urls:
    myps = grundbuch_urls[key][0].findAll('p')
    print(myps)

[<p><font face="Verdana, Arial, Helvetica, sans-serif" size="3"><b>Departemente</b></font></p>, <p><font color="#999999" face="Verdana, Arial, Helvetica, sans-serif" size="3"><b>Bau- und Verkehrsdepartement</b></font></p>, <p><font face="Verdana, Arial, Helvetica, sans-serif" size="2"><b>Grundbuch</b></font></p>, <p><font face="Verdana, Arial, Helvetica, sans-serif" size="2"><p>
<br>
<span class="gr7">Das Grundbuch- und Vermessungsamt, Kundenzentrum, Münsterplatz 11, Postfach,
  4001 Basel, ist geöffnet Montag–Donnerstag 8–12 und 13.30–16.30 Uhr, Freitag 8–12 und 13.30–16
  Uhr. Auszüge aus dem Grundbuch können telefonisch (061 267 92 85), per Fax (061 267 92 91)
  oder per Internet (www.gva.bs.ch) bestellt werden. <br>
</br></span><span class="gr7">S Sektion <br>
</br></span><span class="gr7">P Parzelle <br>
</br></span><span class="gr7">BRP Baurechtsparzelle <br>
</br></span><span class="gr7">UBRP Unterbaurechtsparzelle <br>
</br></span><span class="gr7">StWEP StockwerkEigentumspa