# We first import the modules we will need

In [22]:
import json

import pandas as pd

from bs4 import BeautifulSoup

import pyproj

from functools import partial

import shapely.ops as ops
from shapely.geometry.polygon import Polygon
from shapely.geometry.point import Point

# Reached only if every import above resolved without raising.
print("Import succesful!")

Import succesful!


# Calculating the area per postal code

Using the data from https://public.opendatasoft.com/explore/dataset/postleitzahlen-deutschland/table/?refine.note=Hamburg, which we saved as 'Postal_Codes_Coordinates_Hamburg.geojson', we can calculate the area (in km^2) that each postal code covers.

In [23]:
def calculate_area(coordinates):
    """
    Calculate the area of a GeoJSON polygon on the Earth's surface.

    The polygon is projected from longitude/latitude (EPSG:4326) onto an
    Albers equal-area projection whose standard parallels are the polygon's
    southern and northern bounds; the planar area of the projected shape is
    then converted from m^2 to km^2.

    Parameters
    ----------
    coordinates : sequence of linear rings
        The ``coordinates`` member of a GeoJSON ``Polygon`` geometry. The
        first ring is the exterior boundary, every further ring is a hole.
        Each ring is a sequence of ``[longitude, latitude]`` pairs in degrees.
        NOTE(review): ``MultiPolygon`` coordinates (one extra nesting level)
        are not supported.

    Returns
    -------
    float
        Area in km^2 of the exterior ring minus the area of all holes.
        ``0.0`` if ``coordinates`` contains no rings.

    """
    # Nothing to measure; previously this case failed with an IndexError.
    if len(coordinates) == 0:
        return 0.0

    # Building one polygon with holes lets shapely subtract the hole areas and
    # guarantees that every ring is projected with the same parameters.
    polygon = Polygon(coordinates[0], coordinates[1:])

    # bounds is (minx, miny, maxx, maxy); with lon/lat input, y is the latitude.
    _, south, _, north = polygon.bounds

    equal_area_crs = pyproj.CRS.from_dict({'proj': 'aea', 'lat_1': south, 'lat_2': north})

    # Transformer replaces the deprecated Proj(init=...) / pyproj.transform pair.
    # always_xy=True keeps the (longitude, latitude) axis order used by GeoJSON.
    transformer = pyproj.Transformer.from_crs('EPSG:4326', equal_area_crs, always_xy=True)

    projected = ops.transform(transformer.transform, polygon)

    # The projected coordinates are in metres, so the area is in m^2.
    return projected.area*1e-6

In [24]:
# GeoJSON is UTF-8 encoded JSON; naming the encoding keeps the read independent
# of the platform's default locale.
with open('Postal_Codes_Coordinates_Hamburg.geojson', encoding='utf-8') as file:
    geography = json.load(file)

# Each entry of 'features' holds one geometry together with its properties.
base = geography['features']

In [25]:
# For the coordinates we extracted from the 'Postal_Codes_Coordinates_Hamburg.geojson' file, we calculate the area.
# The rows are collected in a plain list and turned into a dataframe once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
area_rows = []

for feature in base:
    coordinates = feature['geometry']['coordinates']
    area = calculate_area(coordinates)

    postal_code = feature['properties']['plz']

    area_rows.append([postal_code, area])

area_df = pd.DataFrame(area_rows, columns=['Postal_Code', 'Area'])

In [26]:
# Display the dataframe with one row per geojson feature (postal codes may repeat at this point).
area_df

Unnamed: 0,Postal_Code,Area
0,22763,2.529803
1,22301,1.282740
2,22455,5.012461
3,22145,27.882760
4,22339,5.160678
...,...,...
100,21029,6.462843
101,21039,54.834955
102,20146,0.940159
103,22299,1.285978


It should be noted that some postal codes appear more than once in this dataframe. To solve this problem, we simply sum up the areas of the postal codes that appear more than once. This point is not highly relevant, because if a postal code appears more than once, one appearance accounts for at least ~90% of the area.

In [30]:
# Collapse repeated postal codes into a single row by adding up their areas;
# as_index=False keeps 'Postal_Code' as a regular column.
area_df = area_df.groupby('Postal_Code', as_index=False).sum()

In [31]:
# Display the dataframe after grouping by postal code.
area_df

Unnamed: 0,Postal_Code,Area
0,20095,0.759284
1,20097,1.968939
2,20099,1.589162
3,20144,0.931352
4,20146,0.940159
...,...,...
94,22761,3.720915
95,22763,2.529803
96,22765,2.193986
97,22767,2.608677


In [32]:
# We save the dataframe as a csv file; index=False leaves the row index out of the file.
area_df.to_csv('Area_Postal_Codes_Hamburg.csv', index=False)

# Extracting the rental price per m^2 for every postal code

We found the rental prices per square metre on https://www.miet-check.de/mietpreise/plz/hamburg/1126/. We have saved the information on this webpage as 'Rental_Prices_Hamburg.html'. We use BeautifulSoup to extract the information on this page.

In [40]:
# The parser is named explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parse tree could differ
# between machines. 'html.parser' ships with Python.
# NOTE(review): the file is read with the platform's default text encoding — confirm
# that this matches the encoding the page was saved in.
with open('Rental_Prices_Hamburg.html','r') as html:
    soup = BeautifulSoup(html, 'html.parser')

In [41]:
# We extract the data from the webpage into a dataframe.
# The <td> cells are looked up once (instead of re-scanning the document for every access)
# and the rows are collected in a list, because DataFrame.append was removed in pandas 2.0.
cells = soup.find_all('td')

rental_rows = []

# The cells are read in groups of five — presumably one table row each (TODO confirm
# against the page): the second cell holds the postal code link, the third the price.
for i in range(len(cells) // 5):
    postal_code = int(cells[5*i + 1].a.string)
    # The last five characters are cut off before parsing the number
    # (presumably the unit suffix — verify against the page).
    rental_price = float(cells[5*i + 2].string[:-5])

    rental_rows.append([postal_code, rental_price])

rental_prices_df = pd.DataFrame(rental_rows, columns=['Postal_Code', 'Rental_Price'])

In [42]:
# Count how often each postal code occurs; any count above 1 marks a repeated postal code.
rental_prices_df['Postal_Code'].value_counts()

22111    2
21035    2
20253    2
21031    2
21073    2
        ..
22399    1
22765    1
22525    1
22607    1
22529    1
Name: Postal_Code, Length: 101, dtype: int64

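Unfortunately, it seems that certain postal codes appear more than once. The good news is that we can simply remove the repeated rows by dropping duplicates (a row only counts as a duplicate if both its postal code and its rental price match an earlier row).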

In [43]:
# Keep the first occurrence of every fully identical row and renumber the index from 0.
rental_prices_df = rental_prices_df.drop_duplicates(ignore_index=True)

In [44]:
# Display the rental prices after dropping duplicated rows.
rental_prices_df

Unnamed: 0,Postal_Code,Rental_Price
0,20038,13.26
1,20095,20.35
2,20097,15.55
3,20099,15.93
4,20144,18.45
...,...,...
96,22761,13.68
97,22763,14.75
98,22765,16.26
99,22767,15.40


In [45]:
# Save the rental prices as a csv file; index=False leaves the row index out of the file.
rental_prices_df.to_csv('Rental_Prices_Postal_Codes_Hamburg.csv', index=False)