In [1]:
import numpy as np
import pandas as pd
import skimage as sk
import matplotlib.pyplot as plt
import os
from PIL import Image
import random
from shutil import copyfile, rmtree
import re
import math

SOURCE_NORMAL = '/home/mniederer/workspace/data/new_material/umrisse_with_all_data.csv'
SOURCE_LOD2 = '/home/mniederer/workspace/data/Useful_LoD2_31466'

In [2]:
df_material = pd.read_csv(SOURCE_NORMAL)

lod_data = []

for file_name in os.listdir(SOURCE_LOD2):
    if(file_name.endswith('.csv')):
        new_df=pd.read_csv(os.path.join(SOURCE_LOD2, file_name))
        lod_data.append(new_df)


## Keep only important features

In [3]:
wanted = ['WKT', 'roofType', 'roofType_', 'consistsOfBuildingPart|BuildingPart|roofType']
wanted_height = [ 'measuredHeight', 'measuredHeight_']
lod2_good = []
#heights = []
for current_df in lod_data:
    keep = []
    keep_height = []
    height = 0
    for column in current_df.columns:
        if( column in wanted ):
            keep.append(column)
        if ( column in wanted_height):
            keep_height.append(column)

    lod2_good.append(current_df.loc[:,keep+ keep_height])

## Combine columns that contain info about shape and the ones that have info on height

In [4]:
for df in lod2_good:
    shapes = []  #this is will be our new column, that combines other columns
    heights = []
    value_check = df.notnull()  #boolean df that specifies where values are

    # iterate over df
    for index in range(0, df.shape[0]):
        # roof shape
        for feature in ['roofType', 'roofType_', 'consistsOfBuildingPart|BuildingPart|roofType']:
            if feature in value_check.columns:  
                if(value_check[feature][index]):
                    shapes.append(df[feature][index])
        # height
        current_heights = []
        for feature in [ 'measuredHeight', 'measuredHeight_']:
            if feature in value_check.columns:  
                if(value_check[feature][index]):
                    current_heights.append(df[feature][index])

        # clean height list
        if len(current_heights) == 0:
            current_heights.append(5.9825) #default value for missing values
        elif type(current_heights[0]) == str:
            string = current_heights[0]
            string = string[1:-1]
            current_heights = [float(idx) for idx in string.split(', ')]
        # at this point we have a list with one ore multiple float values -> take mean
        height = np.mean(current_heights)
        heights.append(height)

    df['Shape'] = shapes  # create new feature 'Shape' based in shapes-list
    df['Height'] = heights
    


# delete other columns
wanted = ['WKT', 'Shape', 'Height']
lod2_good = [ df[wanted] for df in lod2_good ]


df = pd.concat(lod2_good)
df.reset_index(inplace=True, drop=True)

In [5]:
df

Unnamed: 0,WKT,Shape,Height
0,"MULTILINESTRING ((2555053.976 5465511.281,2555...",2100.0,3.527000
1,"MULTILINESTRING ((2554928.772 5465539.13,25549...",2100.0,3.339000
2,"MULTILINESTRING ((2554979.093 5465557.841,2554...","[ 1000, 3100 ]",5.982500
3,"MULTILINESTRING ((2555042.843 5465398.968,2555...",2100.0,3.037000
4,"MULTILINESTRING ((2554915.296 5465382.438,2554...",1000.0,3.500000
...,...,...,...
22354,MULTILINESTRING ((2575829.65701334 5484152.295...,"[ 3100, 1000 ]",3.112000
22355,MULTILINESTRING ((2575906.84369191 5484192.886...,1000.0,3.914000
22356,MULTILINESTRING ((2576278.41978254 5483603.563...,1000.0,2.310000
22357,MULTILINESTRING ((2576282.48122526 5483597.625...,"[ 1000, 1000, 1000 ]",3.802333


In [6]:
def reclassify_shape(s):

    # only value, cast that value to int
    if type(s) == np.float64 or type(s) == int:
        return int(s)

    #else there is a string of a list we must parse
    list_of_shapes = [ int(x) for x in  s.strip('][').split(', ') ]

    #check for only one shape
    that_one_shape = 5000 # -1 means it did not work (because there are multiple shapes in the list)
    for shape in list_of_shapes:
        if shape in [5000,9999]: # 5000 and 9999 are of no interest to us
            continue
        elif that_one_shape == shape: # current shape equals that one shape -> all good
            continue
        elif that_one_shape == 5000: # first time that_one_shape is needed -> our current shape is that_one_shape
            that_one_shape = shape
        else: # it did not work -> abort
            that_one_shape = -1
            break
    if that_one_shape != -1:
        return that_one_shape

    # check for combinations:
    comb_131 = [1000,3100, 5000, 9999]
    comb_231 = [2100, 2200, 3100, 5000, 9999]
    comb_121 = [1000, 2100, 2200, 5000, 9999]
    comb_12131 = [1000, 2100, 2200, 3100, 5000, 9999]
    check_131 = True
    check_231 = True
    check_121 = True
    check_12131 = True
    for shape in list_of_shapes:
        if shape not in comb_131:
            check_131 = False
        if shape not in comb_231:
            check_231 = False
        if shape not in comb_121:
            check_121 = False
        if shape not in comb_12131:
            check_12131 = False
    if check_131 == True:
        return 131
    if check_231 == True:
        return 231
    if check_121 == True:
        return 121
    if check_12131 == True:
        return 12131

    # check for 60 % one shape
    counter_1000  = 0.0
    counter_2100  = 0.0
    counter_3100  = 0.0
    counter_other = 0.0
    threshold = 0.6

    for shape in list_of_shapes:
        if shape in [5000,9999]: # 5000 and 9999 are of no interest to us
            continue
        elif shape == 1000:
            counter_1000 += 1
        elif shape == 1000:
            counter_2100 += 1
        elif shape == 1000:
            counter_3100 += 1
        else:
            counter_other += 1

    sum = counter_1000 + counter_2100 + counter_3100 + counter_other
    if counter_1000 / sum >= threshold:
        return 1000
    if counter_2100 / sum >= threshold:
        return 2100
    if counter_3100 / sum >= threshold:
        return 3100

    return 5000

def calculate_mean_point(poly_string):
    def parse_polygon(s):
        coordinates = s.split(',')
        polygon = []
        for coordinate in coordinates:
            (u,v) = coordinate.split(' ')
            polygon.append((float(u), float(v)))
        return polygon

    poly_string_splitted = re.split("[\(\)]+", poly_string)
    polygon = parse_polygon(poly_string_splitted[1])

    xs = [x for (x,y) in polygon]
    ys = [y for (x,y) in polygon]

    x_mean = np.mean(xs)
    y_mean = np.mean(ys)


    return (x_mean,y_mean)

def distance(x1,y1, x2,y2):
    return np.square(x1-x2) + np.square(y1-y2)


In [7]:
df['x_mean'] = 0
df['y_mean'] = 0

for index in df.index:
    wkt_string = df['WKT'][index]
    shape = df['Shape'][index]

    (x,y) = calculate_mean_point(wkt_string)
    df['x_mean'][index] = x
    df['y_mean'][index] = y
    df['Shape'][index] = reclassify_shape(shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['x_mean'][index] = x
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y_mean'][index] = y
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Shape'][index] = reclassify_shape(shape)


In [8]:
df.head(20)

Unnamed: 0,WKT,Shape,Height,x_mean,y_mean
0,"MULTILINESTRING ((2555053.976 5465511.281,2555...",2100,3.527,2555056.0,5465514.0
1,"MULTILINESTRING ((2554928.772 5465539.13,25549...",2100,3.339,2554929.0,5465543.0
2,"MULTILINESTRING ((2554979.093 5465557.841,2554...",131,5.9825,2554973.0,5465562.0
3,"MULTILINESTRING ((2555042.843 5465398.968,2555...",2100,3.037,2555040.0,5465400.0
4,"MULTILINESTRING ((2554915.296 5465382.438,2554...",1000,3.5,2554916.0,5465384.0
5,"MULTILINESTRING ((2555035.144 5465356.474,2555...",5000,10.472,2555031.0,5465365.0
6,"MULTILINESTRING ((2555028.174 5465442.397,2555...",5000,10.679,2555021.0,5465445.0
7,"MULTILINESTRING ((2554829.543 5465523.34,25548...",1000,2.789,2554836.0,5465516.0
8,"MULTILINESTRING ((2555024.64 5465436.094,25550...",1000,2.292,2555028.0,5465439.0
9,"MULTILINESTRING ((2554992.832 5465537.194,2554...",1000,5.585,2554995.0,5465541.0


In [9]:
df_material['Shape'] = 0
df_material['Height'] = 0
length = len(df_material.index) - 1
shapes = []
heights = []

for i in df_material.index:
    print('Calculating entry {}/{}'.format(i,length))
    min_dist = math.inf
    index_from_min = 0
    (x,y) = calculate_mean_point(df_material.iat[i,0])
    for j in df.index:
        current_x = df.iat[j,3]
        current_y = df.iat[j,4]
        dist = distance(x,y,current_x,current_y)
        if dist < min_dist:
            min_dist = dist
            index_from_min = j

    shapes.append(df['Shape'][index_from_min])
    heights.append(df['Height'][index_from_min])


df_material['Shape'] = shapes
df_material['Height'] = heights

Calculating entry 0/8246
Calculating entry 1/8246
Calculating entry 2/8246
Calculating entry 3/8246
Calculating entry 4/8246
Calculating entry 5/8246
Calculating entry 6/8246
Calculating entry 7/8246
Calculating entry 8/8246
Calculating entry 9/8246
Calculating entry 10/8246
Calculating entry 11/8246
Calculating entry 12/8246
Calculating entry 13/8246
Calculating entry 14/8246
Calculating entry 15/8246
Calculating entry 16/8246
Calculating entry 17/8246
Calculating entry 18/8246
Calculating entry 19/8246
Calculating entry 20/8246
Calculating entry 21/8246
Calculating entry 22/8246
Calculating entry 23/8246
Calculating entry 24/8246
Calculating entry 25/8246
Calculating entry 26/8246
Calculating entry 27/8246
Calculating entry 28/8246
Calculating entry 29/8246
Calculating entry 30/8246
Calculating entry 31/8246
Calculating entry 32/8246
Calculating entry 33/8246
Calculating entry 34/8246
Calculating entry 35/8246
Calculating entry 36/8246
Calculating entry 37/8246
Calculating entry 38/8

## Save dataframe

In [10]:
df_material.to_csv('umrisse_with_all_data_and_LoD2.csv')

In [12]:
df_material

Unnamed: 0,WKT,mat_qgis,ezg,id,area_type,area,Shape,Height
0,MULTIPOLYGON (((2553971.12751531 5467514.32424...,Beton,Industriegebiet_Fordwerke,1,0,3862.22,1000,4.4650
1,MULTIPOLYGON (((2554018.94473681 5467539.85034...,Beton,Industriegebiet_Fordwerke,2,0,2181.13,1000,5.9825
2,MULTIPOLYGON (((2554194.44303813 5467557.74389...,Bitumen,Industriegebiet_Fordwerke,3,0,1810.98,1000,5.9825
3,MULTIPOLYGON (((2554397.94423659 5467545.61267...,Bitumen,Industriegebiet_Fordwerke,4,0,1308.61,1000,5.9825
4,MULTIPOLYGON (((2554361.34839266 5467564.01169...,Beton,Industriegebiet_Fordwerke,5,0,1547.07,1000,5.9825
...,...,...,...,...,...,...,...,...
8242,MULTIPOLYGON (((2555419.84241189 5467746.84985...,Bitumen,Industriegebiet_Fordwerke,8249,0,18305.07,1000,5.7840
8243,MULTIPOLYGON (((2575490.90357179 5483204.89096...,Metallbahn,Wohngebiet_Tholey,8250,1,67.76,1000,3.9865
8244,MULTIPOLYGON (((2574984.76380313 5482991.65824...,Ziegel,Wohngebiet_Tholey,8251,1,118.96,3100,5.6950
8245,MULTIPOLYGON (((2574995.55640273 5482997.79298...,Ziegel,Wohngebiet_Tholey,8252,1,59.53,3100,5.3080
