In [1]:
import pandas as pd

# Load the pre-extracted buildings table from the local pickle file.
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
BUILDINGS_PICKLE_PATH = "krakow-buildings.pkl"
gdf = pd.read_pickle(BUILDINGS_PICKLE_PATH)

# Data Exploration

## Columns

In [2]:
columns = sorted(gdf.columns)

# List every column (alphabetically) together with its dtype.
column_dtypes = gdf.dtypes
for col in columns:
    print(f"{col}: {column_dtypes[col]}")

3dr:height1: object
3dr:height2: object
3dr:length1: object
abandoned: object
abandoned:building: object
abandoned:power: object
access: object
addr:city: object
addr:city:simc: object
addr:country: object
addr:district: object
addr:door: object
addr:housename: object
addr:housenumber: object
addr:old_street: object
addr:place: object
addr:postcode: object
addr:province: object
addr:state: object
addr:street: object
addr:street:sym_ul: object
addr:subdistrict: object
addr:suburb: object
addr:unit: object
admin_level: object
aeroway: object
air_conditioning: object
alt_addr:street: object
alt_name: object
alt_name:en: object
amenity: object
amenity_1: object
animal_boarding: object
architect: object
architect:wikidata: object
area: object
artwork_type: object
atm: object
audio: object
automated: object
bar: object
barrier: object
beauty: object
beds: object
bell: object
bell:name: object
bells: object
bench: object
bicycle_parking: object
biergarten: object
bin: object
branch: object
br

The most promising columns are:
`area`, `type`, `building`, `building:levels`/`building:levels:aboveground`, `building:flats`

We also need to keep `id` and `geom`.

## Explore Data

In [3]:
# Summary statistics and value frequencies of the `area` tag, separated by a rule.
print(
    gdf["area"].describe(),
    "-" * 30,
    gdf["area"].value_counts(),
    sep="\n",
)

count      13
unique      1
top       yes
freq       13
Name: area, dtype: object
------------------------------
area
yes    13
Name: count, dtype: int64


In [4]:
# Summary statistics and value frequencies of the `type` tag, separated by a rule.
print(
    gdf["type"].describe(),
    "-" * 30,
    gdf["type"].value_counts(),
    sep="\n",
)

count           26
unique           2
top       building
freq            25
Name: type, dtype: object
------------------------------
type
building        25
indoor_range     1
Name: count, dtype: int64


In [5]:
# Summary statistics and value frequencies of the `building` tag, separated by a rule.
print(
    gdf["building"].describe(),
    "-" * 30,
    gdf["building"].value_counts(),
    sep="\n",
)

count     122543
unique       145
top          yes
freq       85888
Name: building, dtype: object
------------------------------
building
yes                  85888
apartments            8405
house                 4315
garage                4177
detached              3991
                     ...  
nursery                  1
technical                1
entrance_building        1
HVAC                     1
ger                      1
Name: count, Length: 145, dtype: int64


In [6]:
# Summary statistics and value frequencies of `building:levels`, separated by a rule.
print(
    gdf["building:levels"].describe(),
    "-" * 30,
    gdf["building:levels"].value_counts(),
    sep="\n",
)

count     21052
unique       37
top           1
freq       6457
Name: building:levels, dtype: object
------------------------------
building:levels
1      6457
2      4598
3      3441
4      2545
5      2031
6       544
11      311
7       248
12      190
8       157
10      113
9       103
0        95
3.5      52
4.5      35
2.5      29
14       22
16       16
5.5      15
1.5      13
13        8
15        6
17        5
0.2       3
0.8       2
0.5       2
6.5       1
3w        1
20        1
1.2       1
-1        1
3.8       1
27        1
1.1       1
1.3       1
0.3       1
1.8       1
Name: count, dtype: int64


In [7]:
# Summary statistics and value frequencies of `building:levels:aboveground`, separated by a rule.
print(
    gdf["building:levels:aboveground"].describe(),
    "-" * 30,
    gdf["building:levels:aboveground"].value_counts(),
    sep="\n",
)

count     5
unique    2
top       7
freq      3
Name: building:levels:aboveground, dtype: object
------------------------------
building:levels:aboveground
7    3
5    2
Name: count, dtype: int64


In [8]:
# Summary statistics and value frequencies of `building:flats`, separated by a rule.
print(
    gdf["building:flats"].describe(),
    "-" * 30,
    gdf["building:flats"].value_counts(),
    sep="\n",
)

count     60
unique    31
top        2
freq      10
Name: building:flats, dtype: object
------------------------------
building:flats
2      10
17      6
16      6
14      4
4       3
40      2
96      2
72      2
30      2
12      2
46      1
6       1
58      1
118     1
122     1
60      1
78      1
48      1
172     1
24      1
121     1
19      1
89      1
45      1
250     1
15      1
177     1
42      1
49      1
67      1
65      1
Name: count, dtype: int64


# Action Plan

It turns out the OSM tag data is very sparse and inconsistent, to say the least...

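I will use the `geom` column to calculate each building's footprint area. Combining this with the number of building levels (planned default: 3-4 where the tag is missing; the code below actually fills missing values with 2) and assuming every level has the same area, we can calculate the living area (floor space).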

Based on that, we will sum all living areas and derive a people-per-m^2 coefficient by dividing Kraków's total population by the total living area.

Next, we will use this coefficient to estimate each building's population and its population density per 100m^2 of building (footprint) area.

### 1. Trim dataset

We only need `id`, `geom`, `building` (renamed to `building:type`) and `building:levels`.

In [9]:
# Keep only the columns used downstream; `building` is renamed to `building:type`.
kept_columns = ["id", "geom", "building", "building:levels"]
gdf = gdf[kept_columns].rename(columns={"building": "building:type"})
gdf.head(2)

Unnamed: 0,id,geom,building:type,building:levels
0,301173473,"POLYGON ((19.89907 49.99811, 19.89908 49.99802...",apartments,
1,125811727,"POLYGON ((19.89869 49.99864, 19.89869 49.99865...",yes,5.0


### 2. Calculate buildings area

In [10]:
# Reproject in place to EPSG:2180 before measuring.
# NOTE(review): EPSG:2180 is presumably the metre-based Polish national grid, so `.area` is in m^2 - confirm.
TARGET_EPSG = 2180
gdf.to_crs(epsg=TARGET_EPSG, inplace=True)

# Footprint area of every building polygon, stored next to the geometry.
footprint_area = gdf["geom"].area
gdf["building:area"] = footprint_area
print(gdf["building:area"].describe())

count    122543.000000
mean        191.693877
std         839.324166
min           0.000239
25%          36.228647
50%          93.855768
75%         155.635441
max      100527.669597
Name: building:area, dtype: float64


### 3. We need to limit buildings to only those that are livable

We probably need to use the `building:type` column, but most buildings are tagged just "yes", and the visualization above shows that this is also the case for family homes.

In [11]:
# Print every building type with its frequency, most common first.
# value_counts() is computed once (the original recomputed it on every iteration),
# and the loop variable no longer shadows the builtin `type`.
for building_type, count in gdf["building:type"].value_counts().items():
    print(building_type, '\t', count)

yes 	 85888
apartments 	 8405
house 	 4315
garage 	 4177
detached 	 3991
residential 	 2073
service 	 1910
retail 	 1309
terrace 	 1057
garages 	 944
outbuilding 	 893
shed 	 725
roof 	 687
construction 	 677
commercial 	 591
greenhouse 	 510
industrial 	 494
office 	 458
semidetached_house 	 433
school 	 342
allotment_house 	 217
kiosk 	 207
warehouse 	 203
university 	 199
farm_auxiliary 	 192
church 	 152
kindergarten 	 130
hospital 	 109
hotel 	 92
grandstand 	 91
garbage_shed 	 81
hut 	 58
carport 	 58
dormitory 	 56
chapel 	 53
civic 	 47
ruins 	 47
public 	 37
convent 	 33
guardhouse 	 32
monastery 	 31
bunker 	 30
farm 	 27
bridge 	 25
presbytery 	 24
sports_centre 	 22
bungalow 	 22
transportation 	 22
government 	 19
toilets 	 17
trash_shed 	 16
shop 	 16
barn 	 16
museum 	 14
supermarket 	 13
storage_tank 	 11
hangar 	 10
synagogue 	 10
train_station 	 9
parking 	 9
religious 	 8
cloister 	 7
library 	 7
proposed 	 7
waste_shed 	 6
chimney 	 6
transformer_tower 	 6
college 	

#### 3.1. Definitely livable buildings

In [12]:
# `building:type` values treated as unambiguously residential.
definitely_livable_types = [
    "apartments", "house", "detached",
    "semidetached_house", "residential", "dormitory",
]

# Select those buildings into an independent copy so later edits do not touch `gdf`.
is_definitely_livable = gdf["building:type"].isin(definitely_livable_types)
definitely_livable = gdf.loc[is_definitely_livable].copy()
print(definitely_livable["building:area"].describe())

count    19273.000000
mean       310.990915
std        359.449087
min          5.506919
25%        108.148276
50%        160.780730
75%        370.081804
max       7177.797287
Name: building:area, dtype: float64


In [13]:
# Summary, value frequencies and missing-value count of `building:levels`
# for the definitely-livable subset, each separated by a rule.
print(
    definitely_livable["building:levels"].describe(),
    "-" * 30,
    definitely_livable["building:levels"].value_counts(),
    "-" * 30,
    definitely_livable["building:levels"].isna().sum(),
    sep="\n",
)

count     10834
unique       23
top           2
freq       2434
Name: building:levels, dtype: object
------------------------------
building:levels
2      2434
3      1980
4      1673
1      1620
5      1610
6       416
11      292
7       187
12      172
8       134
10       85
9        84
0        36
3.5      24
4.5      22
16       16
14       15
5.5       9
2.5       8
13        7
15        6
17        3
1.5       1
Name: count, dtype: int64
------------------------------
8439


In [14]:
# Missing level counts default to 2, the most frequent value in this subset.
definitely_livable["building:levels"] = definitely_livable["building:levels"].fillna(2)

In [15]:
# Preview the first five rows after filling the missing level counts.
definitely_livable.head(5)

Unnamed: 0,id,geom,building:type,building:levels,building:area
0,301173473,"POLYGON ((564416.744 237145.386, 564417.105 23...",apartments,2,134.073271
2,301173396,"POLYGON ((564403.357 237147.948, 564403.349 23...",apartments,2,164.338593
3,692996137,"POLYGON ((564386.02 236941.818, 564398.505 236...",apartments,2,1126.355321
4,769991350,"POLYGON ((564688.179 236921.29, 564687.175 236...",apartments,2,972.286428
5,965591141,"POLYGON ((564766.96 236962.361, 564769.784 236...",apartments,2,1108.686775


#### 3.2. Potentially livable buildings

In [16]:
# Buildings whose type is only the generic "yes" - their real use is unknown.
is_generic_type = gdf["building:type"].eq("yes")
type_yes = gdf.loc[is_generic_type].copy()
print(type_yes["building:area"].describe())

count     85888.000000
mean        139.515000
std         527.725429
min           0.000239
25%          32.328275
50%          85.299460
75%         133.871657
max      100527.669597
Name: building:area, dtype: float64


In [17]:
# Keep generic buildings whose footprint lies in [70, 800] (both bounds inclusive).
# NOTE(review): the thresholds look like a heuristic for house-sized footprints - confirm.
has_livable_footprint = type_yes["building:area"].between(70, 800)
livable_yes = type_yes[has_livable_footprint]
print(livable_yes["building:area"].describe())

count    47715.000000
mean       161.083570
std        115.524326
min         70.002324
25%         96.727293
50%        122.098278
75%        170.035039
max        799.578648
Name: building:area, dtype: float64


In [18]:
# Summary, value frequencies and missing-value count of `building:levels`
# for the size-filtered generic buildings, each separated by a rule.
print(
    livable_yes["building:levels"].describe(),
    "-" * 30,
    livable_yes["building:levels"].value_counts(),
    "-" * 30,
    livable_yes["building:levels"].isna().sum(),
    sep="\n",
)

count     3858
unique      22
top          2
freq      1045
Name: building:levels, dtype: object
------------------------------
building:levels
2      1045
3       904
1       874
4       580
5       262
6        63
3.5      27
2.5      16
0        15
4.5      12
7        12
12        9
11        9
8         7
10        7
1.5       6
5.5       4
14        2
3.8       1
6.5       1
9         1
1.8       1
Name: count, dtype: int64
------------------------------
43857


In [19]:
# Missing level counts default to 2, the most frequent value in this subset.
# Reassign rather than use `inplace=True`: `livable_yes` was produced by boolean
# filtering of `type_yes` without `.copy()`, and mutating such a derived frame in
# place can trigger SettingWithCopyWarning.
livable_yes = livable_yes.fillna({"building:levels": 2})

#### 3.3. All buildings assumed as livable

In [20]:
# Stack the size-filtered generic buildings on top of the definitely-livable ones.
livable_parts = [livable_yes, definitely_livable]
livable_buildings = pd.concat(livable_parts)

### 4. Calculate living area

In [21]:
# Convert the level counts (stored as objects) to floats.
# The raw OSM column contains non-numeric entries such as "3w", which would make a
# plain `astype(float)` raise; coerce those to NaN and fall back to the same
# default of 2 levels that is used for missing values.
levels_numeric = pd.to_numeric(livable_buildings["building:levels"], errors="coerce")
livable_buildings["building:levels"] = levels_numeric.fillna(2).astype(float)

In [22]:
# Floor space = footprint area x number of levels (every level assumed equal in area).
livable_buildings["building:floor_space"] = livable_buildings["building:area"].mul(
    livable_buildings["building:levels"]
)

In [23]:
# Show the column dtypes of the combined frame.
column_dtypes = livable_buildings.dtypes
print(column_dtypes)

# Last expression of the cell: numeric summary rendered as the cell output.
livable_buildings.describe()

id                         int64
geom                    geometry
building:type             object
building:levels          float64
building:area            float64
building:floor_space     float64
dtype: object


Unnamed: 0,id,building:levels,building:area,building:floor_space
count,66988.0,66988.0,66988.0,66988.0
mean,359258800.0,2.311498,204.213149,605.217059
std,278563500.0,1.244984,226.457557,1346.301294
min,25247420.0,0.0,5.506919,0.0
25%,162598800.0,2.0,98.697915,193.942376
50%,233372400.0,2.0,129.124953,256.981705
75%,339574000.0,2.0,199.132022,416.655447
max,1326332000.0,17.0,7177.797287,57422.378296


### 5. Calculating density

In [24]:
KRAKOW_POPULATION = 790_000

# Total floor space of all buildings assumed livable.
all_floor_space = livable_buildings["building:floor_space"].sum()
# People per unit of floor space, and the same figure scaled to 100 units.
density_coefficient = KRAKOW_POPULATION / all_floor_space
density_per_100m2 = density_coefficient * 100

print(f"{all_floor_space=:.3f}")
print(f"{density_coefficient=:.3f}")
print(f"{density_per_100m2=:.3f}") # 1.95 people per 100m2 - seems reasonable

all_floor_space=40542280.346
density_coefficient=0.019
density_per_100m2=1.949


In [25]:
# Estimated residents per building: floor space scaled by the city-wide coefficient.
livable_buildings["building:population"] = livable_buildings["building:floor_space"].mul(density_coefficient)
# Residents per 100 units of footprint area (population / area * 100).
livable_buildings["building:density_per_100m2"] = (
    livable_buildings["building:population"].div(livable_buildings["building:area"]).mul(100)
)

In [26]:
# Preview the first five rows with the new population and density columns.
livable_buildings.head(5)

Unnamed: 0,id,geom,building:type,building:levels,building:area,building:floor_space,building:population,building:density_per_100m2
18,161678891,"POLYGON ((564801.322 237252.039, 564796.226 23...",yes,2.0,180.417926,360.835852,7.031186,3.897166
19,161678957,"POLYGON ((564794.672 237277.515, 564794.533 23...",yes,2.0,254.323445,508.646891,9.911407,3.897166
24,337197358,"POLYGON ((564923.468 236804.199, 564926.911 23...",yes,2.0,116.15696,232.31392,4.52683,3.897166
74,1219360898,"POLYGON ((564936.177 235994.116, 564934.704 23...",yes,2.0,74.174188,148.348376,2.890691,3.897166
75,1219360895,"POLYGON ((564930.424 236021.749, 564928.965 23...",yes,2.0,74.55895,149.1179,2.905686,3.897166


### 6. Save results

In [27]:
# Persist the enriched frame next to the notebook for downstream use.
OUTPUT_PICKLE_PATH = "livable_buildings_with_density.pkl"
livable_buildings.to_pickle(OUTPUT_PICKLE_PATH)