In [140]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from geopy.geocoders import GoogleV3, Nominatim
import folium
import geojson
import numpy as np
from shapely.geometry import shape, Point

In [8]:
# Pull settings from a local .env file into the environment, then read the
# Google geocoding key from there (None when the variable is not set).
load_dotenv()
GOOGLE_GEO_API_KEY = os.getenv('GOOGLE_GEO_API_KEY')

## Parse flat price, size, address from saved html pages

In [9]:
def find_flat_in_minsk(pages=range(1, 26), html_dir: str = 'flat') -> list:
    """Collect price, size and address of every listing in the saved HTML pages.

    Args:
        pages: iterable of page numbers; page ``n`` is read from
            ``<html_dir>/<n>.html``. Defaults to pages 1..25.
        html_dir: folder that holds the saved pages.

    Returns:
        A list of ``[price, size, address]`` rows. ``price`` is an int,
        ``size`` is 0.5 for a listing labelled 'Комната' and otherwise the
        leading digit of the label as an int, ``address`` is the raw text.

    Raises:
        OSError: if a page file cannot be opened.
        ValueError: if a price or the first character of a size label is
            not an integer.
    """
    _flats = []

    for page_number in pages:
        html_page_name = '%s/%s.html' % (html_dir, page_number)
        # Read bytes and let BeautifulSoup detect the encoding itself; text
        # mode would decode the Cyrillic content with the platform default.
        with open(html_page_name, 'rb') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')

        prices = soup.find_all(
            'span',
            attrs={'data-bind': "text: SearchApartments.formatPrice(apartment.price, 'USD')"},
        )
        flats = soup.find_all(
            'span',
            attrs={'class': 'classified__caption-item classified__caption-item_type',
                   'data-bind': 'text: SearchApartments.formatRentType(apartment.rent_type)'},
        )
        addresses = soup.find_all(
            'span',
            attrs={'class': 'classified__caption-item classified__caption-item_adress',
                   'data-bind': 'text: apartment.location.user_address'},
        )

        # zip() below stops at the shortest list, so unequal counts mean some
        # items are dropped (and rows may be paired wrongly) - make it visible.
        if not (len(prices) == len(flats) == len(addresses)):
            print('Warning: page %s has %s prices, %s flat types, %s addresses'
                  % (page_number, len(prices), len(flats), len(addresses)))

        for price, flat_size, address in zip(prices, flats, addresses):
            size_label = flat_size.text
            # A single room ('Комната') is counted as half a flat.
            rooms = 0.5 if size_label == 'Комната' else int(size_label[0])
            _flats.append([int(price.text), rooms, address.text])

    return _flats

In [158]:
# minsk_flats = find_flat_in_minsk()
# print('Now available', len(minsk_flats), 'flats in Minsk')
# minsk_flats[:5]

## Find coordinates for each flat with Google GEO API, and add them to the list (minsk_flats)

In [160]:
def find_coords(flat_list: list, geocoder=None):
    """Geocode every flat and append its latitude and longitude in place.

    Each item of ``flat_list`` is a list whose element 2 is the street
    address; ', Minsk' is appended to it before geocoding. Two values are
    always appended to the row: ``lat`` and ``lng`` rounded to 7 decimals,
    or ``None, None`` when the lookup fails or finds nothing, so every row
    keeps the same length.

    Args:
        flat_list: rows to extend; mutated in place.
        geocoder: object with a ``geocode(address)`` method returning
            something with ``latitude``/``longitude`` attributes, or None.
            Defaults to a ``GoogleV3`` client built from GOOGLE_GEO_API_KEY.

    Returns:
        None.
    """
    if geocoder is None:
        # One client for the whole run; it does not depend on the flat.
        geocoder = GoogleV3(api_key=GOOGLE_GEO_API_KEY)

    for flat in flat_list:
        lat = lng = None
        try:
            coords = geocoder.geocode(flat[2] + ', Minsk')
            if coords is None:
                print('Exception in google_location: no result for %s' % flat[2])
            else:
                lat = round(float(coords.latitude), 7)
                lng = round(float(coords.longitude), 7)
        except Exception as ex:
            # Best effort: report the failure and carry on with the next flat.
            print('Exception in google_location: %s' % ex)

        flat.append(lat)
        flat.append(lng)

In [161]:
# find_coords(minsk_flats)
# minsk_flats[:5]

## Save parsed data to csv

In [159]:
# columns = ['price', 'flat-size', 'address', 'lat', 'lng']
# df_flats = pd.DataFrame(data=minsk_flats, columns=columns)
# df_flats.head()

In [157]:
# df_flats.to_csv('minsk_flats.csv')

### open csv

In [199]:
# The CSV was written with its index, which comes back as 'Unnamed: 0'; discard it.
df_flats = pd.read_csv('minsk_flats.csv').drop(columns=['Unnamed: 0'])
print(df_flats.shape)
df_flats.head()

(900, 5)


Unnamed: 0,price,flat-size,address,lat,lng
0,295,1.0,"Чернышевского, 7",53.926984,27.600406
1,900,4.0,"Калинина, 7А",53.92642,27.605746
2,470,2.0,"Мельникайте, 16",53.909619,27.544978
3,380,1.0,"Академика Фёдорова, 3",53.872958,27.633268
4,300,2.0,"Волоха, 7 к1",53.896907,27.521841


## Add an empty 'area' column to df_flats

In [207]:
# Number of flats (rows) in the table.
len_df_flats = len(df_flats)
len_df_flats

900

In [201]:
# Add an empty 'area' column to be filled with the neighborhood of each flat.
# assign() replaces the column if it already exists, so re-running this cell
# does not pile up duplicate 'area' columns the way concat() did.
df_flats = df_flats.assign(area=None)
df_flats.head()

Unnamed: 0,price,flat-size,address,lat,lng,area
0,295,1.0,"Чернышевского, 7",53.926984,27.600406,
1,900,4.0,"Калинина, 7А",53.92642,27.605746,
2,470,2.0,"Мельникайте, 16",53.909619,27.544978,
3,380,1.0,"Академика Фёдорова, 3",53.872958,27.633268,
4,300,2.0,"Волоха, 7 к1",53.896907,27.521841,


## for each flat find neighborhood from geojson polygons

In [202]:
# Path to the GeoJSON file with the Minsk neighborhood polygons.
minsk_areas = 'minsk_areas.geojson'

In [203]:
def open_geojson():
    """Fill ``df_flats['area']`` with the neighborhood that contains each flat.

    Reads the GeoJSON file named by the module-level ``minsk_areas`` and, for
    every row of the module-level ``df_flats``, tests the point (lng, lat)
    against every feature polygon. When a polygon contains the point, the
    feature's ``properties['neighborhood']`` is written to the row's 'area'
    cell; if several polygons match, the last one in the file wins. Rows that
    fall inside no polygon are left unchanged.

    The row position is used as the index label, so ``df_flats`` is expected
    to have the default 0..n-1 index.

    Returns:
        None; ``df_flats`` is modified in place.
    """
    with open(minsk_areas, encoding="utf8") as f:
        data = geojson.load(f)

    # Build every polygon once instead of once per flat.
    polygons = [(shape(feature['geometry']), feature) for feature in data['features']]

    # Read coordinates by column name rather than by column position, and
    # snapshot them before the frame is written to.
    coordinates = list(zip(df_flats['lat'], df_flats['lng']))

    for index, (flat_lat, flat_lng) in enumerate(coordinates):
        # shapely points are (x, y), i.e. (longitude, latitude).
        point = Point(flat_lng, flat_lat)

        for polygon, feature in polygons:
            if polygon.contains(point):
                df_flats.at[index, 'area'] = feature['properties']['neighborhood']


In [204]:
%%time
# Write the matching neighborhood into df_flats['area'] (in place), then preview the first rows.
open_geojson()
df_flats.head()

CPU times: user 3.71 s, sys: 319 µs, total: 3.71 s
Wall time: 3.71 s


Unnamed: 0,price,flat-size,address,lat,lng,area
0,295,1.0,"Чернышевского, 7",53.926984,27.600406,
1,900,4.0,"Калинина, 7А",53.92642,27.605746,Челюскинтцев
2,470,2.0,"Мельникайте, 16",53.909619,27.544978,Центр
3,380,1.0,"Академика Фёдорова, 3",53.872958,27.633268,западный посёлок
4,300,2.0,"Волоха, 7 к1",53.896907,27.521841,Розы люксембург


In [206]:
# Missing values per column; 'area' counts the flats that matched no polygon.
df_flats.isna().sum()

price          0
flat-size      0
address        0
lat            0
lng            0
area         115
dtype: int64

In [213]:
# Mean price, size and coordinates per neighborhood. numeric_only=True leaves
# out the text 'address' column; without it pandas >= 2.0 raises TypeError
# when it tries to average strings.
areas_grouped = df_flats.groupby('area').mean(numeric_only=True).reset_index()
print(areas_grouped.shape)
areas_grouped.head()

(58, 5)


Unnamed: 0,area,price,flat-size,lat,lng
0,Академии наук,818.0,2.5,53.923316,27.599986
1,Аэродромная,418.894737,1.855263,53.878749,27.54842
2,Брилевичи,424.545455,1.636364,53.851028,27.484192
3,Велозавод,140.0,0.5,53.885515,27.587505
4,Восгоградская,300.0,1.0,53.930212,27.617529


## Normalize flat price: USD per room

In [220]:
# Price per room for each neighborhood: mean price / mean flat size, 1 decimal.
avg_price = (areas_grouped['price'] / areas_grouped['flat-size']).round(1)
flat_norm = pd.DataFrame({'area': areas_grouped['area'], 'avg-price': avg_price})
flat_norm.head()

Unnamed: 0,area,avg-price
0,Академии наук,327.2
1,Аэродромная,225.8
2,Брилевичи,259.4
3,Велозавод,280.0
4,Восгоградская,300.0


## Plot result

In [221]:
# Look up the coordinates of Minsk to centre the map on.
address = 'Minsk, BY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geographical coordinate of Minsk are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Minsk are 53.902334, 27.5618791.


In [222]:
# Base map centred on the coordinates found for 'Minsk, BY'.
map_minsk_flat = folium.Map(location=[latitude, longitude], zoom_start=12)

# Colour the polygons from minsk_areas by flat_norm['avg-price']; rows are
# matched to polygons through the 'area' column and each feature's
# 'neighborhood' property.
choropleth = folium.Choropleth(
    geo_data=minsk_areas,    # path to the neighborhood GeoJSON file
    data=flat_norm,
    columns=['area', 'avg-price'],
    key_on='feature.properties.neighborhood',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Flat Price',
).add_to(map_minsk_flat)

# Tooltip that lists the 'neighborhood' property of each feature.
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['neighborhood'])
)

# Last expression of the cell, so the notebook renders the map.
map_minsk_flat