In [1]:
import requests
import json
import re
import pandas as pd
from datetime import datetime
import concurrent.futures
import time

In [3]:
def get_today_date():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    now = datetime.now()
    return now.strftime('%Y-%m-%d')

def load_urls(load_name):
    """Build the estate-detail API URLs listed in a CSV file.

    Args:
        load_name: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            that has a ``url_id`` column.

    Returns:
        list[str]: One URL per row, formed by appending ``str(url_id)`` to
        the estates API prefix, in file order.
    """
    api_prefix = "https://www.sreality.cz/api/cs/v2/estates/"
    url_ids = pd.read_csv(load_name)['url_id']
    return [api_prefix + str(url_id) for url_id in url_ids]

def get_data(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its JSON-decoded body.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, ``requests`` waits indefinitely on a stalled
            connection, which would block a worker thread for good.

    Returns:
        Whatever ``Response.json()`` decodes from the response body. The HTTP
        status code is not inspected, so a JSON error body is returned as-is.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        ``Response.json()`` when the body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    return response.json()

def fetch_data_concurrently(urls, max_workers=2):
    """Download every URL in ``urls`` and return the decoded payloads.

    The first URL is fetched on its own and timed; that single measurement is
    the basis for the remaining-time estimate printed while the other URLs
    are fetched on a thread pool.

    Args:
        urls: Sequence of URL strings. An empty sequence yields an empty list.
        max_workers: Number of worker threads in the pool; also the divisor
            applied to the time estimate. Defaults to 2.

    Returns:
        list: One entry per URL that was fetched successfully. After the
        first URL, entries are appended in completion order, not in ``urls``
        order. URLs whose fetch raised are reported on stdout and left out.
    """
    data_list = []

    # Nothing to fetch: indexing urls[0] below would raise IndexError.
    if len(urls) == 0:
        return data_list

    # Fetch the first URL alone and time it. A failure here is reported the
    # same way as any later URL instead of aborting the whole run.
    start_time_single = time.perf_counter()
    try:
        data_list.append(get_data(urls[0]))
    except Exception as exc:
        print('%r generated an exception: %s' % (urls[0], exc))
    time_single = time.perf_counter() - start_time_single

    # Fetch the rest of the URLs concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_data, url): url for url in urls[1:]}

        for i, future in enumerate(concurrent.futures.as_completed(futures), start=1):
            url = futures[future]
            try:
                data_list.append(future.result())
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
                continue

            # i counts finished pool tasks; the +1 accounts for the first URL
            # that was fetched before the pool started.
            remaining_urls = len(urls) - (i + 1)
            # Rough estimate: single-request time scaled by what is left,
            # divided across the worker threads.
            remaining_time_estimate = time_single * remaining_urls / max_workers

            if remaining_time_estimate > 999:
                remaining_time_estimate /= 60  # Convert to minutes
                print(f"\rRemaining estimated time for fetching data from {remaining_urls} URLs: {remaining_time_estimate:.2f} minutes.", end="")
            else:
                print(f"\rRemaining estimated time for fetching data from {remaining_urls} URLs: {remaining_time_estimate:.2f} seconds.", end="")

    return data_list

def main(load_name=None):
    """Load the URL list from a CSV and fetch every listing it references.

    Args:
        load_name: Path of the CSV holding the ``url_id`` column. When left as
            ``None`` it resolves to ``'<today>_urls.csv'`` at call time. A
            default written directly in the signature would be evaluated only
            once, when the ``def`` runs, and would go stale in a kernel that
            stays open past midnight.

    Returns:
        list: The payloads returned by ``fetch_data_concurrently``.
    """
    if load_name is None:
        load_name = f'{get_today_date()}_urls.csv'
    urls = load_urls(load_name)
    data_list = fetch_data_concurrently(urls)
    return data_list


In [4]:
# Fetch every listing referenced in today's URL file; a running time estimate
# is printed on one line while the downloads proceed.
data_list = main()

Remaining estimated time for fetching data from 0 URLs: 0.00 seconds......

In [5]:
def _flatten_estate(payload):
    """Collapse one fetched payload into a single flat dictionary.

    Presumably ``payload`` is the decoded JSON dict returned by ``get_data``;
    every section is optional and is skipped when its key is absent.
    Later sections overwrite earlier keys of the same name.
    """
    record = {}

    # The listing's own API path ('_links' -> 'self' -> 'href') becomes 'url_id'.
    if '_links' in payload:
        links = payload['_links']
        if 'self' in links:
            record['url_id'] = links['self'].get('href')

    # Each entry of 'items' contributes one name -> value pair.
    if 'items' in payload:
        record.update((entry['name'], entry['value']) for entry in payload['items'])

    # Coordinates, when a 'map' section exists.
    if 'map' in payload:
        coordinates = payload['map']
        record['Latitude'] = coordinates.get('lat')
        record['Longitude'] = coordinates.get('lon')

    # Each point of interest contributes its distance under its own name,
    # so every distinct POI name ends up as its own column.
    if 'poi' in payload:
        record.update((place['name'], place['distance']) for place in payload['poi'])

    return record


# One flat record per fetched payload, in data_list order.
dict_list = [_flatten_estate(data) for data in data_list]

# Build the DataFrame in one go; keys missing from a record become NaN.
df = pd.DataFrame(dict_list)

df.head()


Unnamed: 0,url_id,Celková cena,ID zakázky,Aktualizace,Stavba,Stav objektu,Vlastnictví,Umístění objektu,Podlaží,Užitná plocha,...,Datum konání dražby,Dražební vyhláška,Posudek znalce,Velikost podílu,Velikost podílu společných prostor,Počet vlastníků,Minimální kupní cena,Termín 1. prohlídky,Termín 2. prohlídky,Plocha bazénu
0,/cs/v2/estates/847426892,4 168 000,202356.0,Dnes,Panelová,Po rekonstrukci,Osobní,Klidná část obce,6. podlaží,53,...,,,,,,,,,,
1,/cs/v2/estates/1687311436,5 673 000,,Dnes,Cihlová,Po rekonstrukci,Osobní,,1. podlaží,156,...,,,,,,,,,,
2,/cs/v2/estates/461526092,5 410 000,,Dnes,Cihlová,Po rekonstrukci,Osobní,,1. podlaží,125,...,,,,,,,,,,
3,/cs/v2/estates/741217356,4 585 000,,Dnes,Cihlová,Novostavba,Osobní,Klidná část obce,5. podlaží z celkem 5,101,...,,,,,,,,,,
4,/cs/v2/estates/63538508,3 632 000,142.0,Dnes,Skeletová,Velmi dobrý,Osobní,Klidná část obce,3. podlaží z celkem 4,52,...,,,,,,,,,,


In [6]:
# Save the flattened listings as '<today>_data.csv'; index=False keeps the
# DataFrame's row index out of the file.
filename = f'{get_today_date()}_data.csv'
df.to_csv(filename, index=False)