In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time

# Master switch for the whole notebook: when True, no cell below sends a web
# request. The bus-route, bus-stop and metro cells load their previously saved
# files from data/raw/ instead; the house-listing cell only prints a notice.
disable_scraping = True

## Get all bus routes

In [2]:
# Load the table of bus lines: from the cached pickle when scraping is
# disabled, otherwise from the dinoffentligetransport.dk API (which also
# refreshes the cached pickle).
if disable_scraping:
    print('Scraping is disabled')
    busRoutes = pd.read_pickle('data/raw/busRoutes.pkl')
else:
    url = 'https://dinoffentligetransport.dk/api/BusLines/lines'
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        # Raise instead of calling exit(): exit() ends the whole session rather
        # than surfacing an error in this cell, and would lose the kernel state.
        raise RuntimeError(f'Failed to get data: {response.status_code}')

    # Store the response in a pandas dataframe
    busRoutes = pd.DataFrame(response.json())
    busRoutes_list = busRoutes.designation.to_list()
    # The line references are what the bus-stop cell iterates over when scraping.
    busRoutes_refs = busRoutes.ref.to_list()

    busRoutes.to_pickle('data/raw/busRoutes.pkl')

print(f'Number of unique bus lines: {busRoutes.designation.nunique()}')

Scraping is disabled
Number of unique bus lines: 424


## Load all bus stops for each route

In [3]:
# Load the stop points of every bus line: from the cached JSON file when
# scraping is disabled, otherwise with one API request per line reference.
if disable_scraping:
    print('Scraping is disabled')
    with open('data/raw/busRoutes_dict.json', 'r') as f:
        busroutes_dict = json.load(f)
else:
    busroutes_dict = {}
    # dict.fromkeys() drops duplicate references while keeping their order, so
    # each reference is requested exactly once.
    for ref in dict.fromkeys(busRoutes_refs):
        response = requests.get(
            f'https://dinoffentligetransport.dk/api/buslines/fromstoppoints/{ref}/1',
            timeout=30,  # do not hang forever on an unresponsive server
        )
        # Fail loudly on an HTTP error instead of caching an error payload as
        # if it were stop-point data.
        response.raise_for_status()
        busroutes_dict[ref] = response.json()
        time.sleep(2)  # pause between requests to avoid overloading the server

    # NOTE(review): json.dump writes non-string keys as strings, so the dict
    # reloaded from this file has str keys - confirm the references are strings.
    with open('data/raw/busRoutes_dict.json', 'w') as f:
        json.dump(busroutes_dict, f)

Scraping is disabled


## House listings

In [4]:
def scrape_boligsiden(url:str, pagenumber:int):
    """Download paginated search results and collect their cases in one DataFrame.

    Parameters
    ----------
    url : str
        Search URL that already carries a query string; ``&page=<n>`` is
        appended to it for every request.
    pagenumber : int
        Exclusive upper bound of the pages to fetch: pages ``1`` up to and
        including ``pagenumber - 1`` are requested.

    Returns
    -------
    pandas.DataFrame
        One row per entry found under the ``'cases'`` key of the responses.
        Pages whose ``'cases'`` value is ``None`` contribute no rows; the frame
        is empty when no page is requested.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an error status code.
    KeyError
        If a response payload has no ``'cases'`` key.
    """
    # Headers sent with every request (requester name and mail)
    headers = {'name':'xkh771','mail':'xkh771@alumni.ku.dk'}

    pages = []
    # Use a dedicated loop variable so the `pagenumber` argument is not overwritten.
    for page in range(1, pagenumber):
        response = requests.get(f'{url}&page={page}', headers=headers, timeout=30)
        # Stop on an HTTP error instead of silently parsing an error payload.
        response.raise_for_status()
        pages.append(response.json())
        time.sleep(1)  # Sleep for 1 second to avoid overloading the server

    # Flatten the per-page case lists, skipping pages that report no cases.
    cases = [
        case
        for payload in pages
        if payload['cases'] is not None
        for case in payload['cases']
    ]

    return pd.DataFrame(cases)

In [5]:
# Scrape the house listings for the two search areas and store the combined
# result; nothing is fetched or loaded when scraping is disabled.
if not disable_scraping:
    # Search restricted to the municipalities København and Frederiksberg
    cop_url = 'https://api.boligsiden.dk/search/cases?municipalities=K%C3%B8benhavn&municipalities=Frederiksberg&addressTypes=villa%2Ccondo%2Cterraced+house%2Choliday+house%2Ccooperative%2Cfarm%2Chobby+farm%2Cfull+year+plot%2Cvilla+apartment%2Choliday+plot%2Chouseboat&per_page=50&highlighted=false&sortBy=random'
    # Search restricted to the province "Københavns omegn"
    omegn_url = 'https://api.boligsiden.dk/search/cases?provinces=K%C3%B8benhavns+omegn&addressTypes=villa%2Ccondo%2Cterraced+house%2Choliday+house%2Ccooperative%2Cfarm%2Chobby+farm%2Cfull+year+plot%2Cvilla+apartment%2Choliday+plot%2Chouseboat&per_page=50&highlighted=false&sortBy=random'

    # The second argument is the exclusive upper page bound used by scrape_boligsiden.
    df_cop = scrape_boligsiden(cop_url, 43)
    df_omegn_data = scrape_boligsiden(omegn_url, 49)

    # Stack both result sets row-wise (original row indices are kept) and persist them.
    df_raw_data = pd.concat([df_cop, df_omegn_data])
    df_raw_data.to_pickle('data/raw/df_raw_data.pkl')
else:
    print('Scraping is disabled')

Scraping is disabled


## Metro stations

In [6]:
## Metro station
# Load the metro station names: from the cached pickle when scraping is
# disabled, otherwise by parsing the <svg id="cont"> element of https://m.dk/.
if disable_scraping:
    print('Scraping is disabled')
    df_metro = pd.read_pickle('data/raw/metro.pkl')
else:
    url = 'https://m.dk/'
    # Same identification headers as the house-listing scraper (the mail
    # address previously had a typo here); the timeout prevents hanging forever.
    response = requests.get(
        url,
        headers={'name':'xkh771','mail':'xkh771@alumni.ku.dk'},
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    svg_element = soup.find('svg', {'id':'cont'})
    if svg_element is None:
        # find() returns None when nothing matches; give a clear error instead
        # of an AttributeError on the next line.
        raise RuntimeError("Could not find <svg id='cont'> on https://m.dk/")

    g_elements = svg_element.find_all('g')
    metro_stations = []
    for g in g_elements:
        a_tag = g.find('a')
        if a_tag:
            text = a_tag.text.strip()
            # Keep only non-empty link texts that start with a letter.
            if text and text[0].isalpha():
                metro_stations.append(text)

    # Replace line breaks inside the collected names with spaces.
    cleaned_text = [x.replace('\n',' ') for x in metro_stations]
    df_metro = pd.DataFrame(cleaned_text, columns=['MetroStation'])
    df_metro.to_pickle('data/raw/metro.pkl')

print(f'Number of unique metro stations: {df_metro.MetroStation.nunique()}')

Scraping is disabled
Number of unique metro stations: 44
