In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import os
import json

## scrape

### 1.1 mapping

In [2]:
links = []
for page in range(1,201,1):
    url = f'https://www.boligsiden.dk/tilsalg?sortAscending=true&page={page}'
    links.append(url)

In [3]:
links

['https://www.boligsiden.dk/tilsalg?sortAscending=true&page=1',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=2',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=3',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=4',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=5',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=6',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=7',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=8',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=9',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=10',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=11',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=12',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=13',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=14',
 'https://www.boligsiden.dk/tilsalg?sortAscending=true&page=15',
 'https://www.boligsiden.dk/tilsal

In [4]:
# Create a new list
houses = []
for url in links:
    # Connects to site
    response = requests.get(url, headers={'name':'He SHI','email':'pvs237@alumni.ku.dk'})
    
    #Parse data with BeautifulSoup 
    soup = BeautifulSoup(response.content, 'lxml')

    # Identify houses to scrape by inspecting site 
    houses += soup.find_all('div', class_ = 'overflow-hidden relative shadow-card rounded-sm grid grid-cols-1 grid-rows-1 sm:grid-cols-12 sm:grid-rows-1 border border-gray-100 bg-background')

    # sleep for 0.5 seconds
    time.sleep(0.5) #Sleep for 0.5 seconds

In [5]:
houses
len(houses)

10000

We then create a list of URLs that we want to scrape

In [6]:
# Create an empty list
list_of_house_urls = []

# Creating a loop that appends the article url to the list above
for i in range(len(houses)):
    list_of_house_urls.append(houses[i].find('a')['href'])

In [7]:
list_of_house_urls

['/adresse/stengaardsvaenge-85-2800-kongens-lyngby-01590814__85_______?udbud=fc0eb21b-90ee-45cf-9baf-3f3652950ec2',
 '/viderestillingaabenthus/71c02557-0069-479a-952d-fb59bb387b0c',
 '/adresse/ryes-passage-8-1-th-9000-aalborg-08516873___8__1__th?udbud=3f21f1d4-248d-4eca-8a76-f18ebcfe9e37',
 '/adresse/samsoegade-37-3-th-8000-aarhus-c-07517065__37__3__th?udbud=88acedec-41b5-45a7-a9d4-7bded7b83f04',
 '/viderestillingaabenthus/29813355-1fa4-46ac-88ef-dae921f29254',
 '/adresse/lilleaavej-73-7500-holstebro-06613700__73_______?udbud=1b89f25a-856e-45da-bcfa-03e1a243f6f1',
 '/viderestillingaabenthus/d90fffe4-5bfd-4022-99bb-6d893ea2254d',
 '/adresse/vestergade-4-2-tv-6580-vamdrup-06211055___4__2__tv?udbud=d417ce24-1f53-46eb-a775-5cdd1202fbfc',
 '/viderestillingaabenthus/c97d73c0-a76f-4e72-8bed-71f8d575f058',
 '/adresse/horns-rev-27-6857-blaavand-05730541__27_______?udbud=8aafddfa-6859-4e32-acfd-54ae1bc5f8c0',
 '/adresse/noeddehoej-226-2990-nivaa-02100534_226_______?udbud=405c233e-0827-4e4a-9047-

Some of the links are not to articls, then we...

In [8]:
list_of_house_urls_final = []
for link in list_of_house_urls:
    if '/adresse' in link: #All article URLs have this string in them, so we restrict on it being in the URL
        list_of_house_urls_final.append(link)

### 1.2 Downloading + 1.3 Parsing

In [9]:
# Define the log function to gather the log information
def log(response,logfile,output_path=os.getcwd()):
    # Open or create the csv file
    if os.path.isfile(logfile): #If the log file exists, open it and allow for changes     
        log = open(logfile,'a')
    else: #If the log file does not exist, create it and make headers for the log variables
        log = open(logfile,'w')
        header = ['timestamp','status_code','length','output_file']
        log.write(';'.join(header) + "\n") #Make the headers and jump to new line
        
    # Gather log information
    status_code = response.status_code #Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #Local time
    length = len(response.text) #Length of the HTML-string
    
    # Open the log file and append the gathered log information
    with open(logfile,'a') as log:
        log.write(f'{timestamp};{status_code};{length};{output_path}' + "\n") #Append the information and jump to new line

In [10]:
# Creatig empty list for the infomation we want to extract for every article
house_data_list=[]
logfile = 'log.csv'
for i in range(1000): #len(list_of_house_urls_final)
    try:
        # This time we scrape for each house in the url list we created before
        url = 'https://www.boligsiden.dk' + list_of_house_urls_final[i] #The scraped links are relative, so we need to add the base url
        response = requests.get(url)
        soup = BeautifulSoup(response.content,'lxml')

        # creat a dictionary
        house_data = {}
    
        # Append price to list
        temp = soup.find('div', class_='flex flex-row justify-center items-center space-x-2')
        temp = temp.text.strip()
        house_data['price'] = temp
    
        # Append address to list
        temp = soup.find('div', class_='font-bold text-sm md:text-base') 
        temp = temp.text.strip()
        house_data['address'] = temp
    
        # Append area to list
        temp = soup.find('div', class_='mt-1 text-xs md:text-sm text-gray-600')
        temp = temp.text.strip()
        house_data['area'] = temp

        # Append city to list
        temp = soup.find('span', class_='inline-flex')
        temp = temp.text.strip()
        house_data['city'] = temp

        # Append type to list
        temp = soup.find('span', class_='text-black text-sm pr-2')
        temp = temp.text.strip()
        house_data['type'] = temp

        # Append other data to list 
        temp = soup.find('div', class_='py-5 px-2 md:px-6 grid text-sm grid-cols-2')
        temp = temp.text.strip()
        house_data['data'] = temp

        # Find days till sale
        temp = soup.find('div', class_='pb-0.5 border-dashed border-b border-gray-300 cursor-pointer text-sm text-gray-800') 
        temp = temp.text.strip()
        house_data['saledays'] = temp

        # Find energy ranking
        temp = soup.find('svg', id='Lag_1') 
        temp = temp.text.strip()
        house_data['energy'] = temp

        # Append the house data dictionary to the list
        house_data_list.append(house_data)
    
        # Time sleep
        time.sleep(0.5) #Sleep for 0.5 seconds

        # Logging
        log(response,logfile)

    except Exception as e:
        print(url) #Print url
        print(e) #Print error
        continue


https://www.boligsiden.dk/adresse/lilleaavej-73-7500-holstebro-06613700__73_______?udbud=1b89f25a-856e-45da-bcfa-03e1a243f6f1
'NoneType' object has no attribute 'text'
https://www.boligsiden.dk/adresse/helga-pedersens-gade-109-8-th-8000-aarhus-c-07513178_109__8__th?udbud=54a51d3c-33c1-465a-b92b-f58fdc670f6f
'NoneType' object has no attribute 'text'
https://www.boligsiden.dk/adresse/jeanettevej-6-4593-eskebjerg-03260769___6_______?udbud=08872096-327f-4806-b642-b181106d6f89
'NoneType' object has no attribute 'text'
https://www.boligsiden.dk/adresse/p.baatrupsvej-53-8300-odder-07275525__53_______?udbud=48dcb679-9883-4c3d-bcfb-2ac8a34f6f9b
'NoneType' object has no attribute 'text'
https://www.boligsiden.dk/adresse/roedkaelkevej-1-9300-saeby-08131595___1_______?udbud=7250ded2-81b5-4e88-8a2b-d6775182e934
'NoneType' object has no attribute 'text'
https://www.boligsiden.dk/adresse/gilmoseparken-43-7400-herning-06572040__43_______?udbud=b7794366-db2a-43d1-8e73-461db27caca4
'NoneType' object has

In [78]:
house_data_list

[{'price': '4.890.000 kr.',
  'address': 'Stengårdsvænge 85',
  'area': '2800 Kongens Lyngby',
  'city': 'Københavns omegn|',
  'type': 'Rækkehus/Villa',
  'data': '81 m²Grund: 301 m²4 værelserEjerudg.: 3.614 kr/mdOpført 1955Se flere detaljer',
  'saledays': 'Til salg i alt: 0 dag',
  'energy': 'Energimærke C'},
 {'price': '2.795.000 kr.',
  'address': 'Ryes Passage 8, 1. th.',
  'area': '9000 Aalborg',
  'city': 'Nordjylland|',
  'type': 'Ejerlejlighed',
  'data': '97 m²Grund: - m²4 værelserEjerudg.: 2.237 kr/mdOpført 1915Se flere detaljer',
  'saledays': 'Til salg i alt: 0 dag',
  'energy': 'Energimærke C'},
 {'price': '2.625.000 kr.',
  'address': 'Samsøgade 37, 3. th.',
  'area': '8000 Aarhus C',
  'city': 'Østjylland|',
  'type': 'Ejerlejlighed',
  'data': '57 m²Grund: - m²2 værelserEjerudg.: 1.575 kr/mdOpført 1898Se flere detaljer',
  'saledays': 'Til salg i alt: 0 dag',
  'energy': 'Energimærke D'},
 {'price': '695.000 kr.',
  'address': 'Vestergade 4, 2. tv.',
  'area': '6580 V

In [79]:
df = pd.DataFrame(house_data_list)

In [80]:
csv_file_path = 'original_data.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-16')

In [81]:
def extract_numeric_values(entry):
    numeric_values = re.findall(r'\d+\.\d+|\d+', entry)
    return [float(value.replace('.', '')) if '.' in value else int(value) for value in numeric_values]

# split "data"
data_list = df['data'].tolist()
columns = ['living_space', 'ground_space', 'rooms', 'owner_expenses', 'year']
extracted_data = [extract_numeric_values(entry) for entry in data_list]
extracted_data_df = pd.DataFrame(extracted_data, columns=columns)
df = pd.concat([df, extracted_data_df], axis=1)
df.drop('data', axis=1, inplace=True)


In [82]:
df[['area_code', 'area_name']] = df['area'].str.split(' ', 1, expand=True)
df.drop(columns=['area'], inplace=True)
df['city'] = df['city'].str.replace('|', '')
df['price'] = df['price'].str.replace('.', '')
df['price'] = df['price'].str.replace(r'\s*kr.*', '', regex=True)
df['saledays'] = df['saledays'].str.extract(r'(\d+)').astype(float)

  df[['area_code', 'area_name']] = df['area'].str.split(' ', 1, expand=True)
  df['city'] = df['city'].str.replace('|', '')
  df['price'] = df['price'].str.replace('.', '')


In [83]:
df['ground_space'] = df['ground_space'].fillna(0)
df = df.dropna()   

In [84]:
type_counts = df['type'].value_counts()
print(type_counts)

Villa                      481
Rækkehus                    68
Landejendom                 34
Villa/Fritidsbolig          26
Andelsbolig                  9
Ejerlejlighed                6
Villalejlighed               6
Landejendom/Landejendom      5
Rækkehus/Villa               1
Fritidsbolig                 1
Landejendom/Villa            1
Name: type, dtype: int64


In [85]:
values_to_replace = ['Villalejlighed', 'Rækkehus/Villa', 'Fritidsbolig', 'Landejendom/Villa']
replacement_value = 'Villa'
df['type'] = df['type'].replace(values_to_replace, replacement_value)

In [86]:
type_counts = df['type'].value_counts()
print(type_counts)

Villa                      490
Rækkehus                    68
Landejendom                 34
Villa/Fritidsbolig          26
Andelsbolig                  9
Ejerlejlighed                6
Landejendom/Landejendom      5
Name: type, dtype: int64


In [87]:
dummies = pd.get_dummies(df['type'])

# create dummy only run one time
df = pd.concat([df, dummies], axis=1)

df

Unnamed: 0,price,address,city,type,saledays,energy,living_space,ground_space,rooms,owner_expenses,year,area_code,area_name,Andelsbolig,Ejerlejlighed,Landejendom,Landejendom/Landejendom,Rækkehus,Villa,Villa/Fritidsbolig
0,4890000,Stengårdsvænge 85,Københavns omegn,Villa,0.0,Energimærke C,81,301,4.0,3614.0,1955.0,2800,Kongens Lyngby,0,0,0,0,0,1,0
3,695000,"Vestergade 4, 2. tv.",Sydjylland,Ejerlejlighed,127.0,Energimærke C,90,304,3.0,1198.0,1900.0,6580,Vamdrup,0,1,0,0,0,0,0
4,4998000,Horns Rev 27,Sydjylland,Villa,0.0,Energimærke E,151,4714,5.0,5464.0,1985.0,6857,Blåvand,0,0,0,0,0,1,0
5,2995000,Nøddehøj 226,Nordsjælland,Rækkehus,0.0,Energimærke B,90,211,3.0,2902.0,1993.0,2990,Nivå,0,0,0,0,1,0,0
6,10950000,Skovbakkevej 24,Nordjylland,Villa,0.0,Energimærke D,248,926,6.0,5710.0,1894.0,9000,Aalborg,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,1250000,Borgergade 89,Vest- og Sydsjælland,Villa,3.0,Energimærke F,132,892,4.0,2598.0,1978.0,4241,Vemmelev,0,0,0,0,0,1,0
820,1775000,Polarvej 2,Sydjylland,Villa,3.0,Energimærke C,120,1621,3.0,2904.0,1986.0,7100,Vejle,0,0,0,0,0,1,0
821,3495000,"Lundebjerggårdsvej 1E, st. tv.",Københavns omegn,Ejerlejlighed,3.0,Energimærke B,110,4241,3.0,3254.0,2006.0,2740,Skovlunde,0,1,0,0,0,0,0
823,4700000,Ahornvej 204,Fyn,Villa,3.0,Energimærke C,170,9839,6.0,4432.0,1747.0,"Thurø,",5700 Svendborg,0,0,0,0,0,1,0


In [90]:
# Step 1: Replace all non-numeric values in the "area_code" column with NaN
df['area_code'] = pd.to_numeric(df['area_code'], errors='coerce')

# Step 2: Extract numbers from the "area_name" column
df['extracted_area'] = df['area_name'].str.extract(r'(\d+)')

# Step 3: Fill "area_code" column based on conditions
df['area_code'] = df.apply(lambda row: row['extracted_area'] if pd.isnull(row['area_code']) else row['area_code'], axis=1)

# Drop the intermediate "extracted_area" column
df.drop('extracted_area', axis=1, inplace=True)

# Print the processed DataFrame
df

Unnamed: 0,price,address,city,type,saledays,energy,living_space,ground_space,rooms,owner_expenses,year,area_code,area_name,Andelsbolig,Ejerlejlighed,Landejendom,Landejendom/Landejendom,Rækkehus,Villa,Villa/Fritidsbolig
0,4890000,Stengårdsvænge 85,Københavns omegn,Villa,0.0,Energimærke C,81,301,4.0,3614.0,1955.0,2800.0,Kongens Lyngby,0,0,0,0,0,1,0
3,695000,"Vestergade 4, 2. tv.",Sydjylland,Ejerlejlighed,127.0,Energimærke C,90,304,3.0,1198.0,1900.0,6580.0,Vamdrup,0,1,0,0,0,0,0
4,4998000,Horns Rev 27,Sydjylland,Villa,0.0,Energimærke E,151,4714,5.0,5464.0,1985.0,6857.0,Blåvand,0,0,0,0,0,1,0
5,2995000,Nøddehøj 226,Nordsjælland,Rækkehus,0.0,Energimærke B,90,211,3.0,2902.0,1993.0,2990.0,Nivå,0,0,0,0,1,0,0
6,10950000,Skovbakkevej 24,Nordjylland,Villa,0.0,Energimærke D,248,926,6.0,5710.0,1894.0,9000.0,Aalborg,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,1250000,Borgergade 89,Vest- og Sydsjælland,Villa,3.0,Energimærke F,132,892,4.0,2598.0,1978.0,4241.0,Vemmelev,0,0,0,0,0,1,0
820,1775000,Polarvej 2,Sydjylland,Villa,3.0,Energimærke C,120,1621,3.0,2904.0,1986.0,7100.0,Vejle,0,0,0,0,0,1,0
821,3495000,"Lundebjerggårdsvej 1E, st. tv.",Københavns omegn,Ejerlejlighed,3.0,Energimærke B,110,4241,3.0,3254.0,2006.0,2740.0,Skovlunde,0,1,0,0,0,0,0
823,4700000,Ahornvej 204,Fyn,Villa,3.0,Energimærke C,170,9839,6.0,4432.0,1747.0,5700.0,5700 Svendborg,0,0,0,0,0,1,0


In [72]:
csv_file_path = 'cleaned_data.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-16')

In [74]:
excel_file_path = 'cleaned_data_excel.xlsx'
df.to_excel(excel_file_path, index=False, encoding='utf-16')
