### Import
Import relevant packages

In [1]:
import time # set time when scraping
import requests # Request HTML code
import os
import tqdm # To get status line when scraping
import pandas as pd # To work with data in dataframes
from bs4 import BeautifulSoup # Beautiful Soup to parse HTML code

### Functions
Define functions that will be used in the code

#### Function to log scraping

In [2]:
# Define the log function to gather the log information
def log(response,logfile,output_path=os.getcwd()):
    # Open or create the csv file
    if os.path.isfile(logfile): #If the log file exists, open it and allow for changes     
        log = open(logfile,'a')
    else: #If the log file does not exist, create it and make headers for the log variables
        log = open(logfile,'w')
        header = ['timestamp','status_code','length','output_file']
        log.write(';'.join(header) + "\n") #Make the headers and jump to new line
        
    # Gather log information
    status_code = response.status_code #Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #Local time
    length = len(response.text) #Length of the HTML-string
    
    # Open the log file and append the gathered log information
    with open(logfile,'a') as log:
        log.write(f'{timestamp};{status_code};{length};{output_path}' + "\n") #Append the information and jump to new line

### List of links
Create a list of links by looping through the pages

In [3]:
# Make empty list
links = []

# Loop through pages
for page in range(1,5,1):
    url = f"https://api.boliga.dk/api/v2/search/results?sort=views-d&page={page}" #Create URL
    links.append(url) # append url to links
    
links

['https://api.boliga.dk/api/v2/search/results?sort=views-d&page=1',
 'https://api.boliga.dk/api/v2/search/results?sort=views-d&page=2',
 'https://api.boliga.dk/api/v2/search/results?sort=views-d&page=3',
 'https://api.boliga.dk/api/v2/search/results?sort=views-d&page=4']

### Extract HTML CODE
1. Create an empty list of HTMLS
2. Loop through list of urls and get html relate to that url
3. append html text to list og HTMLs
4. set sleep timer


In [4]:
# Creates empty list of jsonfiles
json_list = []

# Create logfile
logfile = 'log.csv'

# Loop through links
for url in tqdm.tqdm(links):
    response = requests.get(url,headers = {'name': 'Nicoline Lund Dahl', 'email': 'dmg934@alumni.ku.dk'},
                       verify = False)
    json=response.json() # Turn into json
    json_list.append(json)
    time.sleep(1)
    log(response,logfile)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.83s/it]


In [22]:
# Turn data into a json
# json_list[0].keys()
# json_list[0]
# result_data = result['results']

# # Create df
# df_bolig = pd.DataFrame(result_data)

# df_bolig
json_list[1]['results']

# Udvælger kun relevante elementer i listen dvs. hvor der er et id
json_list[3]['results']

bolig_df = pd.DataFrame()
for i in range(len(json_list)):
    df = pd.DataFrame(json_list[i]['results'])
    # Append df to empty df
    bolig_df = pd.concat([bolig_df, df])

pd.set_option('display.max_columns', None)
bolig_df   

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse
0,1844515,55.67386,12.57692,1,0,d,,100000000,False,12.0,383,213,,1757,København K,False,True,101,1472,Ny Kongensgade 3,261096.0,1,1007,2021-11-11T23:02:32.000Z,False,"[{'id': 1844515, 'date': '2024-08-15T13:01:28....",0,13972,105,False,94036,26472,171,628079F8-F05F-4D4D-A5B4-1AF913E20B7C,,,5000000,0,,,,2024-08-14T22:39:51.557Z,80.0,False,6027332.0,0,ny-kongensgade-3-1472-koebenhavn-k,False,Ny Kongensgade,,,False,False,"{'leisureHouses': [], 'houses': []}"
1,2124268,55.86242,12.48589,1,0,D,,33000000,False,9.0,274,22071,,1934,Hørsholm,False,True,230,2970,Kirsebærvej 8,120437.0,3,37,2024-07-09T00:07:55.980Z,False,"[{'id': 2124268, 'date': '2024-08-15T13:01:28....",152039,12294,52,False,27527,17998,609,F1B6C7CC-AA4F-43AD-B744-BCEA80F993E4,,,1650000,0,,,,2024-08-14T23:34:32.110Z,153.0,False,2361211.0,1399883946,kirsebaervej-8-2970-hoersholm,False,Kirsebærvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
2,2082241,55.76980,12.57548,1,0,C,,78000000,False,12.0,456,5252,,1926,Klampenborg,False,True,157,2930,Klampenborgvej 35C,171052.0,2,153,2024-03-14T23:03:35.433Z,False,"[{'id': 2082241, 'date': '2024-08-15T13:01:28....",0,37909,195,False,25726,25254,419,7A2467F3-FCA9-4D3E-98FF-2AAC6AB0D587,,,3900000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,2038042.0,588843670,klampenborgvej-35c-2930-klampenborg,False,Klampenborgvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
3,2088714,55.92461,12.44494,6,-13,J,,42000000,False,9.0,375,40946,,2019,Kokkedal,False,True,210,2980,Fredensborg Kongevej 42,112000.0,3,139,2024-03-28T23:02:51.050Z,False,"[{'id': 2088714, 'date': '2024-08-15T13:01:28....",0,11780,241,False,24388,26800,419,54DCADE6-8FA5-4652-B64F-65A182758CDD,,,2100000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,7662613.0,0,fredensborg-kongevej-42-2980-kokkedal,False,Fredensborg Kongevej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
4,1990622,55.77462,12.46564,1,-19,E,,58000000,False,14.0,704,3423,,1882,Kongens Lyngby,False,True,173,2800,"Nybrovej 375, Tusculum",82386.0,2,470,2023-05-02T22:05:41.047Z,False,"[{'id': 1990622, 'date': '2024-08-15T13:01:28....",262158,54937,196,False,23277,26763,419,E3313E6F-F2AD-4C6C-834E-94BABF4ACF65,,,2900000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,8864741.0,2120502893,nybrovej-375-2800-kongens-lyngby,False,Nybrovej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,2106654,54.91609,9.39523,1,0,J,,3795000,False,4.0,156,19832,,2016,Padborg,False,True,580,6330,"Vilsbækvej 37A, Vilsbæk",24326.0,8,83,2024-05-23T23:45:23.230Z,False,"[{'id': 2106654, 'date': '2024-08-15T13:01:34....",0,1952,0,False,3486,1080,9,A5D0A25C-E214-42C3-9DF1-817E4962248F,,,190000,0,,,,2024-08-14T23:40:34.143Z,0.0,False,100016430.0,12223021,vilsbaekvej-37a-6330-padborg,False,Vilsbækvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
46,1684136,55.89851,11.12725,8,0,-,,375000,False,0.0,0,1238,,0,Sejerø,False,True,326,4592,Knoldebjerg 19,0.0,6,1490,2020-07-16T22:03:28.983Z,False,"[{'id': 1684136, 'date': '2024-08-15T13:01:34....",1958,344,0,False,3481,25818,667,,,,25000,0,F2EDFFE0-5351-46C7-A3D1-9EE1C850186C,,,2024-08-14T22:31:01.637Z,,False,2455625.0,1422174542,knoldebjerg-19-4592-sejeroe,False,Knoldebjerg,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
47,626139,56.56825,8.16081,1,-19,E,,1295000,False,10.0,253,2442,,1904,Lemvig,False,True,665,7620,"Harboørevej 1, Strande",5118.0,9,4831,2011-05-25T02:57:51.530Z,True,"[{'id': 626139, 'date': '2024-08-15T13:01:34.0...",0,1542,9,False,3476,540,9,F90D5605-3142-4E2E-8F20-875FEF10F4D6,Nybolig Lemvig,,65000,0,0A3F50BE-6EE1-32B8-E044-0003BA298018,,,2024-08-14T23:41:59.103Z,0.0,False,5007012.0,133843946,harbooerevej-1-7620-lemvig,False,Harboørevej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."
48,1929257,55.91833,11.56405,6,0,C,,13200000,False,6.0,316,351611,,1868,Højby,False,True,306,4573,"Gudmindrupvej 3, Gudmindrup",41772.0,6,706,2022-09-08T22:03:40.037Z,False,"[{'id': 1929257, 'date': '2024-08-15T13:01:34....",0,3244,0,False,3449,25226,419,A116BA1B-CC4F-488F-9338-CDF04ADA91E6,,,660000,0,0A3F50AF-2C98-32B8-E044-0003BA298018,,,2024-08-14T22:31:01.637Z,0.0,False,1313696.0,1936749576,gudmindrupvej-3-4573-hoejby,False,Gudmindrupvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'..."


## Data Wrangling
Wrangle data with properties so it can be used for analysis

In [None]:
# PropertyType
# 1 = Villa
# 2 = Rækkehus
# 3 = Ejerlejlighed
# 4 = Fritidshus
# 5 = Andelsbolig
# 6 = Landejendom
# 7 = Helårsgrund
# 8 = Fritidsgrund
# 9 = Villalejlighed
# Andet (0 eller 10?)

# Area = Landsdel
# 1 = København By
# 2 = Københavns omegn
# 3 = Nordsjælland
# 4 = Bornholm
# 5 = Østsjælland
# 6 = Vest- og Sydsjælland
# 7 = Syddanmark
# 8 = Sydjylland
# 9 = Midtjylland
# 10 = Vestjylland
# 11 = Nordjylland
# 0 = missing?

# Exp = ejerudgifter

In [10]:
bolig_df.to_csv('boliger_salg.csv', sep=',', index=False, encoding='utf-8')

In [23]:
# Danner dummy for vej
bolig_df['vej'] = bolig_df['cleanStreet'].str.contains('vej', case=False).astype(int)

# Danner dummy for alle
bolig_df['alle'] = bolig_df['cleanStreet'].str.contains('alle', case=False).astype(int)

# Danner dummy for gade
bolig_df['gade'] = bolig_df['cleanStreet'].str.contains('gade', case=False).astype(int)

# Danner dummy for alle
bolig_df['boulevard'] = bolig_df['cleanStreet'].str.contains('boulevard', case=False).astype(int)

# Danner dummy for strand
bolig_df['strand'] = bolig_df['cleanStreet'].str.contains('strand', case=False).astype(int)


In [32]:
bolig_df[bolig_df['strand']==1]

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse,vej,alle,gade,boulevard,strand
11,2056762,55.8595,12.56272,1,0,C,,120000000,False,12.0,527,6267,,1917,Vedbæk,False,True,230,2950,"Vedbæk Strandvej 478, Vedbæklund",227703.0,3,253,2023-12-05T23:05:48.000Z,False,"[{'id': 2056762, 'date': '2024-08-15T13:01:28....",0,60805,95,False,16317,25254,419,0B484D86-F581-4E7E-A899-16A8305E81D3,,,6000000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,2059831.0,479018074,vedbaek-strandvej-478-2950-vedbaek,False,Vedbæk Strandvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
15,2099167,55.84077,12.57324,1,0,e,,34500000,False,7.0,264,2616,,1964,Vedbæk,False,True,230,2950,Vedbæk Strandvej 312,130681.0,3,104,2024-05-02T22:35:07.980Z,False,"[{'id': 2099167, 'date': '2024-08-15T13:01:28....",158797,25532,52,False,14684,26473,428,7BE8DA7E-A746-4C26-B1AC-7194BBC595A5,,,1725000,0,,,,2024-08-14T22:37:09.973Z,0.0,False,2057164.0,92343779,vedbaek-strandvej-312-2950-vedbaek,False,Vedbæk Strandvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
23,2112024,55.7821,12.59278,1,0,A,,100000000,False,8.0,396,1465,,1876,Klampenborg,False,True,173,2930,Taarbæk Strandvej 38,252525.0,2,69,2024-06-06T22:03:51.877Z,False,"[{'id': 2112024, 'date': '2024-08-15T13:01:28....",0,38809,137,False,11384,25254,419,AA7AFDB4-8AFB-42B7-87EE-4116C2A4C572,,,5000000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,2046763.0,597622442,taarbaek-strandvej-38-2930-klampenborg,False,Taarbæk Strandvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
24,2047135,55.73459,12.58205,7,0,-,,250000000,False,23.0,0,2968,,1902,Hellerup,False,True,157,2900,Lille Strandvej 27,0.0,2,287,2023-11-02T00:54:55.120Z,False,"[{'id': 2047135, 'date': '2024-08-15T13:01:28....",0,54219,0,False,11221,742,3,F49EA1E3-ED0C-4A15-8D2A-0FE46215995A,,,12500000,0,,,,2024-08-14T23:51:03.720Z,,False,2009154.0,1708239206,lille-strandvej-27-2900-hellerup,False,Lille Strandvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
31,2002918,56.10717,12.17933,4,0,-,,45000000,False,5.0,295,5538,,1907,Græsted,False,True,270,3230,Udsholt Strandvej 134,152542.0,3,430,2023-06-11T22:04:43.183Z,False,"[{'id': 2002918, 'date': '2024-08-15T13:01:28....",0,23157,77,False,10316,26472,171,83F2B737-E74B-435E-8332-75301B4D97D2,,,2250000,0,,,,2024-08-14T22:31:01.637Z,0.0,False,2309841.0,346924870,udsholt-strandvej-134-3230-graested,False,Udsholt Strandvej,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
33,1948864,55.67722,12.594,1,-13,-,,6950000,False,5.0,135,0,,1964,København K,False,True,101,1401,Strandgade 75Z,51481.0,1,639,2022-11-14T23:03:31.863Z,False,"[{'id': 1948864, 'date': '2024-08-15T13:01:28....",31923,10601,60,False,9458,26413,1109,,,,350000,0,,,,2024-08-14T22:31:01.637Z,,False,,0,strandgade-75z-1401-koebenhavn-k,False,Strandgade,,,False,False,"{'leisureHouses': [], 'houses': []}",0,0,1,0,1
11,1014037,55.22032,10.12071,7,25,-,,325000,False,0.0,0,636,,0,Haarby,False,True,420,5683,Strandgade 42,0.0,7,3648,2014-08-20T01:44:00.940Z,False,"[{'id': 1014037, 'date': '2024-08-15T13:01:30....",1642,439,0,False,6837,17855,6,8CBE090B-D83C-496A-B071-C8CC61A81DDF,,,25000,0,8D762022-428E-4995-A34C-3B27807120D5,,,2024-08-14T23:55:06.050Z,,False,2686691.0,908028231,strandgade-42-5683-haarby,False,Strandgade,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",0,0,1,0,1
13,2078115,55.76706,12.09114,1,-3,C,,8500000,False,5.0,193,1795,,1952,Jyllinge,False,True,265,4040,Strandtoften 26,44041.0,5,165,2024-03-03T11:30:28.853Z,False,"[{'id': 2078115, 'date': '2024-08-15T13:01:30....",38176,4378,80,False,6623,17221,465,E1B5C695-EBA0-4808-A420-CF8F24789BC4,,,425000,0,0A3F50AB-4462-32B8-E044-0003BA298018,,,2024-08-14T22:31:01.637Z,0.0,False,2176135.0,2091932527,strandtoften-26-4040-jyllinge,False,Strandtoften,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",0,0,0,0,1
38,1769910,55.52418,10.41905,6,0,-,,11995000,False,6.0,725,39998,,1789,Otterup,False,True,480,5450,Strandvejen 151 mfl.,16544.0,7,1223,2021-04-10T00:19:19.000Z,False,"[{'id': 1769910, 'date': '2024-08-15T13:01:30....",0,6396,603,False,5198,287,3,8854D65F-5A7C-4DF0-9F73-C607D03D565D,,,600000,0,0A3F50B5-21D2-32B8-E044-0003BA298018,,,2024-08-14T23:54:46.290Z,323.0,False,100016028.0,1866833785,strandvejen-151-5450-otterup,False,Strandvejen,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",1,0,0,0,1
9,1986001,57.65882,10.46655,4,0,-,2024-08-17T12:00:00.000Z,125000,False,2.0,74,72236,,1985,Skagen,False,True,813,9990,"Hotel Skagen Strand 64, Hulsig",1689.0,11,485,2023-04-18T08:31:06.347Z,False,"[{'id': 1986001, 'date': '2024-08-15T13:01:34....",0,1806,0,False,3803,454,7,255589F7-8322-4CCB-8048-68128F596C95,,,25000,0,0A3F50C9-2EC0-32B8-E044-0003BA298018,,,2024-08-14T23:53:13.873Z,,False,5531548.0,1645108031,hotel-skagen-strand-64-9990-skagen,False,Hotel Skagen Strand,,,False,False,"{'leisureHouses': [], 'houses': [[{'estateUrl'...",0,0,0,0,1


In [21]:
json_list[0]['results']

[{'id': 1844515,
  'latitude': 55.67386,
  'longitude': 12.57692,
  'propertyType': 1,
  'priceChangePercentTotal': 0,
  'energyClass': 'd',
  'openHouse': '',
  'price': 100000000,
  'selfsale': False,
  'rooms': 12.0,
  'size': 383,
  'lotSize': 213,
  'floor': None,
  'buildYear': 1757,
  'city': 'København K',
  'isForeclosure': False,
  'isActive': True,
  'municipality': 101,
  'zipCode': 1472,
  'street': 'Ny Kongensgade 3',
  'squaremeterPrice': 261096.0,
  'area': 1,
  'daysForSale': 1007,
  'createdDate': '2021-11-11T23:02:32.000Z',
  'isPremiumAgent': False,
  'images': [{'id': 1844515,
    'date': '2024-08-15T13:01:28.000Z',
    'url': 'https://i.boliga.org/dk/550x/1844/1844515.jpg'}],
  'net': 0,
  'exp': 13972,
  'basementSize': 105,
  'inWatchlist': False,
  'views': 94036,
  'agentRegId': 26472,
  'domainId': 171,
  'guid': '628079F8-F05F-4D4D-A5B4-1AF913E20B7C',
  'agentDisplayName': '',
  'groupKey': None,
  'downPayment': 5000000,
  'itemType': 0,
  'dawaId': None,
 