# Import libraries and get the CSV with streets

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import math

# Intro

I'm using the info from the CSV created in "1. Sthlm - Get info_sold_sthlm".

I'm scraping info from 1273 streets in Stockholms kommun.

This gives me the raw data for the analysis (50k+ sold apartments in Sthlm)

In [7]:
# Read the CSV listing the streets to collect sold-apartment info for
df = pd.read_csv("id_sold_sthlm.csv")

In [8]:
# Turn the DataFrame into a plain list of rows (one inner list per street)
# so the scraping loop can iterate over it
lst_gator = df.to_numpy().tolist()

In [6]:
# Number of streets to scrape (1273 when this notebook was run).
# The number of apartments sold varies per street:
#   min = 0 for a street
#   max ~ 1000 for a street
len(lst_gator)

1273

# Scrape info from Hemnet - Runtime 30-60 mins

In [None]:
# Scrape every sold apartment listed on Hemnet for each street in lst_gator.
# Every listing appends exactly one value to each of the twelve lists below
# (NaN, or 0 for the price change, when the value is missing), so the lists
# stay aligned and can later be zipped into rows.

# Seconds to wait for a response before giving up on a request.
# Without a timeout, requests.get can block forever and stall the whole run.
REQUEST_TIMEOUT = 30

# Initialize list to store information from each sold apartment
adress_lst = []
omrade_lst = []
kvm_lst = []
rum_lst = []
maklare_lst = []
avgift_lst = []
slutpris_lst = []
datum_lst = []
change_lst = []
gata_id_lst = []
gata_lst = []
stockholm_lst = []

# Loop through the lst of 1273 streets
# NOTE(review): each row is used as [sold-object count, location text, location id]
# -- confirm against the columns of id_sold_sthlm.csv
for i in lst_gator:
    # Each street has a varying number of apartments sold.
    # Hemnet displays 50 sold objects per page, so page_num is the number of
    # result pages to request for this street. This works because the street id
    # and the page number are the only variables in the URL.
    page_num = math.ceil(int(i[0]) / 50)
    for j in range(1, page_num + 1):
        url = f"https://www.hemnet.se/salda/bostader?item_types%5B%5D=bostadsratt&location_ids%5B%5D={i[2]}&page={j}&sold_age=all"
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        html = response.content
        soup = BeautifulSoup(html, "lxml")

        # each container has all the desired info about one sold object
        container = soup.select("li.sold-results__normal-hit")

        for x in container:

            # 1: I'm getting info from lst_gator (iterator = i)

            ## Get street ID
            gata_id_lst.append(i[2])

            ## Get street name: the text between "- " and the following ","
            gata_re = re.findall(r"[-] +[A-Za-zåäöÅÄÖ\s]+[,]", i[1])
            if gata_re:
                gata_lst.append(gata_re[0].strip(",-").strip())
            else:
                gata_lst.append(np.nan)

            ## Get municipality (should always be "Stockholms kommun")
            if "Stockholms kommun" in i[1]:
                stockholm_lst.append("Stockholms kommun")
            else:
                stockholm_lst.append(np.nan)

            # 2: I'm getting info from the container (iterator = x)

            ## Get adress
            adress = [tag.text for tag in x.select("span.item-result-meta-attribute-is-bold")]
            if adress:
                adress_lst.append(adress[0])
            else:
                # append to the result list (adress_lst), not to the local match list
                adress_lst.append(np.nan)

            ## Get area (neighbourhood): the second "item-link" span
            omr = [tag.text for tag in x.select("span.item-link")]
            if len(omr) >= 2:
                omrade_lst.append(omr[1].strip())
            else:
                # also covers a listing without any "item-link" span
                omrade_lst.append(np.nan)

            ## The left subheading holds both sqm and no. of rooms; select it once
            subheading = [tag.text for tag in x.select("div.sold-property-listing__subheading.sold-property-listing--left")]

            ## Get sqm
            kvm = [re.findall(r"(([\d+,]+[\d+])\xa0m)", text) for text in subheading]
            # guard against both a missing subheading and a subheading without a match
            if kvm and kvm[0]:
                kvm_lst.append(kvm[0][0][1])
            else:
                kvm_lst.append(np.nan)

            ## Get no. of rooms
            rum = [re.findall(r"(([ \d+,]+[\d+])\xa0rum)", text) for text in subheading]
            if rum and rum[0]:
                rum_lst.append(rum[0][0][1].strip())
            else:
                rum_lst.append(np.nan)

            ## Get broker (firm)
            maklare = [tag.text.strip() for tag in x.select("div.sold-property-listing__broker")]
            if maklare:
                maklare_lst.append(maklare[0])
            else:
                maklare_lst.append(np.nan)

            ## Get monthly fee (all digit groups joined into one string)
            avgift = [tag.text.strip() for tag in x.select("div.sold-property-listing__fee")]
            if len(avgift) == 1:
                avgift_lst.append("".join(re.findall("[0-9]+", avgift[0])))
            else:
                avgift_lst.append(np.nan)

            ## Get final price (all digit groups joined into one string)
            slutpris = [re.findall("[0-9]+", tag.text) for tag in x.select("span.sold-property-listing__subheading")]
            if slutpris:
                slutpris_lst.append("".join(slutpris[0]))
            else:
                slutpris_lst.append(np.nan)

            ## Get date_sold
            datum = [tag.text.strip() for tag in x.select("div.sold-property-listing__sold-date")]
            if datum:
                datum_lst.append(datum[0])
            else:
                datum_lst.append(np.nan)

            ## Get % change in price; a missing element or an element
            ## without a number is recorded as 0
            change = [re.findall("[0-9+-]+", tag.text) for tag in x.select("div.sold-property-listing__price-change")]
            if change and change[0]:
                change_lst.append(change[0][0])
            else:
                change_lst.append(0)
# Print the length of every result list, one per line, to verify that
# they all have equal length
for result_lst in (adress_lst, omrade_lst, kvm_lst, rum_lst, maklare_lst,
                   avgift_lst, slutpris_lst, datum_lst, change_lst,
                   gata_id_lst, gata_lst, stockholm_lst):
    print(len(result_lst))

# Zip lists, create df and write to csv

In [7]:
# Combine the twelve result lists row-wise: one tuple per sold apartment.
# The order here must match the column order used when the DataFrame is built.
_result_lists = (
    adress_lst,
    omrade_lst,
    kvm_lst,
    rum_lst,
    maklare_lst,
    avgift_lst,
    slutpris_lst,
    datum_lst,
    change_lst,
    gata_id_lst,
    gata_lst,
    stockholm_lst,
)
comp_lst = list(zip(*_result_lists))

In [8]:
# Build the DataFrame of sold apartments: one row per tuple in comp_lst,
# with the columns named in the same order the lists were zipped
_column_names = [
    "adress",
    "omrade",
    "kvm",
    "rum",
    "maklare",
    "avgift",
    "slutpris",
    "datum",
    "prisförändring",
    "gata_id_lst",
    "gata_lst",
    "stockholm_lst",
]
df_sold = pd.DataFrame(comp_lst, columns=_column_names)

In [16]:
# Write the raw scraped data to csv; index = False leaves the DataFrame index out of the file
df_sold.to_csv("sthlm_raw.csv", index = False)