In [3]:
import requests
req = requests.get("https://kamernet.nl/huren/kamer-utrecht")
html = req.text

# part two

file = open("rooms-utrecht.html", "w")
file.write(html)
file.close()




def get_rooms(html):
    soup = BeautifulSoup(html, "lxml") # The 'lxml' argument is called the parser, you can try 'html5lib' here as well
    rooms = [] # Our room data is going here
    
    
# CSS selector for all elements with the 'rowSearchResultRoom' class
    rooms_list = soup.select(".rowSearchResultRoom")
    
    print(f"Found {len(rooms_list)} rooms")
    
 # Loop over our rooms
    for room in rooms_list:
        # 'room' is a new BeautifulSoup element that also accepts the select() method
        title = room.select(".tile-title") # Kamernet has decent class names indicating different data
        
        # select() *always* returns a list, even if there's just one element! 
        # So we need to get the first element in the list
        title = title[0]
        
        # Now we can use the get_text() method to get the text in the element
        title = title.get_text()
        
        # Note that all these methods can be chained, so this is the same (and a lot shorter!)
        title = room.select(".tile-title")[0].get_text()
        
        # And add it to the rooms list
        rooms.append(title)
        
    return rooms
    
# And here we're calling the function with our saved webpage
file = open("rooms-utrecht.html")
html = file.read()
file.close()

rooms = get_rooms(html)

 #Let's use a pandas dataframe for easy viewing
import pandas as pd
pd.DataFrame(rooms)

Found 18 rooms


Unnamed: 0,0
0,Steenstraat
1,Johannes Beenlaan
2,Hildebranddreef
3,Amerikalaan
4,Jazzsingel
5,Cornelis Mertenssstraat
6,Balderikstraat
7,Hoogstraat
8,Vleutenseweg
9,Vleutenseweg


In [2]:
# Okay, now that we know the basics, let's try getting some more information
# Something that isn't in the original data is the price per square meter,
# we can calculate that if we divide the rent with the surface of the room
def get_rooms(html):
    soup = BeautifulSoup(text, "lxml")
    rooms = []

    for room in soup.select(".rowSearchResultRoom"): # Note that we're directly using soup.select() here
        # We need to convert the rent and surface from strings to integer, so make variables
        # with the strings first
        rent_str = room.select(".tile-rent")[0].get_text()
        surface_str = room.select(".tile-surface")[0].get_text()

        # When you look at the rent strings you see the price is always 3 digits and starting from the 
        # third character. This method will break when prices are lower than €100 or higher than €999,
        # but we're taking that risk
        rent = int(rent_str[2:6])   
        
        # Same method for surface, and same problem here: if surface is lower than 10 square meters
        # or higher than 99 square meters this will fail
        surface = int(surface_str[0:2])
        
        # We can finally calculate the price per square meter, 
        # note that we're using the inbuild round() function here, to get a proper number
        rent_per_sqm = round(rent / surface)

        # Let's also add a bool indicating if you need to pay extra for electricity, water and gas
        has_gwl = "incl. G/W/E" in rent_str
        
        # Let's also get the thumbnail, note that the `src` attribute contains the image,
        # so we need to use get() instead of get_text() to get that value
        image = room.select(".tile-img img")[0].get("src")

        rooms.append({
            "available" : room.select(".tile-availability .left")[0].get_text(), # We're doing a nested selector here
            "furnished" : room.select(".tile-furnished")[0].get_text(),
            "has_gwl" : has_gwl,
            "image" : image,
            "rent" : rent,
            "rent_per_sqm" : f"€{rent_per_sqm}", # Use an f-string here to get the Euro character
            "rent_str" : rent_str,
            "surface" : surface,
            "surface_str" : surface_str,
            "title" : room.select(".tile-title")[0].get_text(),
        })

    return rooms

# We're using the with() statement here, which automatically closes the
# file you're opening and is a bit shorter than the usual four lines this takes
with open("rooms-utrecht.html") as f:
    rooms = get_rooms(f.read())

# Make a dataframe and sort by rent_per_sqm
df = pd.DataFrame(rooms)

# describe() gives some nice statistics here, such as as the average rent and surface
print(df.describe())

# Use pandas' to_csv() method to save to a CSV file
df.to_csv("rooms-utrecht.csv")

# And finally, show the table sorted by rent per square meter
df.sort_values('rent_per_sqm')

NameError: name 'text' is not defined