# Testing out webscraping on pararius.nl

## Background

### Observations
- https://www.pararius.nl/huurwoningen/[STAD] is the overview per city
- changes in https://www.pararius.nl/appartement-te-huur/[STAD]/[CODDE]/[STRAAT] when going to a listing

### Desired features

See: 
- price
- street,
- (zipcode),
- neighborhood
- agent,
- number of rooms,
- number of bedrooms,
- suitable for sharing (based on AI),
- date added,
- surface area

Other:
- sortable

## Scraping

### Set up

In [90]:
# Import packages
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs 
import openai
import config
import openpyxl

# Methods
def get_number(string):
    allowed = [str(i) for i in range(10)]
    str_numbers = list(filter(lambda x: x in allowed, [*string]))
    joined = ''.join(str_numbers)
    value = int(joined)
    return value

def truncate_middle(s, length, ellipsis="..."):
    """
    Truncate a string to a specified length, adding ellipses in the middle if necessary.

    Args:
        s (str): The input string.
        length (int): The maximum length of the resulting string (including ellipses).
        ellipsis (str): The ellipsis string to use (default is "...").

    Returns:
        str: The truncated string.
    """
    if len(s) <= length:
        return s  # No truncation needed

    # Calculate the length of the prefix and suffix (including ellipses)
    prefix_length = (length - len(ellipsis)) // 2
    suffix_length = length - prefix_length - len(ellipsis)

    # Construct the truncated string with ellipses in the middle
    truncated = s[:prefix_length] + ellipsis + s[-suffix_length:]

    return truncated

def is_listing_suitable(description):
    # Define the prompt for GPT-3
    prompt = f"Is the following real estate listing suitable for sharing with others?\n{description}\n"

    # Use GPT-3 to generate a response
    response = openai.Completion.create(
        engine="text-davinci-002",  # Choose the appropriate engine
        prompt=prompt,
        max_tokens=1,  # Adjust the number of tokens as needed
        n=1,  # Number of completions to generate
        stop=None,  # Optional: Stop tokens to limit the response
    )

    # Extract the generated text from the response
    generated_text = response.choices[0].text.strip()

    # Check if the response indicates suitability
    return "Yes" in generated_text or "Suitable" in generated_text

## Experimenting

### Extract info from listing

In [42]:
# Listing Page

url ='https://www.pararius.nl/appartement-te-huur/rotterdam/872bf769/groenendaal'
req = requests.get(url)
print(req)

soup = bs(req.text, "html.parser")

# Monthly price
price_html = soup.find("div", {"class": "listing-detail-summary__price"})
print(price_html.text.split())
price = get_number(price_html.text)
print(price)

# Surface area
area_html = soup.find("li", {"class": "illustrated-features__item illustrated-features__item--surface-area"})
area = get_number(area_html.text)
print(area)

# Number of rooms
nrooms_html = soup.find("li", {"class": "illustrated-features__item illustrated-features__item--number-of-rooms"})
nrooms = get_number(nrooms_html.text)
print(nrooms)

# Number of bedrooms
nbedrooms_html = soup.find("dd", {"class": "listing-features__description listing-features__description--number_of_bedrooms"})
nbedrooms = get_number(nbedrooms_html.text)
print(nbedrooms)

# Number of bathrooms
try:
    nbathrooms_html = soup.find("dd", {"class": "listing-features__description listing-features__description--number_of_bathrooms"})
    nbathrooms = get_number(nbathrooms_html.text)
    print(nbathrooms)
except:
    print("NA")    

# Furnished
furnished_html = soup.find("li", {"class": "illustrated-features__item illustrated-features__item--interior"})
furnished = furnished_html.text
print(furnished)

# Very easy to add additonal 'data' from the listing
# Neighbourhood
# Zipcode
location_html = soup.find("div", {"class": "listing-detail-summary__location"})
location_split = location_html.text.split()
zipcode = location_split[0] + location_split[1]
neighborhood = location_split[2]
print(zipcode)
print(neighborhood)

# Street
street_htmls = soup.find_all("a", {"class": "breadcrumbs__link"})
street = street_htmls[-1].text
print(street)

# Offered since
since_html = soup.find("dd", {"class": "listing-features__description listing-features__description--offered_since"})
since_down_html = since_html.find("span", {"class": "listing-features__main-description"})
since = since_down_html.text
print(since)

# Agent
agent_html = soup.find("a", {"class": "agent-summary__title-link"})
agent = agent_html.text
print(agent)

# Status
status_html = soup.find("dd", {"class": "listing-features__description listing-features__description--status"})
status = status_html.text
print(status)

# Description
description_html = soup.find("div", {"class": "listing-detail-description__content"})
description = description_html.text
print(truncate_middle(description,100))

# ChatGPT integration (For later if ever)
# openai.api_key = config.openai_key
# suitable = is_listing_suitable(description)
# print(suitable)

<Response [200]>
['€', '1.400', 'per', 'maand']
1400
65
3
2
NA


AttributeError: 'NoneType' object has no attribute 'text'

### Get all listing of a city

In [11]:
# Overview page
url = 'https://www.pararius.nl/huurwoningen/rotterdam'

req = requests.get(url)
print(req)

soup = bs(req.text, "html.parser")

# Maximum number of pages
numpages_html = soup.find_all("li", {"class": "pagination__item"})
numpages = get_number(numpages_html[-2].text)
print(numpages)

# Pages
pagelinks = ["https://www.pararius.nl/huurwoningen/rotterdam/page-" + str(i) for i in range(2,numpages+1)]
print(pagelinks) # Probably neater to do this with a for-loop (btw: page-1 redirects to the first page so not necessary to start at 2)

# Listing links on page
listings_html = soup.find_all("a", {"class": "listing-search-item__link listing-search-item__link--title"})
listing_links = ["https://www.pararius.nl/" + link.get("href") for link in listings_html]
print(listing_links)

# Obtain all links
listing_links = []
for i in range(1, numpages+1):
    
    # Obtain link and soup of page
    pagelink = "https://www.pararius.nl/huurwoningen/rotterdam/page-" + str(i)
    req = requests.get(pagelink)
    s = bs(req.text, "html.parser")

    # Obtain links of listings on page
    listings_on_page_html = s.find_all("a", {"class": "listing-search-item__link listing-search-item__link--title"})
    listing_on_page_links = ["https://www.pararius.nl" + link.get("href") for link in listings_on_page_html]

    listing_links += listing_on_page_links

print(listing_links)



<Response [200]>
16
['https://www.pararius.nl/huurwoningen/rotterdam/page-2', 'https://www.pararius.nl/huurwoningen/rotterdam/page-3', 'https://www.pararius.nl/huurwoningen/rotterdam/page-4', 'https://www.pararius.nl/huurwoningen/rotterdam/page-5', 'https://www.pararius.nl/huurwoningen/rotterdam/page-6', 'https://www.pararius.nl/huurwoningen/rotterdam/page-7', 'https://www.pararius.nl/huurwoningen/rotterdam/page-8', 'https://www.pararius.nl/huurwoningen/rotterdam/page-9', 'https://www.pararius.nl/huurwoningen/rotterdam/page-10', 'https://www.pararius.nl/huurwoningen/rotterdam/page-11', 'https://www.pararius.nl/huurwoningen/rotterdam/page-12', 'https://www.pararius.nl/huurwoningen/rotterdam/page-13', 'https://www.pararius.nl/huurwoningen/rotterdam/page-14', 'https://www.pararius.nl/huurwoningen/rotterdam/page-15', 'https://www.pararius.nl/huurwoningen/rotterdam/page-16']
['https://www.pararius.nl//appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade', 'https://www.pararius.nl//appar

## Building blocks of code

### Get all listings of a city

In [31]:


# First get all the URLs of listings of the city
city = "Rotterdam"

def get_all_listing_urls(city):
    listing_urls = []
    for i in range(1, numpages+1):
    
        # Obtain link and soup of page
        page_url = f"https://www.pararius.nl/huurwoningen/{city.lower()}/page-{str(i)}"

        req = requests.get(page_url)
        s = bs(req.text, "html.parser")

        # Obtain links of listings on page
        listings_on_page_html = s.find_all("a", {"class": "listing-search-item__link listing-search-item__link--title"})
        listing_on_page_urls = ["https://www.pararius.nl" + link.get("href") for link in listings_on_page_html]

        # Add listings of page to list
        listing_urls += listing_on_page_urls

    return listing_urls

urls = get_all_listing_urls(city)
print(urls)


['https://www.pararius.nl/appartement-te-huur/rotterdam/ec707cec/maashaven-n-z', 'https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade', 'https://www.pararius.nl/appartement-te-huur/rotterdam/6d345a10/westersingel', 'https://www.pararius.nl/appartement-te-huur/rotterdam/24b9c531/schieweg', 'https://www.pararius.nl/appartement-te-huur/rotterdam/83dcba24/bloklandstraat', 'https://www.pararius.nl/appartement-te-huur/rotterdam/db0df498/mijnsherenlaan', 'https://www.pararius.nl/appartement-te-huur/rotterdam/e4f2f218/vondelweg', 'https://www.pararius.nl/appartement-te-huur/rotterdam/e1023cb6/middellandplein', 'https://www.pararius.nl/appartement-te-huur/rotterdam/cfec556e/middellandplein', 'https://www.pararius.nl/appartement-te-huur/rotterdam/ad203df1/middellandplein', 'https://www.pararius.nl/appartement-te-huur/rotterdam/19877e7a/blaak', 'https://www.pararius.nl/huis-te-huur/rotterdam/98df0641/heemraadsplein', 'https://www.pararius.nl/appartement-te-huur/rotter

### Extract info from all URLs

In [105]:
# Extract the information our of all URLs
simple_extractions = {"price": ["div", {"class": "listing-detail-summary__price"}],
                      "area": ["li", {"class": "illustrated-features__item illustrated-features__item--surface-area"}],
                      "nrooms": ["li", {"class": "illustrated-features__item illustrated-features__item--number-of-rooms"}],
                      "nbedrooms": ["dd", {"class": "listing-features__description listing-features__description--number_of_bedrooms"}],
                      "nbathrooms": ["dd", {"class": "listing-features__description listing-features__description--number_of_bathrooms"}],
                      "furnished": ["li", {"class": "illustrated-features__item illustrated-features__item--interior"}],
                      "agent": ["a", {"class": "agent-summary__title-link"}],
                      "status": ["dd", {"class": "listing-features__description listing-features__description--status"}],
                      "description": ["div", {"class": "listing-detail-description__content"}],
                      "numbers": ["price", "area", "nrooms", "nbedrooms", "nbathrooms"]
                    }

def get_scraped_dict(urls, simple_extractions):

    print('''You can expect error messages: not all listings have all information,
          especially regarding the furnishing and number of bathrooms.
          ''')

    scraped_dict = {}

    for url in urls:

        listing_dict = {}

        try:
            # URL
            listing_dict["url"] = url

            # Code
            code = url.split('/')[-2]
            listing_dict["code"] = code

            # Obtain html
            req = requests.get(url)
            soup = bs(req.text, "html.parser")

            # Extract all simple elements
            for key, value in simple_extractions.items():

                if key == "numbers": continue

                try:
                    item_html = soup.find(value[0], value[1])
                    if key in simple_extractions["numbers"]:
                        item = get_number(item_html.text)
                    else:
                        item = item_html.text
                    listing_dict[key] = item
                except:
                    listing_dict[key] = "NA"
                    print(f"Something went wrong with listing {url} when extracting {key}")

            # Price per bedroom
            price_per_bedroom = round(listing_dict["price"] / listing_dict["nbedrooms"])
            listing_dict["price_per_bedroom"] = price_per_bedroom

            # Neighbourhood
            # Zipcode
            location_html = soup.find("div", {"class": "listing-detail-summary__location"})
            location_split = location_html.text.split()
            zipcode = location_split[0] + location_split[1]
            neighborhood = location_split[2:]
            listing_dict["zipcode"] = zipcode
            listing_dict["neighborhood"] = neighborhood

            # Street
            street_htmls = soup.find_all("a", {"class": "breadcrumbs__link"})
            street = street_htmls[-1].text
            listing_dict["street"] = street

            # Offered since
            since_html = soup.find("dd", {"class": "listing-features__description listing-features__description--offered_since"})
            since_down_html = since_html.find("span", {"class": "listing-features__main-description"})
            since = since_down_html.text
            listing_dict["since"] = since

            # Added to parent dictionary
            scraped_dict[code]= listing_dict
            
        except Exception as e:
            print(f"something went wrong with {url}")
            print(e)

    return scraped_dict

scraped_dict = get_scraped_dict(urls, simple_extractions)

You can expect error messages: not all listings have all information,
          especially regarding the furnishing and number of bathrooms.
          
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting price
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting nbedrooms
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting nbathrooms
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting agent
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting status
Something went wrong with listing https://www.pararius.nl/appartement-te-huur/rotterdam/b6c9f139/prins-hendrikkade when extracting description
something 

KeyboardInterrupt: 

### Put info in excel file

In [106]:
# Dictionary that tells how the variables are displayed as column names
var_display_dict = {"price": "Price", 
                 "area": "Surface Area", 
                 "nrooms": "#Rooms", 
                 "nbedrooms":"#Bedrooms", 
                 "nbathrooms": "#Bathrooms", 
                 "furnished":"Furnished", 
                 "agent": "Agent", 
                 "status": "Status",
                 "zipcode": "Zipcode",
                 "neighborhood": "Neighborhood",
                 "street": "Street",
                 "since": "Available since",
                 "url": "Link",
                 "price_per_bedroom": "Price per bedroom"}

wb = openpyxl.load_workbook("test.xlsx")

ws = wb.active
ws.title = "Rotterdam All"

# Order of the columns
order = list(var_display_dict.keys()) # names of variables in the program
order = ['price', 'nbedrooms', 'nrooms', 'area','neighborhood', 'street', 'zipcode', 'since', 'agent', 'url']

# Give the columns names
for i in range(len(order)):
    var = order[i]
    cell = ws.cell(row=1, column = i+1, value = var_display_dict[var])

# Enter info of listings

row_number = 2
# Loop over all listings
for _, dict in scraped_dict.items():

    # Loop over all info
    for i in range(len(order)):
        var = order[i]
        cell = ws.cell(row=row_number, column = i+1, value = dict[var])

    row_number += 1

wb.save('test.xlsx')

### Put filtered listing in excel file

In [None]:
ws.create_sheet("Rotterdam Filtered")

row_number = 2
# Loop over all listings
for code, dict in scraped_dict.items():

    # Check if listing get through filters
    # Price per bedroom
    # Minimum number of bedrooms
    # Neighborhood

    
    # Loop over all info and add to sheet

    row_number += 1

In [97]:
order

[dict_keys(['price', 'area', 'nroom', 'nbedrooms', 'nbathrooms', 'furnished', 'agent', 'status', 'zipcode', 'neighborhood', 'street', 'since', 'url'])]

In [79]:
len(scraped_dict)

436