# Data Scraping for Housing Price Predictions

This notebook is the first part of a project focused on predicting house prices in Amsterdam. The goal is to retrieve current house-sale listings from a website, including information about price, location, size, etc.

The data gathered in this notebook is for personal use only and is not distributed. When web scraping, please respect the wishes of the source company and scrape in a respectful way (for example, by pausing between requests).

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
import time

## Web Scraping

In [2]:
# Define url to be retrieved
# Placeholder for the search-results URL; the scraping loop appends
# 'page-<n>' to this string to request each results page.
retrieve_url = 'URL'

In [3]:
# Empty frame that collects one row per listing found in the search results.
houses = pd.DataFrame(
    columns=[
        'price',
        'post_code', 'neighborhood',
        'living_area', 'rooms', 'year',
        'url',
    ]
)

In [4]:
# Scraping search results
# Each results page is parsed into parallel lists (one entry per listing).
# A page is only kept when all lists have the same length; otherwise the
# fields cannot be matched up per listing and the page is reported as skipped.
page_frames = []
skipped_pages = []

for i in range(1, 70):
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(retrieve_url+'page-'+str(i), timeout=30).content
    except requests.RequestException as err:
        # A single failed page should not abort the remaining pages.
        skipped_pages.append((i, 'request failed: ' + str(err)))
        time.sleep(3)
        continue
    soup = BeautifulSoup(html, "lxml")

    locations = [loc.text.replace('\n', '').replace('nieuw', '').strip() for loc in soup.find_all('div', class_='listing-search-item__location')]
    post_code = [code[:7] for code in locations]
    # Locations without a "(neighborhood)" part get 'N/A' instead of raising
    # an IndexError that would stop the scrape.
    neighborhood = [el.split('(')[1].replace(')', '') if '(' in el else 'N/A' for el in locations]

    prices = [re.sub("[^0-9]", "", price.text) for price in soup.find_all('span', class_='listing-search-item__price')]

    # Retrieve icon descriptions
    descriptions = [area.text.strip() for area in soup.find_all('div', class_='illustrated-features__item')]

    living_area = []
    rooms = []
    year = []

    # Sort each description into its field, keeping only the digits.
    for d in descriptions:
        if 'woonop' in d:
            living_area.append(re.sub("[^0-9]", "", d))
        elif 'kamers' in d:
            rooms.append(re.sub("[^0-9]", "", d))
        elif 'bouwjaar' in d:
            year.append(re.sub("[^0-9]", "", d))

    # Built once per page, outside the description loop, so it is always
    # defined for the current page even when the page has no descriptions.
    url = [link.get('href') for link in soup.find_all('a', class_='listing-search-item__link listing-search-item__link--title')]

    page_columns = {'price': prices, 'post_code': post_code, 'neighborhood': neighborhood, 'living_area': living_area, 'rooms': rooms, 'year': year, 'url': url}
    if len({len(values) for values in page_columns.values()}) == 1:
        page_frames.append(pd.DataFrame(page_columns))
    else:
        counts = {name: len(values) for name, values in page_columns.items()}
        skipped_pages.append((i, 'field counts differ: ' + str(counts)))

    # Set sleep to avoid too much traffic at once
    time.sleep(3)

# Add all pages in one step (DataFrame.append no longer exists in pandas 2.x).
houses = pd.concat([houses] + page_frames, ignore_index=True)

if skipped_pages:
    print('Skipped pages:', skipped_pages)

In [5]:
# Number of listings collected so far
houses.shape[0]

1980

In [6]:
# Preview the first five scraped listings
houses.head(5)

Unnamed: 0,price,post_code,neighborhood,living_area,rooms,year,url
0,395000,1082 CH,Buitenveldert Midden Zuid,82,4,1961,/appartement-te-koop/amsterdam/1b33ec7a/rietnesse
1,475000,1061 BM,Kolenkitbuurt Zuid,105,3,2009,/appartement-te-koop/amsterdam/0f0cb3f8/blauwv...
2,325000,1055 HR,Bosleeuw,61,3,1942,/appartement-te-koop/amsterdam/c2d2d29c/granid...
3,315000,1092 VB,Transvaalbuurt Oost,49,2,1923,/appartement-te-koop/amsterdam/e6abaf97/tugelaweg
4,425000,1094 EX,Oostpoort,90,4,1986,/appartement-te-koop/amsterdam/a573c2f3/celebe...


Now that we have the base data, let's retrieve some additional information from each house

In [7]:
# Empty frame that collects one row of detail-page features per listing URL.
house_info = pd.DataFrame(
    columns=[
        'offered_since', 'status', 'monthly_cost_vve',
        'type', 'bedrooms', 'bathrooms',
        'isolation', 'energy_label', 'heating',
        'parking', 'garage',
        'balcony', 'garden', 'storage',
        'url',
    ]
)

In [9]:
base_url = 'URL'

# Common class prefix of the feature-description elements on a detail page.
base_class = 'listing-features__description listing-features__description--'


def _feature_text(soup, feature, default='N/A'):
    """Return the text of the detail-page element for `feature`.

    Looks up the element whose class is `base_class + feature` and returns
    its text, or `default` when the page has no such element.
    """
    node = soup.find(class_=base_class+feature)
    return default if node is None else node.text


detail_records = []
failed_urls = []

for url in houses['url']:
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(base_url+url, timeout=30).content
    except requests.RequestException:
        # Skip this listing but keep going; it is reported after the loop.
        failed_urls.append(url)
        time.sleep(3)
        continue
    soup = BeautifulSoup(html, "lxml")

    # The energy label is encoded in the class name, so try each label and
    # keep the text of the first element that exists.
    energy_label = 'N/A'
    for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g']:
        label_node = soup.find(class_=base_class+'energy-label-'+label)
        if label_node is not None:
            energy_label = label_node.text
            break

    detail_records.append({
        'offered_since': _feature_text(soup, 'offered_since'),
        'status': _feature_text(soup, 'status'),
        # A missing monthly contribution is recorded as 0 rather than 'N/A'.
        'monthly_cost_vve': _feature_text(soup, 'monthly_contribution', default=0),
        'type': _feature_text(soup, 'dwelling_type'),
        'bedrooms': _feature_text(soup, 'number_of_bedrooms'),
        'bathrooms': _feature_text(soup, 'number_of_bathrooms'),
        'isolation': _feature_text(soup, 'insulations'),
        'energy_label': energy_label,
        'heating': _feature_text(soup, 'heatings'),
        'parking': _feature_text(soup, 'parking'),
        'garage': _feature_text(soup, 'available'),
        'balcony': _feature_text(soup, 'balcony'),
        'garden': _feature_text(soup, 'garden'),
        'storage': _feature_text(soup, 'storage'),
        'url': url,
    })

    # Pause between requests to avoid too much traffic at once
    time.sleep(3)

# Build the frame once from all records (DataFrame.append no longer exists in
# pandas 2.x, and appending row by row is quadratic).
house_info = pd.concat(
    [house_info, pd.DataFrame(detail_records, columns=house_info.columns)],
    ignore_index=True,
)

if failed_urls:
    print('Could not retrieve:', failed_urls)

house_info.shape

In [10]:
# Preview the first five rows of detail-page features
house_info.head(5)

Unnamed: 0,offered_since,status,monthly_cost_vve,type,bedrooms,bathrooms,isolation,energy_label,heating,parking,garage,balcony,garden,storage,url
0,08-07-2020,Beschikbaar,0,Appartement,3,1.0,Dubbele beglazing,C,CV-ketel,Betaald,,Niet aanwezig,,Aanwezig,/appartement-te-koop/amsterdam/1b33ec7a/rietnesse
1,08-07-2020,Beschikbaar,Ja (€ 175 per maand),Appartement,2,1.0,"Muurisolatie, Vloerisolatie, Dubbele beglazing",A,Stadsverwarming,,Ja,Aanwezig,Niet aanwezig,Aanwezig,/appartement-te-koop/amsterdam/0f0cb3f8/blauwv...
2,08-07-2020,Beschikbaar,0,Appartement,2,,,,,,Nee,Aanwezig,Niet aanwezig,,/appartement-te-koop/amsterdam/c2d2d29c/granid...
3,08-07-2020,Beschikbaar,0,Appartement,1,,,,,,Nee,Aanwezig,Niet aanwezig,,/appartement-te-koop/amsterdam/e6abaf97/tugelaweg
4,08-07-2020,Beschikbaar,0,Appartement,3,,,,,,Nee,Niet aanwezig,Niet aanwezig,,/appartement-te-koop/amsterdam/a573c2f3/celebe...


Merge two datasets together and save for later use

In [11]:
# Attach the detail-page features to each listing by URL.
# house_info is reduced to one row per URL first: if the same listing was
# scraped more than once, a plain merge would multiply the matching rows
# in `houses`.
houses = pd.merge(
    houses,
    house_info.drop_duplicates(subset='url'),
    how='left',
    on='url',
)

## Retrieve coordinates for each of the houses

In [12]:
# Geolocations

from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut
import geopy.geocoders
from geopy.geocoders import Nominatim
import geopy


In [13]:
# NOTE(review): Nominatim's usage policy asks for a user agent that identifies
# the application; consider replacing this browser-style string.
locator = Nominatim(user_agent = 'Google chrome Version 83.0.4103.116 (Official Build) (64-bit)')

# One limiter for the whole run: it has to persist between calls to enforce
# the minimum delay, so it must not be rebuilt inside the loop.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

locations = []
# Each distinct post code is looked up only once.
coords_by_post_code = {}

for post_code in houses['post_code']:
    if post_code not in coords_by_post_code:
        try:
            loc = geocode(post_code)
        except geopy.exc.GeopyError:
            loc = None
        # geocode returns None when nothing is found; ['-', '-'] marks a
        # post code without coordinates.
        if loc is None:
            coords_by_post_code[post_code] = ['-', '-']
        else:
            coords_by_post_code[post_code] = [loc.latitude, loc.longitude]
    # Append a copy so rows with the same post code do not share one list.
    locations.append(list(coords_by_post_code[post_code]))

In [None]:
# Store the geocoding results as a new 'loc' column: one [latitude, longitude]
# pair per row, or ['-', '-'] where no coordinates were obtained.
houses['loc'] = locations

In [14]:
# Save to a new dataset 
# index=False leaves the row index out of the CSV file.
houses.to_csv('new_amsterdam_houses.csv', index=False)

In [15]:
# NOTE(review): repeats the 'loc' assignment from an earlier cell — this cell
# looks like a leftover duplicate and is a candidate for removal.
houses['loc'] = locations

In [16]:
# NOTE(review): repeats the earlier save and overwrites
# new_amsterdam_houses.csv — candidate for removal as a duplicate cell.
houses.to_csv('new_amsterdam_houses.csv', index=False)