## Importing libraries

In [1]:
import math
import time
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

## Function for scraping listing data

In [2]:
def get_listing_summary(result_set, county):
    """Build a DataFrame with one row per listing element in `result_set`.

    Parameters
    ----------
    result_set : iterable
        Parsed listing elements. Each one must support ``.get('id')`` and
        ``.find(tag, class_=...)`` (as BeautifulSoup tags do).
    county : str
        Label written to the 'County' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'County', 'Property ID', 'Price per month', 'Address',
        'Property Description' and 'Date of Listing', one row per element.
        A field whose element (or 'id' attribute) is absent from a listing is
        stored as None, so a single incomplete card no longer aborts the run
        with an AttributeError/KeyError.
    """
    # (tag, CSS class) of the element that holds each scraped text field.
    field_selectors = {
        'Price per month': ('p', '_170k6632 _16fktr6'),
        'Address': ('h3', '_1ankud52 _16fktr9'),
        'Property Description': ('h2', '_1ankud51 _16fktr8'),
        'Date of Listing': ('li', '_18cib8e1'),
    }

    listing_dict = {'County': [], 'Property ID': []}
    for column in field_selectors:
        listing_dict[column] = []

    for result in result_set:
        listing_dict['County'].append(county)
        listing_dict['Property ID'].append(result.get('id'))

        for column, (tag, css_class) in field_selectors.items():
            element = result.find(tag, class_=css_class)
            # `find` returns None when the card has no such element.
            listing_dict[column].append(None if element is None else element.text)

    return pd.DataFrame(listing_dict)


## Function for extracting URLs

In [3]:
def URL_extractor(county):
    """Return the Zoopla to-rent search URL (pets allowed) for `county`.

    "&" in the name is spelled out as "and". The path segment uses the name
    with spaces turned into hyphens; the ``q`` query value is the name
    percent-encoded, so a space becomes ``%20``. (Replacing spaces with a bare
    "%" produces malformed percent-encoding such as ``East%Sussex``.)

    Parameters
    ----------
    county : str
        County name, e.g. "East Sussex".

    Returns
    -------
    str
        URL of the first results page, sorted by lowest price.
    """
    county = county.replace("&", "and")
    path_slug = county.replace(" ", "-")
    query_value = quote(county)
    URL = (
        "https://www.zoopla.co.uk/to-rent/property/" + path_slug
        + "/?pets_allowed=true&price_frequency=per_month&q=" + query_value
        + "&radius=0&results_sort=lowest_price&search_source=to-rent"
    )
    return URL

## Main function for data collection

In [4]:
def _fetch_page_soup(url):
    """Load `url` in a fresh Chrome session and return its parsed page source.

    The browser is shut down in a ``finally`` clause, so no Chrome process is
    left running when loading or parsing fails.
    """
    # NOTE(review): one browser per page mirrors the original behaviour; a
    # single shared driver would be faster if the site tolerates it.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()


def data_extractor(counties):
    """Scrape every results page for each county into one DataFrame.

    For each county the search URL from ``URL_extractor`` is requested page by
    page (``&pn=<page>`` from page 2 onwards). Listings on a page are read
    *before* the pagination check, so a page that has listings but no
    pagination block is still collected. Paging for a county stops when a page
    has no listing elements or no pagination block; this replaces the old
    ``while [curr_status == False]`` loop, whose condition was a non-empty list
    (always true) and which only ended through an IndexError.

    Parameters
    ----------
    counties : iterable of str
        County names accepted by ``URL_extractor``.

    Returns
    -------
    pandas.DataFrame
        Columns 'County', 'Property ID', 'Price per month', 'Address',
        'Property Description' and 'Date of Listing'; empty (same columns)
        when nothing was collected.
    """
    columns = ['County', 'Property ID', 'Price per month', 'Address',
               'Property Description', 'Date of Listing']
    # Per-page frames are gathered here and concatenated once at the end.
    page_frames = []

    for county in counties:
        base_url = URL_extractor(county)
        page = 0

        while True:
            page += 1
            url = base_url if page == 1 else base_url + "&pn=" + str(page)
            response = _fetch_page_soup(url)

            listings = response.find_all('div', class_='f0xnzq2')
            if listings:
                page_frames.append(get_listing_summary(listings, county))

            # NOTE(review): the pagination block's 'aria-disabled' link was
            # read by the old code but never influenced the loop; which link
            # it belongs to is not visible here, so it is not used to stop.
            pagination = response.find_all('div', class_='_13wnc6k2')
            if not listings or not pagination:
                break

        print("All data for {} has been collected".format(county))

    if not page_frames:
        return pd.DataFrame({column: [] for column in columns})
    return pd.concat(page_frames, ignore_index=True)

In [5]:
# Counties to search, in the order they are scraped (grouped by initial letter).
counties = [
    "Bath & N E Somerset", "Bedfordshire", "Berkshire", "Bristol", "Buckinghamshire",
    "Cambridgeshire", "Cheshire", "City of London", "Cornwall", "Cumbria",
    "Derbyshire", "Devon", "Dorset", "Durham",
    "East Riding of Yorkshire", "East Sussex", "Essex",
    "Gloucestershire", "Greater London", "Greater Manchester",
    "Hampshire", "Herefordshire", "Hertfordshire",
    "Isle of Wight",
    "Kent",
    "Lancashire", "Leicestershire", "Lincolnshire",
    "Merseyside",
    "Norfolk", "Northamptonshire", "Northumberland", "North Yorkshire", "Nottinghamshire",
    "Oxfordshire",
    "Rutland",
    "Shropshire", "Somerset", "South Yorkshire", "Staffordshire", "Suffolk", "Surrey",
    "Tyne and Wear",
    "Warwickshire", "West Midlands", "West Sussex", "West Yorkshire", "Wiltshire", "Worcestershire",
]

In [6]:
# Scrape every county in `counties`. data_extractor drives Chrome through
# Selenium, so this cell needs a working browser/driver and network access.
data = data_extractor(counties)

All data for Bath & N E Somerset has been collected
All data for Bedfordshire has been collected
All data for Berkshire has been collected
All data for Bristol has been collected
All data for Buckinghamshire has been collected
All data for Cambridgeshire has been collected
All data for Cheshire has been collected
All data for City of London has been collected
All data for Cornwall has been collected
All data for Cumbria has been collected
All data for Derbyshire has been collected
All data for Devon has been collected
All data for Dorset has been collected
All data for Durham has been collected
All data for East Riding of Yorkshire has been collected
All data for East Sussex has been collected
All data for Essex has been collected
All data for Gloucestershire has been collected
All data for Greater London has been collected
All data for Greater Manchester has been collected
All data for Hampshire has been collected
All data for Herefordshire has been collected
All data for Hertfordshir

In [8]:
# Persist the scraped listings. index=False leaves out the auto-generated row
# numbers, which would otherwise be written as an unnamed first column.
data.to_csv("listings_data.csv", index=False)

## Which 5 counties have the most pet-friendly listings?

In [7]:
# Names of the five counties with the most listings, most frequent first.
data['County'].value_counts().index[:5].tolist()

['West Midlands', 'Greater Manchester', 'Surrey', 'Kent', 'West Yorkshire']

## Next steps

##### Carry out feature extraction on 'Property Description' column to create new columns:
- Property Type: e.g. Flat, House, etc
- Property Design: e.g. Detached, Terraced, etc
- Number of Bedrooms
- etc
- Use this data to find the distribution of property types across the top 5 counties
##### Visualisations:
- Find a way to visualise heatmap of rental properties on a map, segmented by counties
- Pricing trends across regions
- Basic visualisations for top 5 counties and distribution plots
##### EDA:
- Look for repeated listings and for listings missed by the searches: searching for properties in 'England' produces over 4000 results
- Extract postcodes and use them to classify listings by region, instead of searching by county

In [9]:
# Display the scraped listings table.
data

Unnamed: 0,County,Property ID,Price per month,Address,Property Description,Date of Listing
0,Bath & N E Somerset,listing_64794678,£825 pcm,"Dudwell Lane, Chewton Mendip, Nr Radstock BA3",2 bed bungalow to rent,Listed on 5th Jun 2023
1,Bath & N E Somerset,listing_64829362,"£1,795 pcm","The Hollow, Bath BA2",2 bed semi-detached house to rent,Listed on 9th Jun 2023
2,Bath & N E Somerset,listing_61754783,"£2,000 pcm","Norfolk Buildings, Bath BA1",1 bed flat to rent,Listed on 14th Dec 2022
3,Bath & N E Somerset,listing_64697636,"£2,250 pcm","Old Brewery Place, Oakhill, Nr Radstock BA3",4 bed property to rent,Listed on 23rd May 2023
4,Bath & N E Somerset,listing_60256022,"£2,400 pcm","Norfolk Buildings, Bath BA1",1 bed flat to rent,Listed on 6th Apr 2023
...,...,...,...,...,...,...
2756,Worcestershire,listing_62407301,"£1,650 pcm","Solent Place, Evesham WR11",4 bed detached house to rent,Listed on 26th Apr 2023
2757,Worcestershire,listing_64881285,"£2,200 pcm","Gilberts End Lane, Hanley Castle, Worcester WR8",4 bed detached house to rent,Listed on 15th Jun 2023
2758,Worcestershire,listing_64317465,"£2,250 pcm","Crown East Lane, Worcester WR2",3 bed semi-detached house to rent,Listed on 2nd Apr 2023
2759,Worcestershire,listing_64651338,"£2,250 pcm","Wick Road, Little Comberton, Pershore WR10",4 bed detached house to rent,Listed on 17th May 2023
