* Transaction type (i.e. sale vs. rent - string)
* Bedrooms (integer)
* Bathrooms (integer)
* Description (free text string)
* Property type e.g. flat, detached house, terraced house
* Price e.g. 500,000 (typically integer)
* Location:Key location data here is Postcode district and/or PostcodeSee link for explanation of what these are
* Agent (advertising the property)
* Listing source
* Listing URL
* Other nice-to-have metadataIf a rental property is furnished or not Anything else you deem interesting

### Scrape Properties data from rightmove.com

In [382]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import datetime



rm_salesurl = "https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&index={}&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords="
rm_renturl = "https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&index={}&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords="


def get_driver():
    # path to chrome driver on my pc
    driver = webdriver.Chrome(r'C:\Windows\chromedriver.exe')
    return driver


def get_pages(driver,page,url):
    driver.get(url.format(page))
    OM_DIV_TAG = 'propertyCard-wrapper'
    page_html = driver.find_elements(By.CLASS_NAME, OM_DIV_TAG)
    return page_html


def parse_pages(page_html:'page_html', transaction_type:str, source:str):
    """
    
    """

    page_data = []

    for page in page_html:
        # Transaction

        try:
            if transaction_type == 'rent':
                transaction = transaction_type

            elif transaction_type == 'sales':
                transaction = transaction_type

            else:
                print('transaction_type can either be sales or rent')
                break

        except:
                print('transaction_type can either be sales or rent')
                break


        
        # Address
        try:
            address_tag = page.find_element(By.CLASS_NAME, "propertyCard-address")
            address = address_tag.text
        except:
            address = ''

        # Bedroom
        try:         
            bedroom_element = page.find_element(By.CLASS_NAME,"propertyCard-content")
            bedroom_span = bedroom_element.find_element(By.CLASS_NAME, "bed-icon")
            inner_html = bedroom_span.get_attribute('innerHTML')

            # use string formatting
            title_start = "<title>"
            title_end = "</title>"
            title_index_start = inner_html.index(title_start) + len(title_start)
            title_index_end = inner_html.index(title_end)
            bedroom = inner_html[title_index_start:title_index_end][0:1]    

            #.text.split("\n")[-1].strip()
            # bedroom = bedroom_span.get_attribute('innerHTML')
            # print(bedroom)
            # # print(bedroom_span.get_attribute('title')        
        
        except:
            bedroom = ''
     
       
        
        # Bathroom
        try:
            
            bathroom_tag =page.find_element(By.CLASS_NAME,"propertyCard-content")
            bathroom_span = bathroom_tag.find_element(By.CLASS_NAME, "bathroom-icon")
            inner_html2 = bathroom_span.get_attribute('innerHTML')

            # use string formatting
            title_start = "<title>"
            title_end = "</title>"
            title_index_start = inner_html2.index(title_start) + len(title_start)
            title_index_end = inner_html2.index(title_end)
            bathroom = inner_html2[title_index_start:title_index_end][0:1]    

        except:
            bathroom =''


        # Description
        try:
            description_tag = page.find_element(By.CLASS_NAME, 'propertyCard-description')
            description = description_tag.text

        except:
            description = ''


        # property Type
        try:
            property_type_tag =page.find_element(By.CLASS_NAME, 'property-information')
            property_type =property_type_tag.text.split("\n")[0].strip()

        except:
            property_type = ''

  
        # rent payment
        if transaction_type == 'rent':

            sales_price = ' '
            
            # rent price per month
            try:
                pcm = page.find_element(By.CLASS_NAME, 'propertyCard-priceValue')
                per_month = pcm.text.split(" ")[0].strip()
                
            except:
                per_month = ''
                
            # rent price per week
            try:
                pw = page.find_element(By.CLASS_NAME, 'propertyCard-secondaryPriceValue')
                per_week = pw.text.split(" ")[0].strip()

            except:
                per_week = ''               

        else:
            # sales Price
            try:
                per_week = ''  
                per_month = ''
                
                price_tag = page.find_element(By.CLASS_NAME, 'propertyCard-priceValue')
                sales_price = price_tag.text.split(" ")[0].strip()
                
            except:
                sales_price = ' '

                

        # Location
        try:
            location_tag = page.find_element(By.CLASS_NAME, 'propertyCard-address')
            location = location_tag.text.split(",")[-1].strip()
        
        except:
            location =''
    

        # Agent
        try: 
            agent_tag =page.find_element(By.CLASS_NAME, 'propertyCard-branchSummary')
            # print(agent_tag.get_attribute('innerHTML'))
            agent = agent_tag.text.split("by")[-1].strip()

        except:
            agent = ''

        #Listing Source
        listing_source = source

        # Listing URL
        try:
            listing_url_tag =page.find_element(By.CLASS_NAME, 'propertyCard-link')
            listing_url = listing_url_tag.get_attribute('href')

        except:
            listing_url = ''
        

        
        # Date Added
        try:
            date_tag = page.find_element(By.CLASS_NAME, 'propertyCard-branchSummary-addedOrReduced')
            # print(date_tag.get_attribute('innerHTML'))
            added_reduced = date_tag.text
            date_type = date_tag.text.split(" ")[0].strip()

            # date
            if added_reduced == 'Added today':
                date = datetime.date.today()
                
            elif added_reduced== 'Added yesterday':
                date = datetime.date.today() - timedelta(days=1)

            elif added_reduced== 'Reduced today':
                date = datetime.date.today()

            elif added_reduced== 'Reduced yesterday':
                date = datetime.date.today() - timedelta(days=1)

            else:
                date = date_tag.text.split()[-1].strip()
        except:
            date = ' '
            date_type = ' '      


        page_data.append({
            'transaction': transaction,
            'address': address,
            'bedroom': bedroom,
            'bathroom': bathroom,
            'sales_price': sales_price,
            'rent_perMonth': per_month,
            'rent_perWeek': per_week,
            'description': description,
            'propertyType': property_type,
            'location':location,
            'agent':agent,
            'listing_source':listing_source,
            'listing_url':listing_url,
            'date_type':date_type,
            'date':date,
            })

    return page_data


def get_data(url,transaction_type,source,start_index, stop_index,increment):
    browser = get_driver()
    all_pages_data = []

    for page in range(start_index, stop_index,increment):
        page_html = get_pages(browser,page,url)
        pages_data = parse_pages(page_html,transaction_type, source)
        all_pages_data.extend(pages_data)

    browser.quit()

    data = pd.DataFrame(all_pages_data)
    return data

# if __name__ == "__main___":
# # Specify the start and end page numbers for scraping
# start_index = 0
# stop_index = 24
# increment = 24    

# # Call the get_data function to scrape the data
# data = get_data(rm_renturl,'rent','rightmove',start_index, stop_index,increment)

# # Print the scraped data
# len(data)    

In [380]:
# Specify the start and end page numbers for scraping
start_index = 0
stop_index = 24
increment = 24    

# Call the get_data function to scrape the data
data = get_data(rm_renturl,'rent','rightmove',start_index, stop_index,increment)

data.head()

  


Unnamed: 0,transaction,address,bedroom,bathroom,sales_price,rent_perMonth,rent_perWeek,description,propertyType,location,agent,listing_source,listing_url,date_type,date
0,rent,"Dorset House, Gloucester Place, Marylebone, Lo...",3,2,,"£5,200","£1,200",*** VIDEO VIEWING AVAILABLE *** Smartly modern...,Apartment,NW1,"Parkes Estate Agents, Kensington",rightmove,https://www.rightmove.co.uk/properties/1347760...,Reduced,14/04/2023
1,rent,"Zachary House, London, SW9",1,1,,"£2,000",£462,"We are proud to offer this modern 1 bedroom, 1...",Flat,SW9,"OpenRent, London",rightmove,https://www.rightmove.co.uk/properties/1346130...,Reduced,2023-05-17
2,rent,Newham London E16,1,1,,"£1,880",£434,Available 8th Aug | Fully Furnished | Private ...,Apartment,Newham London E16,"Grainger Plc, South",rightmove,https://www.rightmove.co.uk/properties/1349249...,Added,2023-05-17
3,rent,"Television Centre, 101 Wood Lane, London, W12",2,2,,"£3,200",£738,Moving City are excited to offer this (826 sq ...,Apartment,W12,"Moving City, London",rightmove,https://www.rightmove.co.uk/properties/1326757...,Reduced,2023-05-17
4,rent,"Cranmer Court, London, SW3",3,3,,"£5,850","£1,350",Photos coming soon - contact us for What's App...,Flat,SW3,"Dolce Vita, Mayfair",rightmove,https://www.rightmove.co.uk/properties/1349249...,Added,2023-05-17


In [383]:
# Specify the start and end page numbers for scraping
start_index = 0
stop_index = 24
increment = 24    

# Call the get_data function to scrape the data
data = get_data(rm_salesurl,'sales','rightmove',start_index, stop_index,increment)

data.head()

  


Unnamed: 0,transaction,address,bedroom,bathroom,sales_price,rent_perMonth,rent_perWeek,description,propertyType,location,agent,listing_source,listing_url,date_type,date
0,sales,"Ironworks Way, London E13",1,1,"£325,000",,,"A stunning modern apartment, with a delightful...",Flat,London E13,"ludlowthompson, Bow - Sales",rightmove,https://www.rightmove.co.uk/properties/1130464...,Reduced,2023-05-17
1,sales,"One Hyde Park, Knightsbridge",5,5,"£60,000,000",,,An exceptional exclusive five bedroom apartmen...,Apartment,Knightsbridge,"The Cloister, London",rightmove,https://www.rightmove.co.uk/properties/1301776...,Added,22/12/2022
2,sales,"Lygon Place, Belgravia, SW1W",7,9,"£45,000,000",,,Ref. LOB0798 - Set behind a gated Belgravia dr...,House,SW1W,"Beauchamp Estates Ltd, Mayfair - Resale",rightmove,https://www.rightmove.co.uk/properties/1293029...,Added,21/11/2022
3,sales,"Mayfair Freehold House, Park Lane Area, W1K",1,1,"£45,000,000",,,This stunning 12 bedroom (including 3 staff be...,House,W1K,"Luxury Living Homes International, London",rightmove,https://www.rightmove.co.uk/properties/1303065...,Added,31/12/2022
4,sales,"Pitt Street, London, W8",7,5,"£44,000,000",,,"A rare opportunity to own this unique, contemp...",Detached,W8,"Savills, Kensington",rightmove,https://www.rightmove.co.uk/properties/1318464...,Added,20/02/2023


### Scrape Properties data from Zoopla

In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import datetime
from datetime import datetime



zrent_url = 'https://www.zoopla.co.uk/to-rent/property/london/?price_frequency=per_month&q=london&results_sort=newest_listings&search_source=to-rent&pn={}'
zsales_url = 'https://www.zoopla.co.uk/for-sale/property/london/?price_frequency=per_month&q=london&results_sort=newest_listings&search_source=to-rent&pn={}'

def get_driver():
    # path to chrome driver on my pc
    driver = webdriver.Chrome(r'C:\Windows\chromedriver.exe')
    return driver

def get_pages(driver,page,url):
    driver.get(url.format(page))
    OM_DIV_TAG = 'kii3au6'
    page_html = driver.find_elements(By.CLASS_NAME, OM_DIV_TAG)
    return page_html

def parse_pages(page_html:'page_html', transaction_type:str, source:str):
    """
    
    """

    page_data = []

    for page in page_html:
        # Transaction

        try:
            if transaction_type == 'rent':
                transaction = transaction_type

            elif transaction_type == 'sales':
                transaction = transaction_type

            else:
                print('transaction_type can either be sales or rent')
                break

        except:
                print('transaction_type can either be sales or rent')
                break


        
        # Address
        try:
            address_tag = page.find_element(By.CLASS_NAME, "_1ankud52")
            address = address_tag.text
        except:
            address = ''

        # Bedroom
        try:         
            
            bedroom_element = page.find_element(By.CLASS_NAME,"_1ljm00u3z    ")
            if bedroom_element.text.split("\n")[0].strip() == 'Bedrooms':
                bedroom = bedroom_element.text.split("\n")[1].strip()
            else:
                bedroom = ''

        except:
            bedroom = ''
     
       
        
        # Bathroom
        try:
            
            bathroom_tag = page.find_element(By.CLASS_NAME,"_1ljm00u3z    ")
            if bathroom_tag.text.split("\n")[0].strip() == 'Bathrooms':
                bathroom = bathroom_tag.text.split("\n")[1].strip()

            elif bathroom_tag.text.split("\n")[2].strip() == 'Bathrooms':
                bathroom = bathroom_tag.text.split("\n")[3].strip()

            else:
                bathroom = ''
        except:
            bathroom =''


        # Living room

        try:
            
            livingroom_tag = page.find_element(By.CLASS_NAME,"_1ljm00u3z    ")
            if livingroom_tag.text.split("\n")[0].strip() == 'Living rooms':
                living_room = livingroom_tag.text.split("\n")[1].strip()

            elif livingroom_tag.text.split("\n")[2].strip() == 'Living rooms':
                living_room = livingroom_tag.text.split("\n")[3].strip()

            elif livingroom_tag.text.split("\n")[4].strip() == 'Living rooms':
                living_room = livingroom_tag.text.split("\n")[5].strip()

            else:
                living_room = ''
        except:
            living_room =''


        # Description
        try:
            description_tag = page.find_element(By.CLASS_NAME, '_1ankud53')
            description = description_tag.text

        except:
            description = ''


        # property Type
        try:
            property_type_tag =page.find_element(By.CLASS_NAME, '_1ankud51')
            property_type = property_type_tag.text.split("\n")[0].strip()

        except:
            property_type = ''

  
        # rent payment
        if transaction_type == 'rent':

            sales_price = ' '
            
            # rent price per month
            try:
                pcm = page.find_element(By.CLASS_NAME, '_170k6632')
                per_month = pcm.text.split(" ")[0].strip()
                
            except:
                per_month = ''
                
            # rent price per week
            try:
                pw = page.find_element(By.CLASS_NAME, '_170k6633')
                per_week = pw.text.split(" ")[0].strip()

            except:
                per_week = ''               

        else:
            # sales Price
            try:
                per_week = ''  
                per_month = ''
                
                price_tag = page.find_element(By.CLASS_NAME, '_170k6632 ')
                sales_price = price_tag.text.split(" ")[0].strip()
                
            except:
                sales_price = ' '

                

        # Location
        try:
            location_tag = page.find_element(By.CLASS_NAME, '_1ankud52')
            location = location_tag.text.split(",")[-1].strip()
        
        except:
            location =''
    

        # Agent
        try: 
            agent_tag =page.find_element(By.CLASS_NAME, '_12bxhf70')
            # print(agent_tag.get_attribute('innerHTML'))
            agent = agent_tag.get_attribute('alt')
            
        except:
            agent = ''


        #Listing Source
        listing_source = source

        # Listing URL
        try:
            listing_url_tag =page.find_element(By.CLASS_NAME, '_1maljyt1')
            listing_url = listing_url_tag.get_attribute('href')

        except:
            listing_url = ''
        

        
        # Date Added
        try:
            date_tag = page.find_element(By.CLASS_NAME, '_18cib8e1')
            date_string = date_tag.text.split(" on ")[-1].strip()
            listed_date = datetime.strptime(date_string,"%dth %B %Y").strftime("%d-%m-%Y")
            
        except:
            listed_date = ' '
                  


        # page_data.append({
        #     'transaction': transaction,
        #     'address': address,
        #     'bedroom': bedroom,
        #     'bathroom': bathroom,
        #     'sales_price': sales_price,
        #     'rent_perMonth': per_month,
        #     'rent_perWeek': per_week,
        #     'description': description,
        #     'propertyType': property_type,
        #     'location':location,
        #     'agent':agent,
        #     'listing_source':listing_source,
        #     'listing_url':listing_url,
        #     'date_type':date_type,
        #     'date':date,
        #     })
            
            
            
        page_data.append({
            'transaction': transaction,
            'address': address,
            'bedroom': bedroom,
            'bathroom': bathroom,
            'living_room': living_room,
            'sales_price': sales_price,
            'rent_perMonth': per_month,
            'rent_perWeek': per_week,
            'description': description,
            'propertyType': property_type,
            'location':location,
            'agent':agent,
            'listing_source':listing_source,
            'listing_url':listing_url,
            'listed_date': listed_date,
            })

    return page_data


    # return page_data


def get_data(url,transaction_type,source,start_page, end_page):
    browser = get_driver()
    all_pages_data = []

    for page in range(start_page, end_page+1):
        page_html = get_pages(browser,page,url)
        pages_data = parse_pages(page_html,transaction_type, source)
        all_pages_data.extend(pages_data)

    browser.quit()

    data = pd.DataFrame(all_pages_data)
    return data


In [40]:
# Specify the start and end page numbers for scraping
start_page = 1
end_page = 1

# Call the get_data function to scrape the data
rent_data = get_data(zrent_url,'rent','zoopla',start_page, end_page)

# Print the scraped data
rent_data.head()

  


Unnamed: 0,transaction,address,bedroom,bathroom,living_room,sales_price,rent_perMonth,rent_perWeek,description,propertyType,location,agent,listing_source,listing_url,listed_date
0,rent,"Hilldrop Crescent, Tufnell Park N7",2,1.0,1.0,,"£1,993",£460,A delightful two double bedroom apartment loca...,2 bed terraced house to rent,Tufnell Park N7,Primden Group,zoopla,https://www.zoopla.co.uk/to-rent/details/64651...,17-05-2023
1,rent,"Abingdon Road, London W8",4,,,,"£21,450","£4,950","An impressive and spectacularly designed, four...",4 bed detached house to rent,London W8,Strutt & Parker - Kensington,zoopla,https://www.zoopla.co.uk/to-rent/details/61853...,17-05-2023
2,rent,"Thrawl Street, Spitalfields, London E1",1,1.0,1.0,,"£2,650",£612,Superb warehouse apartment situated within thi...,1 bed flat to rent,London E1,Stirling Ackroyd - Shoreditch,zoopla,https://www.zoopla.co.uk/to-rent/details/58171...,17-05-2023
3,rent,"Sophora House, 342 Queenstown Road, London SW11",2,2.0,1.0,,"£3,683",£850,Stunning two bedroom apartment located in the ...,2 bed flat to rent,London SW11,Stirling Ackroyd - Nine Elms and Westminster,zoopla,https://www.zoopla.co.uk/to-rent/details/64651...,17-05-2023
4,rent,"Thrawl Street, Spitalfields, London E1",2,2.0,1.0,,"£3,750",£865,Superb warehouse apartment situated within thi...,2 bed flat to rent,London E1,Stirling Ackroyd - Shoreditch,zoopla,https://www.zoopla.co.uk/to-rent/details/58171...,17-05-2023


In [41]:
# Specify the start and end page numbers for scraping
start_page = 1
end_page = 1

# Call the get_data function to scrape the data
rent_data = get_data(zsales_url,'sales','zoopla',start_page, end_page)

# Print the scraped data
rent_data.head()

  


Unnamed: 0,transaction,address,bedroom,bathroom,living_room,sales_price,rent_perMonth,rent_perWeek,description,propertyType,location,agent,listing_source,listing_url,listed_date
0,sales,"Eastry Road, Erith, Kent DA8",3,1,2,"£450,000",,,Being sold with no chain with the opportunity ...,3 bed semi-detached house for sale,Kent DA8,Robinson Jackson - North Heath,zoopla,https://www.zoopla.co.uk/for-sale/details/6465...,17-05-2023
1,sales,"Parkspring Court, 102 High Street, Erith, Kent...",2,2,1,"£148,500",,,** 55% shared ownership ** spacious two bedroo...,2 bed flat for sale,Kent DA8,Robinson Jackson - North Heath,zoopla,https://www.zoopla.co.uk/for-sale/details/6465...,17-05-2023
2,sales,"Alie Street, London E1",2,2,1,"£725,000",,,A bright and modern two double bedroom apartme...,2 bed flat for sale,London E1,Hurford Salvi Carr - Aldgate,zoopla,https://www.zoopla.co.uk/for-sale/details/6465...,17-05-2023
3,sales,"District Court, 26 Commercial Road E1",2,1,1,"£847,000",,,This outstanding two-bedroom modern apartment ...,2 bed flat for sale,26 Commercial Road E1,Hurford Salvi Carr - Aldgate,zoopla,https://www.zoopla.co.uk/for-sale/details/6121...,17-05-2023
4,sales,"Pont Street, Knightsbridge, London SW1X",1,2,2,"£1,795,000",,,A fabulous one bedroom plus study (approximate...,1 bed flat for sale,London SW1X,Harrods Estates,zoopla,https://www.zoopla.co.uk/for-sale/details/6187...,17-05-2023


### Scrape Properties from onthemarket

In [91]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import json
import datetime
from datetime import datetime


# link to youtube trending page
om_renturl  = 'https://www.onthemarket.com/to-rent/property/london/?page={}&view=grid'
om_salesurl  = 'https://www.onthemarket.com/for-sale/property/london/?page={}&view=grid'


def get_driver():
    # path to chrome driver on my pc
    driver = webdriver.Chrome(r'C:\Windows\chromedriver.exe')
    return driver
    
def get_properties(driver):
    driver.get(om_renturl)
    OM_DIV_TAG = 'search-results'
    properties = driver.find_elements(By.CLASS_NAME, OM_DIV_TAG)
    return properties


def parse_properties(properties): 
    """title , url, thumbmail, channel, view
      uploaded, description"""
    
    # transcation 
    transcation  = ''

    # address
    address_tag = properties.find_element(By.CLASS_NAME, 'address')
    address = address_tag.text
    

    # bedroom
    bedroom_tag = properties.find_element(By.CLASS_NAME, 'otm-BedBathCount')
    bedroom = bedroom_tag.text.split("\n")[0].strip()
    
    
    # bathroom
    bathroom_tag = properties.find_element(By.CLASS_NAME, 'otm-BedBathCount')
    bathroom = bathroom_tag.text.split("\n")[-1].strip()


    # description
    description_tag = properties.find_element(By.CLASS_NAME, 'days-otm')
    description = description_tag.text

    # propertyType
    propertyType_tag = properties.find_element(By.CLASS_NAME, 'title')
    property_type = propertyType_tag.text


    # price
    price_tag = properties.find_element(By.CLASS_NAME, 'otm-Price')
    price = price_tag.text.split("\n")[-1].strip()
    pcm = price.split("pcm")[0].strip()
    pw = price.split("pcm")[-1].strip().split("£")[1].split(" ")[0]

    #

    # location
    location_tag = properties.find_element(By.CLASS_NAME, 'address')
    location = location_tag.text.split(",")[-1].strip()

    # agent
    agent_tag = properties.find_element(By.CLASS_NAME, 'lazyload-wrapper ')
    agent =  agent_tag.get_attribute('alt')
      
    # listing_source
    listing_source = ''
    

    listing_url_tag = properties.find_element(By.CLASS_NAME, 'agent-logo')
    listing_url = listing_url_tag.get_attribute('a')
    print(listing_url)
    # listing_url  = 'https://www.rightmove.co.uk' + listing_url

    # date_added
    date_added_tag = properties.find_element(By.CLASS_NAME, 'days-otm')
    date_added = date_added_tag.text

    


    return{
        'transcation':transcation,
        'address': address ,
        'bedroom': bedroom,
        'bathroom': bathroom,
        'description': description,
        'propertyType': property_type ,
        'price per month': pcm ,
        'price per week': pw,
        'location': location,
        'agent': agent,
        'listing_source': listing_source,
        'listing_url': listing_url,
        'date_added': date_added,
        }

# browser = get_driver()
# properties = get_properties(browser)
# properties_data = [parse_properties(property) for property in properties]

  del sys.path[0]


None
[{'transcation': '', 'address': 'Three Quays Apartments, 40 Lower Thames Street, London, EC3R', 'bedroom': '2', 'bathroom': '2', 'description': 'OnTheMarket > 14 days', 'propertyType': '2 bedroom apartment to rent', 'price per month': '£9,533', 'price per week': '2,200', 'location': 'EC3R', 'agent': None, 'listing_source': '', 'listing_url': None, 'date_added': 'OnTheMarket > 14 days'}]


In [108]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd


om_renturl_template = 'https://www.onthemarket.com/to-rent/property/london/?page={}&view=grid'


def get_driver():
    # path to chrome driver on my pc
    driver = webdriver.Chrome(r'C:\Windows\chromedriver.exe')
    return driver


def get_properties(driver,page):
    driver.get(om_renturl_template.format(page))
    OM_DIV_TAG = 'search-results'
    properties = driver.find_elements(By.CLASS_NAME, OM_DIV_TAG)
    return properties


def parse_properties(properties):
    """
    title, url, thumbnail, channel, view
    uploaded, description
    """

    property_data = []

    for property in properties:
        # Transaction
        transaction = ''

        # Address
        address_tag = property.find_element(By.CLASS_NAME, 'address')
        address = address_tag.text

        # Bedroom
        bedroom_tag = property.find_element(By.CLASS_NAME, 'otm-BedBathCount')
        bedroom = bedroom_tag.text.split("\n")[0].strip()

        # Bathroom
        bathroom_tag = property.find_element(By.CLASS_NAME, 'otm-BedBathCount')
        bathroom = bathroom_tag.text.split("\n")[-1].strip()

        # Description
        description_tag = property.find_element(By.CLASS_NAME, 'days-otm')
        description = description_tag.text

        # Property Type
        property_type_tag = property.find_element(By.CLASS_NAME, 'title')
        property_type = property_type_tag.text

        # Price
        price_tag = property.find_element(By.CLASS_NAME, 'otm-Price')
        price = price_tag.text.split("\n")[-1].strip()
        pcm = price.split("pcm")[0].strip()
        pw = price.split("pcm")[-1].strip().split("£")[1].split(" ")[0]

        # Location
        location_tag = property.find_element(By.CLASS_NAME, 'address')
        location = location_tag.text.split(",")[-1].strip()

        # Agent
        agent_tag = property.find_element(By.CLASS_NAME, 'lazyload-wrapper ')
        agent = agent_tag.get_attribute('alt')

        # Listing Source
        listing_source = ''

        # Listing URL
        listing_url_tag = property.find_element(By.CLASS_NAME, 'agent-logo')
        listing_url = listing_url_tag.get_attribute('a')
        # print(listing_url)
        # listing_url = 'https://www.rightmove.co.uk' + listing_url

        # Date Added
        date_added_tag = property.find_element(By.CLASS_NAME, 'days-otm')
        date_added = date_added_tag.text

        property_data.append({
            'transaction': transaction,
            'address': address,
            'bedroom': bedroom,
            'bathroom': bathroom,
            'description': description,
            'propertyType': property_type,
            'price per month': pcm,
            'price per week': pw,
            'location': location,
            'agent': agent,
            'listing_source': listing_source,
            'listing_url': listing_url,
            'date_added': date_added,
        })

    return property_data


def get_data(start_page, end_page):
    browser = get_driver()
    all_properties_data = []

    for page in range(start_page, end_page + 1):
        properties = get_properties(browser,page)
        properties_data = parse_properties(properties)
        all_properties_data.extend(properties_data)

    browser.quit()

    data = pd.DataFrame(all_properties_data)
    return data

In [102]:
# Specify the start and end page numbers for scraping
start_page = 1
end_page = 3

# Call the get_data function to scrape the data
data = get_data(start_page, end_page)

# Print the scraped data
data

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,transaction,address,bedroom,bathroom,description,propertyType,price per month,price per week,location,agent,listing_source,listing_url,date_added
0,,"St. Dionis Road, Fulham, London, SW6",4,4,OnTheMarket > 14 days,4 bedroom terraced house to rent,"£31,417",7250,SW6,,,,OnTheMarket > 14 days
1,,"MOUNTVIEW CLOSE, HAMPSTEAD GARDEN SUBURB, NW11",5,4,OnTheMarket > 14 days,5 bedroom semi-detached house to rent,"£12,003",2770,NW11,,,,OnTheMarket > 14 days
2,,"Schoolbell Mews, Bow, London, E3",3,2,OnTheMarket yesterday,3 bedroom flat to rent,"£3,000",692,E3,,,,OnTheMarket yesterday


In [106]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd


om_renturl_template = 'https://www.onthemarket.com/to-rent/property/london/?page={}&view=grid'


def get_driver():
    # path to chrome driver on my pc
    driver = webdriver.Chrome(r'C:\Windows\chromedriver.exe')
    return driver


def get_properties(driver,page):
    driver.get(om_renturl_template.format(page))
    OM_DIV_TAG = 'otm-PropertyCardInfo'
    properties = driver.find_elements(By.CLASS_NAME, OM_DIV_TAG)
    return properties


def parse_properties(properties):
    
    
    







    property_data = []

    for property in properties:
        # Transaction
        transaction = ''

        # Address
        address_tag = property.find_element(By.CLASS_NAME, 'address')
        address = address_tag.text

        # Bedroom
        
        bedroom_tag = property.find_element(By.CLASS_NAME, 'otm-BedBathCount')
        bedroom = bedroom_tag.text.split("\n")[0].strip()

        # Bathroom
        bathroom_tag = property.find_element(By.CLASS_NAME, 'otm-BedBathCount')
        bathroom = bathroom_tag.text.split("\n")[-1].strip()

        # # Description
        # description_tag = property.find_element(By.CLASS_NAME, 'days-otm')
        # description = description_tag.text

        # # Property Type
        # property_type_tag = property.find_element(By.CLASS_NAME, 'title')
        # property_type = property_type_tag.text

        # # Price
        # price_tag = property.find_element(By.CLASS_NAME, 'otm-Price')
        # price = price_tag.text.split("\n")[-1].strip()
        # pcm = price.split("pcm")[0].strip()
        # pw = price.split("pcm")[-1].strip().split("£")[1].split(" ")[0]

        # # Location
        # location_tag = property.find_element(By.CLASS_NAME, 'address')
        # location = location_tag.text.split(",")[-1].strip()

        # # Agent
        # agent_tag = property.find_element(By.CLASS_NAME, 'lazyload-wrapper ')
        # agent = agent_tag.get_attribute('alt')

        # # Listing Source
        # listing_source = ''

        # # Listing URL
        # listing_url_tag = property.find_element(By.CLASS_NAME, 'agent-logo')
        # listing_url = listing_url_tag.get_attribute('a')
        # # print(listing_url)
        # # listing_url = 'https://www.rightmove.co.uk' + listing_url

        # # Date Added
        # date_added_tag = property.find_element(By.CLASS_NAME, 'days-otm')
        # date_added = date_added_tag.text

        property_data.append({
            'transaction': transaction,
            'address': address,
            'bedroom': bedroom,
            'bathroom': bathroom,
        })

    return property_data


def get_data(start_page, end_page):
    browser = get_driver()
    all_properties_data = []

    for page in range(start_page, end_page + 1):
        properties = get_properties(browser,page)
        properties_data = parse_properties(properties)
        all_properties_data.extend(properties_data)

    browser.quit()

    data = pd.DataFrame(all_properties_data)
    return data

# Specify the start and end page numbers for scraping
start_page = 1
end_page = 3

# Call the get_data function to scrape the data
data = get_data(start_page, end_page)

# Print the scraped data
data

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,transaction,address,bedroom,bathroom
0,,"Admiralty House, 150 Vaughan Way, London, E1W",2,2
1,,The Compton NW8,2,2
2,,"Hill Street, Mayfair, W1",2,1
3,,"Wakeman Road, London",3,2
4,,"Circus Apartments, Westferry Circus, E14",2,2
...,...,...,...,...
64,,"Villiers Road, Southall",1,1
65,,"Grove Road, Hounslow",1,1
66,,"Wandsworth Road, Nine Elms, London, SW8",2,2
67,,"Keybridge, Vauxhall, London, SW8",2,2
