In [1]:
import csv
import os
import pickle
import re
from typing import List

import bs4
import psycopg2 as pg
import requests
from bs4 import BeautifulSoup

In [2]:
class Database(object):
    """Small wrapper around a single psycopg2 connection.

    Connection settings default to the values this notebook has always used
    and can be overridden with the ``PGPASSWORD``, ``PGDATABASE`` and
    ``PGUSER`` environment variables.
    """

    def __init__(self):
        # FIXME: the fallback password is still a hard-coded secret. Set
        # PGPASSWORD in the environment and drop the default once every
        # environment that runs this notebook provides it.
        self.__password = os.environ.get("PGPASSWORD", "Nkopuruk")
        self.__dbname = os.environ.get("PGDATABASE", "postgres")
        self.__user = os.environ.get("PGUSER", "postgres")
        self.__connected_to_db = False
        self.__conn = None

    @property
    def database_status(self):
        """True once a connection has been opened and not yet closed."""
        return self.__connected_to_db

    @database_status.setter
    def database_status(self, value):
        self.__connected_to_db = value

    def setup_database_connection(self):
        """Open the connection; on failure print the error and leave the state unchanged."""
        try:
            self.__conn = pg.connect(database=self.__dbname, user=self.__user,
                                     password=self.__password)
        except pg.Error as error:
            print(error)
            print("Unable to connect to database")
        else:
            self.database_status = True

    def get_connection(self):
        """Return the underlying connection, or None when none is open."""
        return self.__conn

    def query(self, query, params=None) -> "Optional[List[tuple]]":
        """Execute one statement and commit it.

        Args:
            query: SQL text. Use ``%s`` placeholders together with ``params``
                rather than formatting values into the string.
            params: optional sequence (or mapping) of values bound by the
                driver. When omitted the statement is executed as written.

        Returns:
            The fetched rows when the statement produces a result set,
            otherwise None. None is also returned when the statement fails;
            in that case the transaction is rolled back and the error printed.

        Raises:
            RuntimeError: if no connection has been opened.
        """
        if self.__conn is None:
            raise RuntimeError("Database connection is not available; "
                               "call setup_database_connection() first")

        cursor = self.__conn.cursor()
        try:
            cursor.execute(query, params)
        except pg.Error as error:
            self.__conn.rollback()
            # Surface the failure instead of dropping the statement silently.
            print(f"Query failed and was rolled back: {error}")
            return None
        else:
            self.__conn.commit()
            if cursor.description is not None:
                return cursor.fetchall()
            return None
        finally:
            # Runs on every path, so the cursor is never leaked.
            cursor.close()

    def close_connection(self):
        """Close the connection if one is open; safe to call more than once."""
        if self.__conn is not None:
            self.__conn.close()
            self.__conn = None
        self.database_status = False

In [3]:
class Content:
    """Holds the raw values scraped for a single property listing.

    The constructor stores its arguments unchanged. The building type is not
    captured yet.
    """

    def __init__(self, title, url, address, no_of_bedrooms,
                 no_of_bathrooms, no_of_toilets, agent_contact, price,
                 description):
        # Identification of the listing.
        self.title, self.url, self.address = title, url, address
        # Room counts.
        self.no_of_bedrooms = no_of_bedrooms
        self.no_of_bathrooms = no_of_bathrooms
        self.no_of_toilets = no_of_toilets
        # Commercial details.
        self.agent_contact, self.price = agent_contact, price
        self.description = description

    def print(self):
        """Write one labelled line per field to stdout (the title is not included)."""
        report = (
            f"URL listing:\t{self.url}",
            f"Address:\t{self.address}",
            f"No of bedrooms:\t {self.no_of_bedrooms}",
            f"No of bathrooms:\t {self.no_of_bathrooms}",
            f"No of toilets:\t{self.no_of_toilets}",
            f"Agent contact:\t{self.agent_contact}",
            f"Price: \t {self.price}",
            f"Description: \t {self.description}",
        )
        for line in report:
            print(line)


In [4]:
class Website:
    """Describes how one listing site is laid out.

    Every constructor argument is stored unchanged on the instance under the
    same name: the site and search URLs, the crawl flags, and one ``*_tag``
    selector per listing field. A selector for the building type is not part
    of the structure yet.
    """

    def __init__(self, filename, url, search_url, result_listing, result_url, use_patterns,
                 absolute_url, title_tag, url_tag, address_tag, no_of_bedrooms_tag,
                 no_of_bathrooms_tag, no_of_toilets_tag, agent_contact_tag, price_tag,
                 description_tag, get_parent):
        # Where the site lives and where crawling starts.
        self.filename, self.url, self.search_url = filename, url, search_url

        # How result pages are read and how their links are interpreted.
        self.result_listing, self.result_url = result_listing, result_url
        self.use_patterns, self.absolute_url = use_patterns, absolute_url
        self.get_parent = get_parent

        # One selector per listing field.
        self.title_tag, self.url_tag, self.address_tag = title_tag, url_tag, address_tag
        self.no_of_bedrooms_tag = no_of_bedrooms_tag
        self.no_of_bathrooms_tag = no_of_bathrooms_tag
        self.no_of_toilets_tag = no_of_toilets_tag
        self.agent_contact_tag, self.price_tag = agent_contact_tag, price_tag
        self.description_tag = description_tag

In [5]:
class Crawler:
    """Crawl one property-listing site and store every listing it finds.

    The site is described by a ``Website`` object (URLs plus one selector per
    field). ``crawl`` starts at the site's search page, follows the result
    links it discovers, and inserts each listing into the ``house_rent``
    table through the ``Database`` connection opened in ``__init__``.

    Attributes:
        site: the ``Website`` structure being crawled.
        visited: links that have been downloaded and processed.
        content: ``Content`` objects parsed from the first search page only.
        database: the ``Database`` wrapper used for inserts.
    """

    # Seconds to wait for a response before a page is treated as unreachable.
    REQUEST_TIMEOUT = 30

    # Values are bound by the database driver instead of being pasted into the
    # SQL text, so quotes in scraped text cannot break or alter the statement.
    INSERT_LISTING_SQL = (
        "INSERT INTO house_rent (title, url, address, no_of_bedrooms, no_of_bathrooms, "
        "no_of_toilets, agent_contact, price, description) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self, site):
        self.site = site
        self.visited = set()
        self.content = []

        self.database = Database()
        self.database.setup_database_connection()

    # ------------------------------------------------------------------ fetch

    def get_page(self, url) -> "Optional[BeautifulSoup]":
        """Download ``url`` and parse it with html5lib.

        Returns None when the request fails, times out, or the server answers
        with an HTTP error status.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(response.content, 'html5lib')

    def _absolute(self, link) -> str:
        """Prefix the site root to ``link`` unless the link is already absolute."""
        if self.site.absolute_url or link.startswith(("http://", "https://")):
            return link
        return f"{self.site.url}{link}"

    # ---------------------------------------------------------------- extract

    def safe_get(self, page_obj, selector, pattern, parent=False,
                 inner_page_link=None) -> "Union[bs4.ResultSet, bs4.Tag, str]":
        """Look up ``selector`` inside ``page_obj``.

        Args:
            page_obj: parsed page or element to search in (may be None).
            selector: options dict with the keys ``selector`` and
                ``access_inner_page``.
            pattern: when true, ``selector['selector']`` is a regular
                expression matched against the ``href`` of ``<a>`` tags;
                otherwise it is a CSS selector.
            parent: with a CSS selector, return the parent element of the
                first match instead of the list of matches.
            inner_page_link: href of the page to download and search when
                ``selector['access_inner_page']`` is true.

        Returns:
            The matches (or the parent element), or ``""`` when there is
            nothing to return: no match, no page, or an inner-page selector
            without a link.
        """
        if selector['access_inner_page']:
            if inner_page_link is None:
                return ""
            page_obj = self.get_page(self._absolute(inner_page_link))

        if page_obj is None:
            return ""

        if pattern:
            matches = page_obj.find_all('a', href=re.compile(selector['selector']))
        else:
            matches = page_obj.select(selector['selector'])
            if parent and matches:
                return matches[0].parent
        return matches if matches else ""

    @staticmethod
    def _first_href(result) -> "Optional[str]":
        """Return the ``href`` of the first element in ``result``, or None if there is none."""
        try:
            return result[0].attrs['href']
        except (AttributeError, IndexError, KeyError, TypeError):
            return None

    def _extract_links(self, page, selector) -> List[str]:
        """Collect the hrefs of all links on ``page`` whose href matches ``selector``."""
        return [tag.attrs['href']
                for tag in self.safe_get(page, selector, True)
                if 'href' in tag.attrs]

    def get_links(self, page_url, selector) -> List[str]:
        """Download ``page_url`` and return the links needed to reach further pages."""
        page = self.get_page(page_url)
        if page is None:
            return []
        return self._extract_links(page, selector)

    def _field(self, house, selector, detail_link, parent=False):
        """Read one field of a listing; ``detail_link`` is used only by inner-page selectors."""
        return self.safe_get(house, selector, self.site.use_patterns, parent,
                             inner_page_link=detail_link)

    def _extract_listings(self, page) -> list:
        """Build one ``Content`` per listing element found on an already parsed page."""
        site = self.site
        listings = []
        for house in self.safe_get(page, site.result_listing, site.use_patterns):
            link_result = self.safe_get(house, site.url_tag, site.use_patterns)
            # Selectors flagged with access_inner_page are evaluated on the
            # listing's own page, which costs one extra request per such field.
            detail_link = self._first_href(link_result)

            listings.append(Content(
                self._field(house, site.title_tag, detail_link),
                link_result,
                self._field(house, site.address_tag, detail_link),
                self._field(house, site.no_of_bedrooms_tag, detail_link, site.get_parent),
                self._field(house, site.no_of_bathrooms_tag, detail_link, site.get_parent),
                self._field(house, site.no_of_toilets_tag, detail_link, site.get_parent),
                self._field(house, site.agent_contact_tag, detail_link),
                self._field(house, site.price_tag, detail_link),
                self._field(house, site.description_tag, detail_link),
            ))
        return listings

    def parse(self, url) -> "Optional[list]":
        """Download ``url`` and return its listings, or None when the page cannot be read."""
        page = self.get_page(url)
        if page is None:
            return None
        return self._extract_listings(page)

    # ------------------------------------------------------------------ store

    def clean_text(self, text) -> str:
        """Replace every comma, newline and plus sign in ``text`` with a space."""
        return re.sub("[,\n+]", ' ', text)

    def _text_of(self, result, default="", is_tag=False):
        """Return the cleaned text of a ``safe_get`` result, or ``default`` when it has none.

        ``is_tag`` says that ``result`` is a single element (the parent
        returned in get_parent mode) rather than a list of matches.
        """
        try:
            node = result if is_tag else result[0]
            return self.clean_text(node.get_text())
        except (AttributeError, IndexError, KeyError, TypeError):
            return default

    def _store_row(self, row):
        """Insert one listing row; return None on success or the driver error on failure."""
        connection = self.database.get_connection()
        if connection is None:
            raise RuntimeError("Database connection is not available; listings cannot be saved")
        try:
            with connection.cursor() as cursor:
                cursor.execute(self.INSERT_LISTING_SQL, row)
        except pg.Error as error:
            connection.rollback()
            return error
        connection.commit()
        return None

    def save_content_to_file(self, contents) -> None:
        """Insert every parsed listing into the ``house_rent`` table.

        Despite its name this method writes to the database, not to a file.
        Missing fields fall back to ``""`` (``"0"`` for the address and the
        room counts). The first character of the price text is dropped.
        Rows the database rejects are skipped and summarised in one message.

        Raises:
            RuntimeError: if the database connection was never opened.
        """
        from_parent = self.site.get_parent
        failures = 0
        last_error = None

        for info in contents:
            href = self._first_href(info.url)
            price_text = self._text_of(info.price, default=None)
            row = (
                self._text_of(info.title),
                self._absolute(href) if href is not None else "",
                self._text_of(info.address, default="0"),
                self._text_of(info.no_of_bedrooms, default="0", is_tag=from_parent),
                self._text_of(info.no_of_bathrooms, default="0", is_tag=from_parent),
                self._text_of(info.no_of_toilets, default="0", is_tag=from_parent),
                self._text_of(info.agent_contact),
                "" if price_text is None else price_text.strip()[1:],
                self._text_of(info.description),
            )
            error = self._store_row(row)
            if error is not None:
                failures += 1
                last_error = error

        if failures:
            print(f"{failures} of {len(contents)} listings could not be saved "
                  f"(last error: {last_error})")

    # ------------------------------------------------------------------ crawl

    def crawl(self) -> None:
        """Crawl the site from its search page, saving the listings of every page reached.

        Each page is downloaded once and used both for its listings and for
        the links to further pages. A page that cannot be read is reported
        and not marked as visited.
        """
        print("Crawler started")
        site = self.site
        links_to_visit = []

        first_page = self.get_page(site.search_url)
        if first_page is None:
            print("Problem reading from \t{}".format(site.search_url))
        else:
            content = self._extract_listings(first_page)
            if content:
                self.content.extend(content)
                self.save_content_to_file(content)
            links_to_visit.extend(self._extract_links(first_page, site.result_url))
        self.visited.add(site.search_url)

        # The list grows while it is iterated: links found on each page are
        # appended and picked up by this same loop.
        for link in links_to_visit:
            if link in self.visited:
                continue

            absolute_link = self._absolute(link)
            page = self.get_page(absolute_link)
            if page is None:
                print("Problem reading from \t{}".format(link))
                continue

            self.visited.add(link)
            content = self._extract_listings(page)
            links_to_visit.extend(self._extract_links(page, site.result_url))

            self.save_content_to_file(content)
            print("Done parsing {}".format(absolute_link))


In [6]:
def create_options_dict(selector, access_inner_page=False) -> dict:
    """Bundle a selector with the flag telling whether it is looked up on an inner page."""
    options = dict(selector=selector)
    options['access_inner_page'] = access_inner_page
    return options

# Site structure for nigeriapropertycentre.com; the keys match the
# parameters of Website.__init__.
nigeria_property = {
    "filename": "nigeria_property.csv",
    'url': "https://nigeriapropertycentre.com",
    'search_url': "https://nigeriapropertycentre.com/for-rent",
    "result_listing": create_options_dict('div[itemtype*=ListItem]'),
    # Regular expression matched against link hrefs; a raw string keeps the
    # backslash escape intact without relying on Python's invalid-escape fallback.
    "result_url": create_options_dict(r"/for-rent\?.*"),
    "use_patterns": False,
    "absolute_url": False,

    "title_tag": create_options_dict("div[class*=wp-block-title] a"),
    "url_tag": create_options_dict("div[class*=wp-block-title] a"),
    "address_tag": create_options_dict("address[class=voffset-bottom-10]"),
    # The room-count selectors match icon elements; with get_parent=True the
    # crawler reads the text of the icon's parent element instead.
    "no_of_bedrooms_tag": create_options_dict(".fa-bed"),
    "no_of_bathrooms_tag": create_options_dict(".fa-bath"),
    "no_of_toilets_tag": create_options_dict(".fa-toilet"),
    "agent_contact_tag": create_options_dict(".marketed-by"),
    "price_tag": create_options_dict(".pull-sm-left"),
    "description_tag": create_options_dict(".description"),
    # "type_tag": create_options_dict(".table-bordered tr td", True),

    "get_parent": True,
}

# Site structure for propertypro.ng; the keys match the parameters of
# Website.__init__.
property_pro = {
    "filename": "propertypro.csv",
    "url": "https://www.propertypro.ng",
    "search_url": "https://www.propertypro.ng/property-for-rent",
    "result_listing": create_options_dict(".listings-property"),
    # Regular expression matched against link hrefs; a raw string keeps the
    # backslash escapes intact without relying on Python's invalid-escape fallback.
    "result_url": create_options_dict(r"/.+\?.+\d"),
    "use_patterns": False,
    "absolute_url": False,

    "title_tag": create_options_dict(".listings-property-title"),
    "url_tag": create_options_dict(".single-room-text > a"),
    "address_tag": create_options_dict(".single-room-text > h4"),
    "no_of_bedrooms_tag": create_options_dict(".fur-areea span:nth-child(1)"),
    "no_of_bathrooms_tag": create_options_dict(".fur-areea span:nth-child(2)"),
    "no_of_toilets_tag": create_options_dict(".fur-areea span:nth-child(3)"),
    "agent_contact_tag": create_options_dict(".phone-icon"),
    "price_tag": create_options_dict(".n50 h3"),
    # Flagged as an inner-page selector (access_inner_page=True).
    "description_tag": create_options_dict(".description-text", True),
    # "type_tag": None,

    "get_parent": False,
}

In [None]:
# Site configurations handed to start_crawler, in list order.
websites = [nigeria_property, property_pro]
def start_crawler(sites) -> "List[Crawler]":
    """Crawl every site configuration in ``sites`` and return the crawlers that ran.

    Args:
        sites: iterable of dicts whose keys match the parameters of
            ``Website.__init__``.

    Returns:
        One ``Crawler`` per configuration, in input order, each returned
        after its ``crawl()`` call has finished.
    """
    crawlers = []
    # Iterate the argument, not the module-level ``websites`` list, so the
    # caller decides which sites are crawled.
    for site in sites:
        site_structure = Website(**site)
        site_crawler = Crawler(site_structure)
        site_crawler.crawl()
        crawlers.append(site_crawler)
        print("\n\nDone with a website\n\n\n")

    return crawlers

# Run the crawler for every configured site; this performs the HTTP requests
# and the database inserts.
crawlers = start_crawler(websites)

# Snapshot the crawler objects to disk. Relies on `pickle` being imported in
# the notebook's import cell.
# NOTE(review): each Crawler holds a Database wrapping a live psycopg2
# connection, which presumably cannot be pickled - confirm this dump succeeds.
with open('crawler.pkl', 'wb') as file:
    pickle.dump(crawlers, file)

Crawler started
Done parsing https://nigeriapropertycentre.com/for-rent?page=2
Done parsing https://nigeriapropertycentre.com/for-rent?page=3
Done parsing https://nigeriapropertycentre.com/for-rent?page=4
Done parsing https://nigeriapropertycentre.com/for-rent?page=5
Done parsing https://nigeriapropertycentre.com/for-rent?page=6
Done parsing https://nigeriapropertycentre.com/for-rent?page=7
Done parsing https://nigeriapropertycentre.com/for-rent?page=8
Done parsing https://nigeriapropertycentre.com/for-rent?page=997
Done parsing https://nigeriapropertycentre.com/for-rent?page=998
Done parsing https://nigeriapropertycentre.com/for-rent?page=9
Done parsing https://nigeriapropertycentre.com/for-rent?page=10
Done parsing https://nigeriapropertycentre.com/for-rent?page=11
Done parsing https://nigeriapropertycentre.com/for-rent?page=996
Done parsing https://nigeriapropertycentre.com/for-rent?page=990
Done parsing https://nigeriapropertycentre.com/for-rent?page=991
Done parsing https://nigeri

In [7]:
def save_to_db(Title, URL, Address, No_of_Bedrooms, No_of_Bathrooms,
               No_of_Toilets, Agent_Contact, Price, database=None):
    """Insert one listing into the ``house_rent`` table.

    All values are bound as query parameters, so quotes in any field
    (including the URL and the room counts) are handled by the driver.

    Args:
        Title, URL, Address, No_of_Bedrooms, No_of_Bathrooms, No_of_Toilets,
        Agent_Contact, Price: the column values, in that order.
        database: object exposing ``get_connection()`` (for example a
            ``Database``). When omitted, the module-level ``db`` is used,
            which the caller must have created beforehand.
    """
    if database is None:
        # Legacy behaviour: fall back to a global named ``db``.
        database = db

    sql = ("INSERT INTO house_rent (title, url, address, no_of_bedrooms, no_of_bathrooms, "
           "no_of_toilets, agent_contact, price) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    row = (Title, URL, Address, No_of_Bedrooms, No_of_Bathrooms,
           No_of_Toilets, Agent_Contact, Price)

    connection = database.get_connection()
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, row)
    except pg.Error as error:
        connection.rollback()
        print(f"Could not save listing {URL!r}: {error}")
    else:
        connection.commit()
    

In [25]:
# Search page used for the selector experiments in the following cells.
url = "https://nigeriapropertycentre.com/for-rent"

In [26]:
# Name the parser explicitly (the same one Crawler.get_page uses) so selectors
# tried here see the same tree as the crawler, and bound the request time.
bs = BeautifulSoup(requests.get(url, timeout=30).content, 'html5lib')

In [31]:
# Show the first element matched by the selector used as nigeria_property's result_listing.
bs.select("div[itemtype*=ListItem]")[0]

<div class="row property-list highlight-property" itemprop="itemListElement" itemscope="" itemtype="https://schema.org/ListItem">
<meta content="1" itemprop="position"/>
<link href="https://schema.org/RealEstateListing" itemprop="additionalType"/>
<div class="col-md-12">
<div class="wp-block property list">
<div class="ribbon base-alt hidden-xs"><span>Premium Plus Listing</span></div>
<div class="wp-block-title hidden-xs">
<a hidefocus="true" href="/for-rent/event-centre-venue/conference-meeting-training-rooms/lagos/ikeja/ogba/562133-boat-house-training-and-event-center" itemprop="url"><h3 itemprop="name">Boat House Training And Event Center</h3></a>
</div>
<div class="wp-block-body">
<div class="wp-block-img text-center">
<a hidefocus="true" href="/for-rent/event-centre-venue/conference-meeting-training-rooms/lagos/ikeja/ogba/562133-boat-house-training-and-event-center">
<div class="wp-block-img-container">
<div class="wp-block-date-over property-visuals-mobile">
<div class="inline">


In [46]:
# Name the parser explicitly (the same one Crawler.get_page uses) so selectors
# tried here see the same tree as the crawler, and bound the request time.
bs = BeautifulSoup(requests.get("https://nigeriapropertycentre.com/for-rent/commercial/offices/abuja/wuse/zone-5/638787-furnished-and-serviced-office", timeout=30).content, 'html5lib')

In [67]:
# Fifth matched table cell; in the recorded output it is the "Type:" entry.
# NOTE(review): the index 4 is positional - confirm it is stable across listings.
bs.select('.table-bordered tr td')[4]

<td><strong>Type:</strong> Office Space</td>