In [None]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
def scrape_pages(start_page, end_page):
    base_url = 'https://www.buyrentkenya.com/houses-for-sale'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    properties = []

    for page_num in range(start_page, end_page + 1):
        url = f'{base_url}?page={page_num}'
        print(f"Scraping page {page_num}: {url}")
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        listings = soup.find_all('div', class_='listing-card')

        for listing in listings:

            # --- Extract basic info from listing card ---
            title_tag = listing.find('h2')
            title = title_tag.get_text(strip=True) if title_tag else 'No title'

            price_tag = listing.find('a', class_='pointer-events-none z-10 no-underline')
            price = price_tag.get_text(strip=True) if price_tag else 'No price'

            location_tag = listing.find('p', class_='w-full truncate font-normal capitalize')
            location = location_tag.get_text(strip=True) if location_tag else 'No location'

            
            # --- extract each property link ---
            property_tag = listing.find('a', href=True)
            property_url = urljoin(base_url, property_tag['href']) if property_tag else None

            if not property_url:
                continue


            # Swiper slides extraction (bedrooms, bathrooms, size)
            swiper_div = listing.find('div', class_='scrollable-list')

            bedrooms = bathrooms = size = 'N/A'

            if swiper_div:
                slides = swiper_div.find_all('div', class_='swiper-slide')
                for slide in slides:
                    text = slide.get_text(strip=True)
                    if 'Bedroom' in text:
                        bedrooms = text
                    elif 'Bathroom' in text:
                        bathrooms = text
                    elif 'mÂ²' in text or 'sq' in text.lower():
                        size = text

    # ===================================================
            # --- Visit the property detail page ---
            detail_response = requests.get(property_url, headers=headers)
            if detail_response.status_code != 200:
                print(f"Failed to load property page: {property_url}")
                continue


            detail_soup = BeautifulSoup(detail_response.content, "html.parser")

            created_at = "N/A"

            created_tag = detail_soup.find(
                string=lambda x: x and "Created At:" in x
            )

            if created_tag:
                created_at = created_tag.strip().replace("Created At:", "").strip()



            # --- Extract utilities and nearby facilities ---
            utilities =  []
            nearby = []

            sections = detail_soup.find_all("div", class_="px-3 py-3 even:bg-gray-50")
            for section in sections:
                title_span = section.find("span", class_="font-semibold")
                if not title_span:
                    continue

                section_name = title_span.get_text(strip=True).lower()
                items_div = section.find("div", class_="flex flex-wrap gap-3")
                if not items_div:
                    continue

                items = [span.get_text(strip=True) for span in items_div.find_all("span")]
                if "internal features" in section_name or "external features" in section_name:
                    utilities.extend(items)
                elif "nearby" in section_name:
                    nearby.extend(items)

    # ===================================================
            # --- Append property data ---
            properties.append({
                'Title': title,
                'Price': price,
                'Location': location,
                'Bedrooms': bedrooms,
                'Bathrooms': bathrooms,
                # 'Size': size_swiper,
                'Amenities': utilities,
                'Surroundings': nearby,
                'Created At': created_at
            })

            # --- Polite delay ---
            time.sleep(1)

        # Optional: stop once 500+ listings collected
        if len(properties) >= 500:
            print("Reached 500+ listings, stopping scrape.")
            break

    # Convert to DataFrame
    df = pd.DataFrame(properties)
    return df


In [16]:
df_all_pages = scrape_pages(start_page=1, end_page=2)

Scraping page 1: https://www.buyrentkenya.com/houses-for-sale?page=1
Failed to retrieve the page. Status code: 404
Scraping page 2: https://www.buyrentkenya.com/houses-for-sale?page=2


In [17]:
df_all_pages.head()

Unnamed: 0,Title,Price,Location,Bedrooms,Bathrooms,Amenities,Surroundings,Created At
0,5 Bed Townhouse with En Suite in Nyali Area,"KSh 29,500,000","Nyali Area, Nyali",5 Bedrooms,5 Bathrooms,"[Aircon, Alarm, Backup Generator, En Suite, Fi...","[Bus Stop, Golf Course, Hospital, Scenic View,...",09 February 2026
1,6 Bed Townhouse with En Suite in Lavington,"KSh 160,000,000",Lavington,6 Bedrooms,7 Bathrooms,"[Alarm, Backup Generator, En Suite, Fibre Inte...","[Bus Stop, Shopping Centre, Golf Course, Hospi...",06 February 2026
2,4 Bed House with En Suite in Runda,"KSh 155,040,000","Runda, Westlands",4 Bedrooms,5 Bathrooms,"[Alarm, Backup Generator, En Suite, Walk In Cl...","[Bus Stop, Shopping Centre, Hospital, Scenic V...",21 October 2025
3,5 Bed House with En Suite in Westlands Area,"KSh 98,000,000","Westlands Area, Westlands",5 Bedrooms,5 Bathrooms,"[Alarm, Backup Generator, En Suite, Fibre Inte...","[Bus Stop, Shopping Centre, Hospital, School]",17 February 2026
4,6 Bed Villa with En Suite at Loiyangalani Road,"KSh 85,000,000",Lavington,6 Bedrooms,7 Bathrooms,"[Aircon, Alarm, Backup Generator, En Suite, Fi...","[Bus Stop, Hospital, Scenic View, School, Shop...",17 February 2026


In [22]:
from sqlalchemy import create_engine
import pandas as pd
import seaborn as sns
import datetime
import os
from dotenv import load_dotenv
load_dotenv()

True

In [23]:
# get db credentials and load to db
database =os.getenv('database')
user = os.getenv('user')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')

# Format: postgresql://username:password@host:port/database

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")

In [24]:
query = ''' select * from houses."buy_rent"
'''

df_all_pages.to_sql("buy_rent", engine, schema="houses", if_exists="replace", index=False)

DETAIL:  The database was created using collation version 2.41, but the operating system provides version 2.42.
HINT:  Rebuild all objects in this database that use the default collation and run ALTER DATABASE postgres REFRESH COLLATION VERSION, or build PostgreSQL with the right library version.


ProgrammingError: (psycopg2.errors.InvalidSchemaName) schema "houses" does not exist
LINE 2: CREATE TABLE houses.buy_rent (
                     ^

[SQL: 
CREATE TABLE houses.buy_rent (
	"Title" TEXT, 
	"Price" TEXT, 
	"Location" TEXT, 
	"Bedrooms" TEXT, 
	"Bathrooms" TEXT, 
	"Size" TEXT
)

]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# Load data from PostgreSQL into a DataFrame
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,Title,Price,Location,Bedrooms,Bathrooms,Size
0,5 Bed House in Kyuna,"KSh 90,000,000","Kyuna, Westlands",5 Bedrooms,,
1,4 Bed House with En Suite at Ruiru,"KSh 15,000,000",Ruiru,4 Bedrooms,6 Bathrooms,
2,6 Bed House with En Suite in Garden Estate,"KSh 130,000,000","Garden Estate, Roysambu",6 Bedrooms,4 Bathrooms,
3,4 Bed Townhouse with En Suite in South B,"KSh 20,000,000",South B,4 Bedrooms,5 Bathrooms,
4,4 Bed Townhouse with Swimming Pool in Kiambu Road,"KSh 45,000,000",Kiambu Road,4 Bedrooms,5 Bathrooms,


In [None]:
df.isnull().sum()


Title        0
Price        0
Location     0
Bedrooms     0
Bathrooms    0
Size         0
dtype: int64

In [None]:
df.dtypes

Title        object
Price        object
Location     object
Bedrooms     object
Bathrooms    object
Size         object
dtype: object