# **Name: Shantanu Bhute**
# **Student ID: 24201796**

# Property Sales Data Collection

## Objective
This notebook scrapes property sales data from the UCD teaching website for the years 2021 to 2024.
It collects details such as sale price, location, year built, and property description.

## Data Source
The data is spread across many HTML pages: each year has its own series of numbered pages (`page01`, `page02`, ...), and the number of pages per year is detected at run time.

Example URL:  
[http://mlg.ucd.ie/modules/python/assignment1/property/2023-page01.html](http://mlg.ucd.ie/modules/python/assignment1/property/2023-page01.html)


In [1]:
# Install necessary libraries (if not already installed)
!pip install requests beautifulsoup4 pandas

# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time




In [2]:

# URL template for a single listing page; {page} is a zero-padded page number.
BASE_URL = "http://mlg.ucd.ie/modules/python/assignment1/property/{year}-page{page}.html"

# Years to scrape, kept as strings because they are substituted into the URL.
YEARS = [str(year) for year in range(2021, 2025)]

# Accumulators shared by the scraping cells below:
#   property_data  - one dict per sale, in the order the sales were scraped
#   unique_records - keys of the sales already stored, used to skip duplicates
property_data = []
unique_records = set()


In [3]:

def get_total_pages(year):
    """Return the highest page number linked from a year's first page.

    Fetches page 01 for ``year`` and scans every ``<a href=...>`` whose
    target contains ``-page`` for a numeric page index.

    Args:
        year: Year to look up; substituted into ``BASE_URL``.

    Returns:
        The largest page number found (1 when the page has no pagination
        links), or ``None`` when the first page cannot be fetched.
    """
    first_page_url = BASE_URL.format(year=year, page="01")  # Start with page 1

    try:
        # A timeout keeps the notebook from hanging on an unresponsive server.
        response = requests.get(first_page_url, timeout=10)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: skip this year.
        print(f" Failed to fetch first page for {year} ({exc})")
        return None

    if response.status_code != 200:
        print(f" Failed to fetch first page for {year}")
        return None  # If the first page request fails, skip this year

    soup = BeautifulSoup(response.text, "html.parser")

    last_page = 1  # Default to 1 in case there's no pagination
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "-page" not in href:
            continue
        # Take the text between the last "-page" and the following "."
        # (for an href ending in "-page07.html" that is "07").
        page_token = href.split("-page")[-1].split(".")[0]
        try:
            last_page = max(last_page, int(page_token))
        except ValueError:
            continue  # Link contains "-page" but no numeric page index

    print(f" Found {last_page} pages for {year}")
    return last_page



In [4]:
import re  # Import regex for text cleaning

def extract_year(text):
    """Return the first 4-digit number found in ``text`` as an int.

    Args:
        text: Raw "Year Built" value scraped from the page. May be ``None``
            or empty when the field is missing.

    Returns:
        The first run of four consecutive digits converted to ``int``, or
        ``None`` when ``text`` is empty/``None`` or contains no such run.
    """
    if not text:
        # The regex functions raise TypeError on None; a missing field simply
        # means the year is unknown.
        return None

    match = re.search(r"\d{4}", text)
    return int(match.group()) if match else None


In [5]:
# Scrape every listing page of every year in YEARS and collect one dict per
# sale in property_data, skipping sales whose key is already in unique_records.
for year in YEARS:
    total_pages = get_total_pages(year)
    if not total_pages:
        continue  # Skip this year if no valid pages found

    for page in range(1, total_pages + 1):
        page_number = f"{page:02d}"  # Formatting page number (01, 02, etc.)
        url = BASE_URL.format(year=year, page=page_number)

        print(f" Scraping: {url}")

        # Fetch page content. The timeout prevents the run from hanging, and a
        # network error only costs this one page rather than the whole scrape.
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print(f" Failed to fetch data from {url} ({exc})")
            continue
        if response.status_code != 200:
            print(f" Failed to fetch data from {url}")
            continue  # Skip to the next page if request fails

        # Parse page content
        soup = BeautifulSoup(response.text, "html.parser")
        content_div = soup.find("div", id="content")
        if not content_div:
            print(f" No content found on page: {url}")
            continue

        # The sales are the <li> items of the ordered list inside the content div.
        sales_list = content_div.find("ol")
        if sales_list is None:
            print(f" No sale entries found on page: {url}")
            continue
        sales = sales_list.find_all("li")
        print(f" Entries found on the page: {len(sales)}")

        skipped_entries = 0  # Entries dropped because expected markup was missing

        # Extract data for each sale
        for sale in sales:
            try:
                # Extract Date Sold
                date_sold = sale.find("span", class_="sold").text.strip()

                # Extract all table rows inside the sale table
                table_rows = sale.find("table", class_="sale").find_all("tr")

                # Every field starts as None so rows absent from the table
                # end up as missing values in the record.
                sale_price, location, year_built, garden, garage, description, first_time_buyer = [None] * 7

                # Each row is a (label cell, value cell) pair; dispatch on the label.
                for row in table_rows:
                    label = row.find("td", class_="info").text.strip()
                    value = row.find("td").find_next("td").text.strip()

                    if "Sale Price" in label:
                        sale_price = value.replace("€", "").replace(",", "").strip()
                    elif "Property Location" in label:
                        location = value
                    elif "Year Built" in label:
                        year_built = value
                    elif "Garden" in label:
                        garden = value
                    elif "Garage" in label:
                        garage = value
                    elif "Description" in label:
                        description = value
                    elif "First Time Buyer" in label:
                        first_time_buyer = value
            except AttributeError:
                # A find() returned None: the entry lacks the expected markup.
                skipped_entries += 1
                continue

            # Create a unique key to prevent duplicates
            unique_key = (date_sold, sale_price, location, year_built, description)
            if unique_key in unique_records:
                continue

            unique_records.add(unique_key)  # Track record
            property_data.append({
                "Date Sold": date_sold,
                "Sale Price (€)": float(sale_price) if sale_price else None,
                "Location": location,
                # Guard the call: year_built stays None when the row is absent.
                "Year Built": extract_year(year_built) if year_built else None,
                "Garden": garden,
                "Garage": garage,
                "Description": description,
                "First Time Buyer": first_time_buyer
            })

        if skipped_entries:
            # Report dropped entries instead of discarding them silently.
            print(f" Skipped {skipped_entries} malformed entries on page: {url}")

        print(f" Page {page} of {year} added. Total unique records so far: {len(property_data)}")

        # Pause for 1 second to avoid overloading the server (polite scraping)
        time.sleep(1)

print(f" Data extraction complete! Total unique records extracted: {len(property_data)}")


✅ Found 15 pages for 2021
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page01.html
 Entries found on the page: 20
 Page 1 of 2021 added. Total unique records so far: 20
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page02.html
 Entries found on the page: 20
 Page 2 of 2021 added. Total unique records so far: 40
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page03.html
 Entries found on the page: 20
 Page 3 of 2021 added. Total unique records so far: 60
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page04.html
 Entries found on the page: 20
 Page 4 of 2021 added. Total unique records so far: 80
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page05.html
 Entries found on the page: 20
 Page 5 of 2021 added. Total unique records so far: 100
 Scraping: http://mlg.ucd.ie/modules/python/assignment1/property/2021-page06.html
 Entries found on the page: 20
 Page 6 of 2021 adde

In [6]:

# Build the table from the scraped records. drop_duplicates() removes any rows
# that are identical in every column, on top of the key-based check applied
# while scraping.
df = pd.DataFrame(property_data).drop_duplicates()

# Quick look at the result before it is written to disk.
print(" DataFrame Shape (after removing duplicates):", df.shape)
print(df.head())

# Persist without the index column so the CSV contains only the scraped fields.
df.to_csv("property_sales.csv", index=False)

print(" Data saved successfully to property_sales.csv")


 DataFrame Shape (after removing duplicates): (1400, 8)
              Date Sold  Sale Price (€)     Location  Year Built Garden  \
0       Sold 2021-01-10        381302.0   Broomhouse      1967.0    Yes   
1       Sold 2021-01-10        325898.0   Broomhouse      1978.0    Yes   
2  Sold 18 January 2021        370354.0     Oak Park      1961.0    Yes   
3       Sold 2021-01-23         92480.0  Beacon Hill      1958.0    Yes   
4       Sold 2021-01-25        312030.0   Brookville      1987.0    Yes   

  Garage                                        Description First Time Buyer  
0    Yes  Type: Detached; Style: 1.5-Storey; 3 Bedrooms;...               No  
1    ???  Type: Detached; Style: 1-Storey; 3 Bedrooms; 1...              Yes  
2     No  Type: Detached; Style: 1-Storey; 3 Bedrooms; 2...               No  
3     No  Type: Bungalow; 1 Bathroom; Style: 1-Storey; 1...              Yes  
4    Yes  Type: Detached; Style: 1-Storey; 3 Bedrooms; 1...               No  
 Data saved success

In [7]:
# Sanity check: report how many sale records are held in property_data.
print(f"Total records extracted: {len(property_data)}")  # Expected to be roughly 1400

Total records extracted: 1400


In [8]:
# Show the record count again, followed by the raw dicts of the first few
# sales, to eyeball the field names and value formats.
print(f"Total records extracted: {len(property_data)}")
print(property_data[:5])  # Print first 5 records


Total records extracted: 1400
[{'Date Sold': 'Sold 2021-01-10', 'Sale Price (€)': 381302.0, 'Location': 'Broomhouse', 'Year Built': 1967, 'Garden': 'Yes', 'Garage': 'Yes', 'Description': 'Type: Detached; Style: 1.5-Storey; 3 Bedrooms; 1 Bathroom', 'First Time Buyer': 'No'}, {'Date Sold': 'Sold 2021-01-10', 'Sale Price (€)': 325898.0, 'Location': 'Broomhouse', 'Year Built': 1978, 'Garden': 'Yes', 'Garage': '???', 'Description': 'Type: Detached; Style: 1-Storey; 3 Bedrooms; 1 Bathroom', 'First Time Buyer': 'Yes'}, {'Date Sold': 'Sold 18 January 2021', 'Sale Price (€)': 370354.0, 'Location': 'Oak Park', 'Year Built': 1961, 'Garden': 'Yes', 'Garage': 'No', 'Description': 'Type: Detached; Style: 1-Storey; 3 Bedrooms; 2 Bathrooms', 'First Time Buyer': 'No'}, {'Date Sold': 'Sold 2021-01-23', 'Sale Price (€)': 92480.0, 'Location': 'Beacon Hill', 'Year Built': 1958, 'Garden': 'Yes', 'Garage': 'No', 'Description': 'Type: Bungalow; 1 Bathroom; Style: 1-Storey; 1 Bedroom', 'First Time Buyer': 'Yes

In [9]:
import os

# Confirm the CSV export is on disk and report how large it is.
file_path = "property_sales.csv"

if not os.path.exists(file_path):
    print(" CSV file does not exist.")
else:
    print(" File exists:", file_path)
    print(" File size:", os.path.getsize(file_path), "bytes")

 File exists: property_sales.csv
 File size: 153940 bytes


In [12]:
# Debug aid: preview the HTML of the most recently parsed page. `soup` is the
# module-level variable left over from the last iteration of the scraping loop.
# The slice is bounded so the cell prints a preview rather than the whole page.
print(soup.prettify()[:1000])  # Print the first 1000 characters of the page

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="noindex" name="robots"/>
  <meta content="Content on this site is posted for teaching purposes only. Original data is from theguardian.com" name="description"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Property Sale Register - Sales 2024
  </title>
  <link href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet" target="_blank"/>
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js">
  </script>
  <script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js">
  </script>
  <link href="style.css" rel="stylesheet" type="text/css"/>
  <style>
   li
    {
      padding-bottom: 0.8em;
    }
    table.sale
    {
      margin-left: 1em;
    }
    span.sold
    {
      font-weight: bold;
    }
  </style>
 </head>
 <body>
  <div cl