In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import threading
import json
import csv

# Listing URL template; "{}" is replaced with the page number being requested.
base_url = "https://nethouseprices.com/house-prices/London/London?page={}"
# Shared list of scraped sale records (one dict per table row). Worker threads
# add to it while holding `lock`.
data =[]


# Number of worker threads. It is also the page stride: each worker advances
# by this many pages per iteration. Adjust as needed.
max_threads = 20
# Guards additions to `data` and writes to `exit_flag` across worker threads.
lock = threading.Lock()
# Set to True by a worker to tell every worker to stop (e.g. after a non-200
# response or when a page has no listing table).
exit_flag = False
# First page to scrape; worker i starts at page_number + i.
page_number = 1

# Upper bound on the pages to scrape. Running all the way to the site's last
# page caused bugs, so the crawl is capped here.
maxPage = 11000
def scrape_page(pageNum, timeout=30):
    """Scrape pages pageNum, pageNum + max_threads, ... into the shared `data` list.

    This is the body of one worker thread. Each iteration downloads one
    listing page, extracts every ``tr.sold_price_row`` of the listing table
    and adds one record per row to `data`. The loop ends when `pageNum`
    exceeds `maxPage` or when any worker has set `exit_flag`.

    `exit_flag` is set (stopping every worker) when the request fails, when
    the response status is not 200, or when the page has no listing table.

    Args:
        pageNum: First page number this worker requests.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Results are delivered through the module-level `data` list.
    """
    global exit_flag
    while not exit_flag and pageNum <= maxPage:
        print(pageNum)
        url = base_url.format(pageNum)
        try:
            # Without a timeout a stalled connection would block this worker,
            # and therefore the join() waiting on it, indefinitely.
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Treat a failed request like a non-200 response instead of
            # letting the thread die with an uncaught traceback.
            print("Request for page {} failed: {}".format(pageNum, exc))
            with lock:
                exit_flag = True
            break
        if response.status_code != 200:
            with lock:
                exit_flag = True
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        housing_table = soup.find('table', class_='striped module-full-width-table listing-table')
        if not housing_table:
            with lock:
                exit_flag = True
            break

        # html.parser does not synthesise a <tbody>; search the table itself
        # when the markup has none.
        table_body = housing_table.find('tbody')
        if table_body is None:
            table_body = housing_table

        page_rows = []
        for row in table_body.find_all('tr', class_='sold_price_row'):
            try:
                cells = row.find_all('td')
                page_rows.append({
                    'Property Info': row.find('div', class_='street-details-row').get_text(),
                    'Details': cells[0].strong.a.text,
                    'Sale Price': cells[1].strong.text,
                    'Sale Date': cells[2].text,
                })
            except (AttributeError, IndexError):
                # The row lacks an expected cell or tag; skip it rather than
                # abort the whole worker.
                continue

        # One lock acquisition per page instead of one per row.
        with lock:
            data.extend(page_rows)

        # pageNum is local to this worker, so no lock is needed to advance it.
        pageNum += max_threads

# Build one worker per starting offset: worker k begins at page_number + k
# and scrape_page then strides forward by max_threads pages.
threads = [
    threading.Thread(target=scrape_page, args=(page_number + offset,))
    for offset in range(max_threads)
]
for thread in threads:
    thread.start()

# Block until every worker has returned.
for thread in threads:
    thread.join()

# Collect the scraped records into a Pandas DataFrame.
df = pd.DataFrame(data)


In [None]:
# Clean the scraped frame: fold the free-text property info into one nested
# dict column and turn the price strings into integers.
# The guards below keep this cell safe to re-run on an already-cleaned `df`.
PROPERTY_PARTS = ['Property Type', 'Contract Type', 'Build Type']
MIN_SALE_PRICE = 50000
MAX_SALE_PRICE = 50000000

if 'Property Info' in df.columns:
    # n=2 caps the split at three parts, so an extra comma cannot yield more
    # columns than there are names; reindex pads to three columns when every
    # row has fewer parts.
    property_parts = (
        df['Property Info']
        .str.split(',', n=2, expand=True)
        .reindex(columns=range(len(PROPERTY_PARTS)))
    )
    property_parts.columns = PROPERTY_PARTS
    # Trim the whitespace left around each part by the split; missing parts
    # (None/NaN) are passed through unchanged.
    for part_name in PROPERTY_PARTS:
        property_parts[part_name] = property_parts[part_name].map(
            lambda part: part.strip() if isinstance(part, str) else part
        )
    # One dict per row, keyed by the three part names.
    df = (
        df.assign(**{'Property Information': property_parts.to_dict(orient='records')})
        .drop(columns=['Property Info'])
    )

# Strip the currency symbol and thousands separators, then convert to int.
# astype(str) lets the cell run again after the column is already numeric;
# regex=False makes the replacements literal on every pandas version.
sale_price = (
    df['Sale Price']
    .astype(str)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('int')
)
df = df.assign(**{'Sale Price': sale_price})

# Keep only sales priced from 50,000 to 50,000,000 inclusive.
in_range = (df['Sale Price'] >= MIN_SALE_PRICE) & (df['Sale Price'] <= MAX_SALE_PRICE)
dfClean = df[in_range].copy()

In [None]:
# Write the cleaned table to disk as both CSV and JSON; remove whichever
# format you do not need.
dfClean.to_csv(path_or_buf='housingDataCleaned.csv', encoding='utf-8')
dfClean.to_json(path_or_buf='housingDataCleaned.json', force_ascii=False, orient='records')