-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
195 lines (151 loc) · 7.49 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from fastapi import FastAPI
import requests
from bs4 import BeautifulSoup
import uvicorn
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys # noqa
app = FastAPI()
CHROME_DRIVER_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromedriver") # noqa
def extract_property_details(url):
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
address_element = soup.find(itemprop="streetAddress")
if address_element is None:
raise ValueError("Failed to extract the property address. The website structure might have changed.") # noqa
address = address_element.get_text(strip=True)
# Regular expression to match UK postcodes
postcode_regex = r"[A-Za-z]{1,2}\d[A-Za-z\d]?\s*\d[A-Za-z]{2}"
postcode_match = re.search(postcode_regex, address)
if postcode_match:
postcode = postcode_match.group()
else:
raise ValueError("Failed to extract the postcode from the address. The postcode might not be included in the address.") # noqa
# Find the price using a different approach
price_tag = soup.find("span", text=re.compile("£"))
if price_tag is None:
raise ValueError("Failed to extract the price. The website structure might have changed.") # noqa
price = float(price_tag.get_text(strip=True).replace("£", "").replace(",", "")) # noqa
# Extract the property's street address
street_address = " ".join(address.split()[:-1])
# Find the price qualifier, if available
qualifier_tag = soup.find(attrs={"data-testid": "priceQualifier"})
if qualifier_tag is not None:
price_qualifier = qualifier_tag.get_text(strip=True)
else:
price_qualifier = None
return {
"address": address,
"street_address": street_address,
"postcode": postcode,
"price": price,
"price_qualifier": price_qualifier
}
def get_simd_data(postcode):
url = "https://simd.scot/"
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get(url)
postcode_input = driver.find_element(By.ID, "postcode")
postcode_input.send_keys(postcode)
submit_button = driver.find_element(By.ID, "postcodeButton")
# Wait for the submit button to be clickable and click it
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "postcodeButton"))) # noqa
# Move to the element before clicking it
actions = ActionChains(driver)
actions.move_to_element(submit_button).click().perform()
# Wait for the data to load
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "componenttable"))) # noqa
# Extract data from the table
table = driver.find_element(By.ID, "componenttable")
rows = table.find_elements(By.TAG_NAME, "tr") # noqa
simd_data = []
row_count = len(driver.find_elements(By.XPATH, '//*[@id="componenttable"]/tbody/tr')) # noqa
for i in range(2, row_count + 1):
domain_name = driver.find_element(By.XPATH, f'//*[@id="componenttable"]/tbody/tr[{i}]/td[1]').text.strip() # noqa
rank_text = driver.find_element(By.XPATH, f'//*[@id="componenttable"]/tbody/tr[{i}]/td[2]').text.strip() # noqa
if rank_text:
rank = int(re.sub(r"[^\d]", "", rank_text))
simd_data.append({"domain": domain_name, "rank": rank})
print(f"Extracted: {{'domain': '{domain_name}', 'rank': {rank}}}")
print(f"simd_data: {simd_data}")
return simd_data
def extract_section_data(soup, section_id):
section_data = {}
section = soup.find("a", {"href": f"#{section_id}"})
if section:
parent = section.find_parent("div", {"class": "tab-content"})
section = parent.find("div", {"id": section_id})
if section_id == "housing":
info_pieces = section.find_all("div", {"class": "info-piece"})
for info_piece in info_pieces:
header = info_piece.find("h3").get_text(strip=True)
description = info_piece.find("p").get_text(strip=True)
pie_chart_data = {}
for pie_segment in info_piece.find_all("div", {"class": "chartable"}): # noqa
label = pie_segment["data-label"]
value = float(pie_segment["data-value"])
pie_chart_data[label] = value
section_data[header] = {
"description": description,
"pie_chart_data": pie_chart_data
}
else:
for row in section.find_all("div", {"class": "row"}):
for key, value in zip(row.find_all("div", {"class": "col-md-6"}), # noqa
row.find_all("div", {"class": "col-md-6", "style": "font-weight: bold;"})): # noqa
key_text = key.get_text(strip=True).replace(":", "")
value_text = value.get_text(strip=True)
section_data[key_text] = value_text
return section_data
def get_geographical_data(postcode):
street_check_url = f"https://www.streetcheck.co.uk/postcode/{postcode.lower().replace(' ', '')}" # noqa
response = requests.get(street_check_url)
soup = BeautifulSoup(response.content, "html.parser")
geographical_data = {
"housing": extract_section_data(soup, "housing"),
"summary": extract_section_data(soup, "summary"),
"culture": extract_section_data(soup, "culture"),
"employment": extract_section_data(soup, "employment"),
"nearby": extract_section_data(soup, "nearby"),
"broadband": extract_section_data(soup, "services"),
}
return geographical_data
def get_recent_sale_prices(url, street_address):
# Use the street address to construct a URL for the "Recently sold & under offer" tab on Rightmove # noqa
search_url = f"https://www.rightmove.co.uk/house-prices/detail.html?country=england&locationIdentifier=REGION%5E{street_address}&searchLocation={street_address}" # noqa
response = requests.get(search_url)
soup = BeautifulSoup(response.content, "html.parser")
# Extract the recent sale prices
sold_prices = []
sold_price_tags = soup.find_all("td", {"class": "soldPrice"})
for price_tag in sold_price_tags:
price = float(price_tag.get_text(strip=True).replace("£", "").replace(",", "")) # noqa
sold_prices.append(price)
return sold_prices
@app.get("/property_analysis")
async def property_analysis(url: str):
property_details = extract_property_details(url)
postcode = property_details["postcode"]
street_address = property_details["street_address"]
simd_data = get_simd_data(postcode)
geographical_data = get_geographical_data(postcode)
recent_sale_prices = get_recent_sale_prices(postcode, street_address)
return {
"property_details": property_details,
"simd_data": simd_data,
"geographical_data": geographical_data,
"recent_sale_prices": recent_sale_prices,
}
if __name__ == "__main__":
uvicorn.run(app, host="127.0.0.1", port=8000)