In [None]:
import os
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Start a Chrome WebDriver with the "--headless" flag (no visible browser window).
# `driver` is the module-level browser handle used by the cells below.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)


In [None]:
# Scraper configuration and shared state for the cells below.
all_data = []  # Collected ad records (re-initialised by the resume cell)
limit = 15000  # Maximum number of ads
batch_size = 1000  # A CSV checkpoint is written whenever the total is a multiple of this
output_file = "../data/raw/bina_rent.csv"  # CSV path, relative to the working directory

def scroll_to_bottom(driver, pause=3, max_scroll=25):
    """Scroll the page to its bottom repeatedly so more ads get loaded.

    After every scroll the function sleeps for ``pause`` seconds and then
    re-reads the document height. It returns as soon as the height stops
    changing, or once ``max_scroll`` scrolls have been performed.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to sleep after each scroll.
        max_scroll: Upper bound on the number of scrolls.

    Returns:
        None.
    """
    read_height = "return document.body.scrollHeight"
    jump_to_end = "window.scrollTo(0, document.body.scrollHeight);"

    previous = driver.execute_script(read_height)
    for _attempt in range(max_scroll):
        driver.execute_script(jump_to_end)
        time.sleep(pause)
        current = driver.execute_script(read_height)
        if current != previous:
            # The page grew: remember the new height and scroll again
            previous = current
            continue
        # Height unchanged after a scroll: nothing more was loaded
        return


In [None]:
# Resume from a previous run: reload the ads already saved in the CSV, if any
if os.path.exists(output_file):
    df_existing = pd.read_csv(output_file)
    # read_csv loads empty cells as NaN, while the scraping loop stores missing
    # fields as None. NaN never compares equal to None (or to another NaN), so
    # without this normalisation an ad with a missing field would not be
    # recognised as already seen and would be collected again after a resume.
    all_data = [
        {column: (None if pd.isna(value) else value) for column, value in record.items()}
        for record in df_existing.to_dict('records')
    ]
else:
    all_data = []

# Keys of the ads collected so far, used to skip duplicates while scraping.
# Numeric fields reloaded as floats (e.g. 2.0) still match the scraper's ints,
# because equal int/float values hash and compare equal.
seen_locations = {
    (record['rooms'], record['area_m2'], record['floor_current'], record['floor_total'], record['location'])
    for record in all_data
}


In [None]:
url = "https://bina.az/kiraye/menziller"
max_stalled_passes = 3  # Stop after this many consecutive passes that add no new ad


def _digits_to_int(text):
    """Parse an int from digits that may be grouped with regular or non-breaking spaces."""
    return int(text.replace('\xa0', '').replace(' ', ''))


def _parse_card(item):
    """Extract one ad record from a listing card element.

    Args:
        item: BeautifulSoup tag of a card (``data-cy="item-card"``).

    Returns:
        A dict with the keys ``rooms``, ``area_m2``, ``floor_current``,
        ``floor_total``, ``location`` and ``price`` (missing values are None),
        or None when the card's price container does not contain "/ay" or the
        card has no link carrying an ``aria-label``.

    Raises:
        ValueError: If the rooms or area text captured by the regex is not a
            valid integer once spaces are removed.
    """
    # Keep only cards whose price container mentions "/ay"
    # (presumably the per-month price marker - confirm against the site)
    price_type = item.find("span", attrs={"data-cy": "item-card-price-container"})
    if not price_type or "/ay" not in price_type.text:
        return None

    a_tag = item.find("a", attrs={"aria-label": True})
    if not a_tag:
        return None
    aria = a_tag["aria-label"]

    # Rooms and area are read from the link's aria-label text
    rooms_match = re.search(r"([\d\s\xa0]+)\s*otaqlı", aria)
    area_match = re.search(r"([\d\s\xa0]+)\s*m²", aria)
    rooms = _digits_to_int(rooms_match.group(1)) if rooms_match else None
    area = _digits_to_int(area_match.group(1)) if area_match else None

    # Floor is written as "<current>/<total>"
    floor = re.search(r"(\d+\/\d+)", aria)
    if floor:
        floor_current, floor_total = map(int, floor.group(1).split("/"))
    else:
        floor_current, floor_total = None, None

    # Location is taken from the alt text of the card's image
    img = item.find("img", alt=True)
    location = img["alt"].strip() if img else None

    # Price: a non-numeric value keeps the ad but leaves the price empty
    price = None
    price_tag = item.find("span", attrs={"data-cy": "item-card-price-full"})
    if price_tag:
        try:
            price = _digits_to_int(price_tag.text.strip())
        except ValueError:
            price = None

    return {
        "rooms": rooms,
        "area_m2": area,
        "floor_current": floor_current,
        "floor_total": floor_total,
        "location": location,
        "price": price
    }


driver.get(url)
time.sleep(3)

# The batch checkpoints below write to output_file, so its directory must exist
# before the first write ("." covers a bare file name with no directory part)
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

stalled_passes = 0
while len(all_data) < limit:
    count_before_pass = len(all_data)

    scroll_to_bottom(driver, pause=2, max_scroll=10)
    # The whole page is re-parsed on every pass; cards seen before are
    # filtered out by the seen_locations check
    soup = BeautifulSoup(driver.page_source, "html.parser")
    sections = soup.find_all("section", id=["search-page-vipped", "search-page-regular-items"])

    for section in sections:
        cards = section.find_all("div", attrs={"data-cy": "item-card"})
        for item in cards:
            try:
                record = _parse_card(item)
                if record is None:
                    continue

                # Unique check
                unique_key = (
                    record["rooms"],
                    record["area_m2"],
                    record["floor_current"],
                    record["floor_total"],
                    record["location"],
                )
                if unique_key in seen_locations:
                    continue
                seen_locations.add(unique_key)

                all_data.append(record)

                # Write to CSV each time a batch_size is collected
                if len(all_data) % batch_size == 0:
                    df = pd.DataFrame(all_data)
                    df.to_csv(output_file, index=False)
                    print(f"+{batch_size} ads added, now {len(all_data)} in total")

            except Exception as e:
                # Best effort: a broken card must not abort the whole scrape
                print("Error:", e)

            # Checked outside the try so an error on the card that reaches the
            # limit cannot skip the stop condition
            if len(all_data) >= limit:
                break
        if len(all_data) >= limit:
            break

    # Without this guard the loop would spin forever once the site stops
    # serving new ads before `limit` is reached
    if len(all_data) == count_before_pass:
        stalled_passes += 1
        if stalled_passes >= max_stalled_passes:
            print(f"No new ads after {stalled_passes} consecutive passes, stopping at {len(all_data)}")
            break
    else:
        stalled_passes = 0


+1000 elan əlavə olundu, indi 6000 toplam
+1000 elan əlavə olundu, indi 7000 toplam
+1000 elan əlavə olundu, indi 8000 toplam
+1000 elan əlavə olundu, indi 9000 toplam
+1000 elan əlavə olundu, indi 10000 toplam
+1000 elan əlavə olundu, indi 11000 toplam
+1000 elan əlavə olundu, indi 12000 toplam
+1000 elan əlavə olundu, indi 13000 toplam
+1000 elan əlavə olundu, indi 14000 toplam
+1000 elan əlavə olundu, indi 15000 toplam


In [None]:
# Close the browser session now that scraping is finished
driver.quit()

df = pd.DataFrame(all_data)
print("Total UNIQUE ads:", len(df))

# Create the directory output_file actually lives in, rather than a separately
# hard-coded path that can drift out of sync with it
# ("." covers a bare file name with no directory part)
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
df.to_csv(output_file, index=False)
print("# CSV saved")


Toplam UNİKAL elan sayı: 15000
CSV yadda saxlanıldı
