In [6]:
# Imports & Config
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from pathlib import Path

# Scraping configuration
BATCH_SIZE = 2_500      # listings requested per scrape_batch() call
TOTAL_ELAN = 10_000     # overall number of listings to collect
SLEEP_SCROLL = 1.5      # seconds to sleep after each scroll

# Assuming the notebook lives in scraping/, the parent of the working
# directory is the project root (scraping -> bina_az_ml).
BASE_DIR = Path.cwd().parents[0]
RAW_DIR = BASE_DIR.joinpath("data", "raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)  # create data/raw on first run

In [7]:
# Selenium driver setup: headless Chrome with a fixed 1920x1080 window.
options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the chromedriver path for Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [8]:
# Open Bina.az Website
# Loads the listings page that the scraping cells below parse and scroll.
driver.get("https://bina.az/kiraye/menziller")
time.sleep(3)  # fixed pause: wait for the page's JS to render

In [9]:
# Batch Scraping Function
def scrape_batch(driver, batch_size, seen_urls, max_idle_scrolls=5):
    """Collect up to ``batch_size`` per-month listings from the loaded page.

    The page currently open in ``driver`` is parsed repeatedly; after every
    pass the window is scrolled to the bottom so further cards can load.

    Args:
        driver: Selenium WebDriver with the listings page already open.
        batch_size: Maximum number of records to return.
        seen_urls: Set of listing URLs handled so far. It is mutated in
            place: every URL examined here is added, including listings
            that are then skipped because they are not priced per month.
        max_idle_scrolls: Number of consecutive passes that may find no
            unseen URL before the page is treated as exhausted. Without
            this limit the loop would spin forever once the site stops
            delivering new cards.

    Returns:
        A list of dicts with the keys ``location``, ``rooms``, ``area_m2``,
        ``price`` and ``url``. It can hold fewer than ``batch_size`` items
        (or none) when the listings section is missing or no new cards
        show up.
    """
    batch_data = []
    idle_scrolls = 0

    while len(batch_data) < batch_size:
        # NOTE: the whole page source is re-parsed on every pass, so cards
        # handled earlier are walked again and skipped through seen_urls.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        section = soup.find("section", id="search-page-regular-items")
        if not section:
            break

        seen_before = len(seen_urls)

        cards = section.find_all("div", attrs={"data-cy": "item-card"})
        for item in cards:
            url_tag = item.find("a", attrs={"aria-label": True})
            if not url_tag:
                continue
            href = url_tag.get("href")
            if not href:
                # An anchor without a target cannot identify a listing.
                continue
            url = "https://bina.az" + href
            if url in seen_urls:
                continue
            seen_urls.add(url)

            price_tag = item.find("span", attrs={"data-cy": "item-card-price-full"})
            period_tag = item.find("span", attrs={"data-cy": "item-card-price-container"})
            # Keep only cards whose price container mentions "/ay".
            if not price_tag or not period_tag or "/ay" not in period_tag.text:
                continue

            price_digits = re.sub(r"\D", "", price_tag.text)
            if not price_digits:
                # No digits in the price text; int("") would raise ValueError.
                continue
            price = int(price_digits)
            text = url_tag["aria-label"]

            # Non-ASCII characters are written as \u escapes (U+0131 dotless i,
            # U+00B2 superscript two) so the patterns do not depend on how
            # this file is encoded or decoded.
            rooms_match = re.search(r"(\d+)\s*otaql\u0131", text)
            area_match = re.search(r"(\d+)\s*m\u00b2", text)
            rooms = int(rooms_match.group(1)) if rooms_match else None
            area = int(area_match.group(1)) if area_match else None

            img = item.find("img", alt=True)
            location = img["alt"].strip() if img else None

            batch_data.append({
                "location": location,
                "rooms": rooms,
                "area_m2": area,
                "price": price,
                "url": url
            })

            if len(batch_data) >= batch_size:
                break

        # Progress is measured in newly seen URLs rather than appended
        # records: a pass may contain only filtered-out cards while the page
        # is still growing.
        if len(seen_urls) == seen_before:
            idle_scrolls += 1
            if idle_scrolls >= max_idle_scrolls:
                break
        else:
            idle_scrolls = 0

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SLEEP_SCROLL)

    return batch_data


In [10]:
# Main Scraping Loop
# Calls scrape_batch() until TOTAL_ELAN listings are collected or a batch
# comes back empty. `seen` is shared across batches so a URL is never
# recorded twice.
all_data = []
seen = set()
batch_num = 1

while len(all_data) < TOTAL_ELAN:
    # Request only what is still missing, so the total cannot overshoot
    # TOTAL_ELAN when BATCH_SIZE does not divide it evenly.
    remaining = TOTAL_ELAN - len(all_data)

    print(f"üîπ Starting batch {batch_num}...")
    batch = scrape_batch(driver, min(BATCH_SIZE, remaining), seen)

    if not batch:
        print("‚ùå No more new listings.")
        break

    all_data.extend(batch)
    print(f"‚úÖ Batch {batch_num} done, collected {len(batch)} listings.")

    batch_num += 1

üîπ Starting batch 1...
‚úÖ Batch 1 done, collected 2500 listings.
üîπ Starting batch 2...
‚úÖ Batch 2 done, collected 2500 listings.
üîπ Starting batch 3...
‚úÖ Batch 3 done, collected 2500 listings.
üîπ Starting batch 4...


KeyboardInterrupt: 

In [11]:
# Number of listing records accumulated in all_data so far.
len(all_data)


7500

In [12]:
# Close Driver
# Quit the WebDriver session created in the setup cell; `driver` cannot be
# used for further page loads after this call.
driver.quit()

In [13]:
# Save CSV: write every collected record to data/raw as a single file.
final_file = RAW_DIR / "bina_az_raw.csv"
listings_df = pd.DataFrame(all_data)
listings_df.to_csv(final_file, index=False)  # the row index is not written
collected_total = len(all_data)
print(f"‚úÖ CSV hazƒ±rdƒ±r! Toplanan elan sayƒ±: {collected_total}")

‚úÖ CSV hazƒ±rdƒ±r! Toplanan elan sayƒ±: 7500
