In [1]:
import asyncio
import os
import urllib.request
from urllib.parse import urljoin

import aiohttp
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [85]:
# Listing page to scrape
base_url = "https://myrealty.am/en/bnakaran/3-senyakanoc/Bagratuniats+Ave/Shengavit/Yerevan/141101"

# Accumulator for scraped records (this cell only creates it)
apartment_data = []

# Root directory for downloaded images
os.makedirs('images', exist_ok=True)

# Start with the first page
page_number = 1

# Construct the URL for the current page
url = f"{base_url}?page={page_number}"

# Send a GET request; the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)

# Stop here if the page is missing or empty: parsing an error page would only make
# the cells below fail later with confusing None/AttributeError messages
if response.status_code != 200 or not response.text.strip():
    print("Failed", response.status_code)
    raise RuntimeError(f"Could not fetch {url} (HTTP status {response.status_code})")

# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

### Price

In [86]:
# Price: taken from the first <div class="pl-0"> on the page
price_element = soup.find('div', class_='pl-0')
if price_element:
    price_element_text = price_element.get_text(strip=True)
    # Keep what precedes the first "/" and remove the commas before converting to int
    price_text_stripped = price_element_text.partition("/")[0].replace(",", "")
    price = int(price_text_stripped)
    print(price)

150000


### Facilities

In [87]:
# Each facility is an <li> with this exact class string; the name is the text of its <label>
facility_items = soup.find_all('li', class_='col-sm-6 col-lg-4 col-xl-3 mb-1')
facilities = [item.find('label').get_text() for item in facility_items]
print(facilities)

['Heating', 'Internet', 'Hot water', 'Gas', 'Water', 'water 24/7', 'Furniture', 'Equipment', 'Euro windows', 'Iron door', 'Open balcony', 'Basement', 'Sunny', 'View', 'Close to the bus station', 'Garage']


### Location

In [88]:
# The map container carries the coordinates as data attributes
div_tag = soup.find('div', id='yandex_map_item_view')

# Read both coordinates; they are kept as the raw attribute strings
latitude, longitude = div_tag['data-lat'], div_tag['data-lng']

print(latitude, longitude)

40.18333300 44.51666700


### Area, room, floor, storeys

In [89]:
# Container holding the area / room / floor figures
parent_div = soup.find('div', class_='col-12 d-flex justify-content-between justify-content-sm-start item-view-price-params')

# Area is the <span> inside the 'pl-0' child
area = parent_div.find('div', class_='pl-0').find('span').text

# Room and floor/storeys are read from the second and third nested <div>
param_divs = parent_div.find_all('div')
room = param_divs[1].find('span').text
floor_storeys = param_divs[2].find('span').text

# The third figure is written as "<floor>/<storeys>"
floor, storeys = floor_storeys.split('/')

print(f'Area: {area}')
print(f'Room: {room}')
print(f'Floor: {floor}')
print(f'Storeys: {storeys}')

Area: 83 SQ. M.
Room: 3
Floor: 4
Storeys: 5


### ID

In [90]:
# The listing ID is the last whitespace-separated token of the 'item-view-id' div
id_div = soup.find('div', class_='item-view-id')
id_text = id_div.get_text(strip=True)
id_number = id_text.rsplit(maxsplit=1)[-1]

print(f'ID: {id_number}')

ID: 141101


In [91]:
# Remaining listing parameters are <li> items of this list
ul_tag = soup.find('ul', class_='item-view-list-params')

for li_tag in ul_tag.find_all('li'):
    # Each item pairs a <label> (parameter name) with a <span> (its value)
    label = li_tag.find('label').get_text(strip=True)
    value = li_tag.find('span').get_text(strip=True)

    print(label, value, sep=': ')

Price (SQ. M.): 1,807
Bathroom: 1
Building type: Stone
Ceiling height: 2.8 M
condition: Newly repaired


In [94]:


# Extract image URLs
# Find img elements carrying either of the lazy-loading classes
img_elements = soup.find_all('img', class_=['owl-lazy', 'lazy-loaded'])

# Collect the distinct data-src values of those elements
img_urls = set([img['data-src'] for img in img_elements if 'data-src' in img.attrs])

async def download_image(session, url, image_index):
    """Download one image and store it as images/<id_number>/<image_index>.<ext>.

    Args:
        session: open aiohttp.ClientSession used for the request.
        url: URL of the image.
        image_index: number used as the file name (without extension).

    Responses with a status other than 200 are skipped silently.
    """
    async with session.get(url) as response:
        # Check if the request was successful
        if response.status != 200:
            return

        # Read the whole body first so a failed transfer does not leave an empty file behind
        image_bytes = await response.read()

    apartment_images_path = f'images/{id_number}'
    # exist_ok avoids the separate exists() check
    os.makedirs(apartment_images_path, exist_ok=True)

    # Save the image to the local file system
    extension = url.split(".")[-1]
    file_name = os.path.join(apartment_images_path, f"{image_index}.{extension}")
    with open(file_name, 'wb') as f:
        f.write(image_bytes)
    print(f"Downloaded {url}")

async def main():
    """Download every URL in img_urls concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        # Sort so that the index-based file names are the same on every run
        tasks = [download_image(session, url, ind) for ind, url in enumerate(sorted(img_urls))]
        await asyncio.gather(*tasks)
    print("All images have been downloaded.")

# asyncio.get_running_loop() raises RuntimeError when no loop is running (it never
# returns a falsy value), so the check has to be a try/except rather than an if
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No running loop (plain script): run the coroutine to completion
    asyncio.run(main())
else:
    # Already inside a loop (e.g. Jupyter): schedule the coroutine and keep a
    # reference so the task cannot be garbage-collected before it finishes
    image_download_task = asyncio.create_task(main())

Downloaded https://myrealty.am/images/22/3c/223cf4c5e7232cc2afd2216a5987f51e.jpg
Downloaded https://myrealty.am/images/61/62/616285efa9a170ae4c872166d2fc9aa6.jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(1).jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(2).jpg
Downloaded https://myrealty.am/images/e3/09/e309b4d5d09b3ca2eb49027617def2da(2).jpg
Downloaded https://myrealty.am/images/61/62/616285efa9a170ae4c872166d2fc9aa6(1).jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(3).jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193.jpg
All images have been downloaded.


In [22]:
class ApartmentScraper:
    """Scrape one apartment listing page.

    The page is fetched and parsed in the constructor. scrape() then fills in
    the attributes id, price, facilities, latitude, longitude, area, room,
    floor and storeys, and downloads the listing images to images/<id>/.
    """

    def __init__(self, webpage, timeout=30):
        """Fetch and parse the listing page.

        Args:
            webpage: URL of the apartment listing.
            timeout: seconds to wait for the server before giving up.
        """
        # Send a GET request; without a timeout a stalled connection would block forever
        response = requests.get(webpage, timeout=timeout)

        # Report a missing or empty page; scrape() raises later if the parsed
        # page turns out to contain no listing ID
        if response.status_code != 200 or not response.text.strip():
            print("Failed", response.status_code)

        # Parse the HTML content of the page with BeautifulSoup
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def scrape(self):
        """Extract every field of the listing and download its images.

        Raises:
            ValueError: if the listing ID cannot be found on the page.
        """
        if not self.__scrape_id():
            raise ValueError("Listing ID not found on the page; nothing was scraped")

        self.__scrape_price()
        self.__scrape_facilities()
        self.__scrape_location()
        self.__scrape_misc()
        self.__scrape_images()

    def __scrape_id(self) -> bool:
        """Store the listing ID in self.id; return False when the page has none."""
        id_div = self.soup.find('div', class_='item-view-id')
        if id_div is None:
            return False

        # The ID is the last whitespace-separated token of the div's text
        tokens = id_div.get_text(strip=True).split()
        if not tokens:
            return False

        self.id = tokens[-1]
        return True

    def __scrape_facilities(self):
        """Store the facility names (label texts) as a list in self.facilities."""
        facility_items = self.soup.find_all('li', class_='col-sm-6 col-lg-4 col-xl-3 mb-1')
        self.facilities = [li.find('label').get_text() for li in facility_items]

    def __scrape_price(self):
        """Store the price as an int in self.price, or None when the price div is missing."""
        price_element = self.soup.find('div', class_='pl-0')
        if price_element is None:
            # Always define the attribute so readers get None instead of AttributeError
            self.price = None
            return

        # Keep the text before the first "/" and drop the comma separators
        price_text = price_element.get_text(strip=True).split("/")[0].replace(",", "")
        self.price = int(price_text)

    def __scrape_location(self):
        """Store the raw data-lat / data-lng attribute strings of the map div."""
        div_tag = self.soup.find('div', id='yandex_map_item_view')

        self.latitude = div_tag['data-lat']
        self.longitude = div_tag['data-lng']

    def __scrape_misc(self):
        """Store area, room, floor and storeys (all as strings)."""
        # Find the parent div with the specific class
        parent_div = self.soup.find('div', class_='col-12 d-flex justify-content-between justify-content-sm-start item-view-price-params')

        area = parent_div.find('div', class_='pl-0').find('span').text

        # Room and floor/storeys are read from the second and third nested <div>
        param_divs = parent_div.find_all('div')
        room = param_divs[1].find('span').text
        floor_storeys = param_divs[2].find('span').text
        floor, storeys = floor_storeys.split('/')

        self.area = area
        self.room = room
        self.floor = floor
        self.storeys = storeys

    def __scrape_images(self):
        """Download every lazily-loaded image of the listing into images/<id>/.

        When called from inside a running event loop (e.g. Jupyter) the
        download is only scheduled; the task is kept in
        self.image_download_task so callers can await it. Otherwise the
        download runs to completion and the attribute is set to None.
        """
        # Find img elements carrying either of the lazy-loading classes
        img_elements = self.soup.find_all('img', class_=['owl-lazy', 'lazy-loaded'])

        # Collect the distinct data-src values of those elements
        img_urls = set([img['data-src'] for img in img_elements if 'data-src' in img.attrs])

        async def download_image(session, url, image_index):
            async with session.get(url) as response:
                # Check if the request was successful
                if response.status != 200:
                    return

                # Read the body first so a failed transfer leaves no empty file behind
                image_bytes = await response.read()

            apartment_images_path = f'images/{self.id}'
            os.makedirs(apartment_images_path, exist_ok=True)

            # Save the image to the local file system
            extension = url.split(".")[-1]
            file_name = os.path.join(apartment_images_path, f"{image_index}.{extension}")
            with open(file_name, 'wb') as f:
                f.write(image_bytes)
            print(f"Downloaded {url}")

        async def main():
            async with aiohttp.ClientSession() as session:
                # Sort so that the index-based file names are the same on every run
                tasks = [download_image(session, url, ind) for ind, url in enumerate(sorted(img_urls))]
                await asyncio.gather(*tasks)
            print(f"All images for ID {self.id} have been downloaded.")

        # asyncio.get_running_loop() raises RuntimeError when no loop is running
        # (it never returns a falsy value), so the check must be a try/except
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: execute the coroutine to completion
            asyncio.run(main())
            self.image_download_task = None
        else:
            # Inside a running loop: schedule the coroutine and keep a reference
            # so the task cannot be garbage-collected before it finishes
            self.image_download_task = asyncio.create_task(main())
            

In [23]:
# Build a scraper for one listing; the constructor fetches and parses the page immediately
scraper = ApartmentScraper(webpage = "https://myrealty.am/en/bnakaran/3-senyakanoc/Bagratuniats+Ave/Shengavit/Yerevan/141101")

In [24]:
# Extract all listing fields and download the listing's images
scraper.scrape()

Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(3).jpg
Downloaded https://myrealty.am/images/61/62/616285efa9a170ae4c872166d2fc9aa6(1).jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193.jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(1).jpg
Downloaded https://myrealty.am/images/e3/09/e309b4d5d09b3ca2eb49027617def2da(2).jpg
Downloaded https://myrealty.am/images/61/62/616285efa9a170ae4c872166d2fc9aa6.jpg
Downloaded https://myrealty.am/images/ae/96/ae967fea70fdeaab13f132fee82ea193(2).jpg
Downloaded https://myrealty.am/images/22/3c/223cf4c5e7232cc2afd2216a5987f51e.jpg
All images for ID 141101 have been downloaded.


In [21]:
# Inspect some of the attributes that scrape() populated
print(scraper.price)
print(scraper.area, scraper.room, scraper.floor, scraper.storeys)

150000
83 SQ. M. 3 4 5


In [36]:
class ScraperPipeline:
    """Walk a paginated listing index and scrape every apartment linked from it."""

    def __init__(self, base_url, storage, timeout=30):
        """Load the first index page.

        Args:
            base_url: URL of the listing index (first page).
            storage: destination for scraped data; kept as self.storage but
                not used by any method of this class yet.
            timeout: seconds to wait for each index-page request.
        """
        self.base_url = base_url
        self.storage = storage
        self.timeout = timeout
        self.page = 1

        self.__set_soup(base_url)

    def __set_soup(self, url):
        """Fetch url and store the parsed document in self.soup."""
        # Send a GET request; without a timeout a stalled connection would block forever
        response = requests.get(url, timeout=self.timeout)

        # Report a missing or empty page (the document is still parsed and stored)
        if response.status_code != 200 or not response.text.strip():
            print("Failed", response.status_code)

        # Parse the HTML content of the page with BeautifulSoup
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def navigate_to_next_page(self):
        """Advance the page counter and load that index page."""
        self.page += 1
        self.__set_soup(f"{self.base_url}?page={self.page}")

    def scrape_apartments_from_current_page(self):
        """Run an ApartmentScraper on every apartment linked from the current page."""
        apartments = self.get_apartment_links_for_current_page()
        for apartment_link in apartments:
            apartment_scraper = ApartmentScraper(apartment_link)
            apartment_scraper.scrape()

    def get_apartment_links_for_current_page(self):
        """Return the detail-page URLs found on the current index page.

        Anchors without an href are skipped, and relative hrefs are resolved
        against base_url, so every returned item is a URL that can be
        requested directly. Absolute hrefs are returned unchanged.
        """
        # Find all 'a' elements with the specific class
        a_elements = self.soup.find_all('a', class_='btn btn-pink-transparent btn-cs text-uppercase item-more-btn ml-auto')

        raw_hrefs = (a_element.get('href') for a_element in a_elements)
        return [urljoin(self.base_url, href) for href in raw_hrefs if href]

In [37]:
# Load the first index page; no storage backend is supplied (None)
scraper_pipeline = ScraperPipeline("https://myrealty.am/en/apartments-for-sale/7784", None)

In [39]:
# Scrape every apartment linked from the currently loaded index page
scraper_pipeline.scrape_apartments_from_current_page()