In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import urllib.request
import asyncio
import aiohttp
import csv

In [2]:
from abc import ABC, abstractmethod

class ApartmentScraper(ABC):
    @abstractmethod
    def scrape(self):
        """
        Abstract method for scraping the current page of an individual apartment.
        Subclasses must implement this method.
        """
        pass

    @abstractmethod
    def values(self):
        """
        Abstract method for returning a dictionary with keys and values scraped from the apartment page.
        Subclasses must implement this method.
        """
        pass


In [3]:
class ApartmentScrapingPipeline(ABC):
    
    def __init__(self, apartment_scraper):
        """
        Constructor that takes an type of ApartmentScraper as an argument.

        Args:
            apartment_scraper (ApartmentScraper): An type of ApartmentScraper to be used for scraping.
        """
        self.apartment_scraper = apartment_scraper

    @abstractmethod
    def get_apartment_links(self, page_url):
        """
        Abstract method for getting links to individual apartments on a page.
        Subclasses must implement this method.

        Args:
            page_url (str): The URL of the page containing apartment links.

        Returns:
            list: A list of apartment URLs.
        """
        pass

    @abstractmethod
    def scrape_apartment(self, apartment_url):
        """
        Abstract method for scraping data from an individual apartment page.
        Subclasses must implement this method.

        Args:
            apartment_url (str): The URL of the apartment page to scrape.

        Returns:
            dict: A dictionary containing scraped data.
        """
        pass

    @abstractmethod
    def navigate_to_next_page(self):
        """
        Abstract method for navigating to the next page of apartment listings.
        Subclasses must implement this method.
        """
        pass


In [22]:
class Storage(ABC):
    @abstractmethod
    def initialize(self):
        """
        Abstract method for initializing the storage mechanism.
        Subclasses must implement this method.
        """
        pass

    @abstractmethod
    def append(self, data_dict):
        """
        Abstract method for storing data in the storage mechanism.
        Subclasses must implement this method.

        Args:
            data: The data to be stored.
        """
        pass

    @abstractmethod
    def path(self):
        """
        Abstract method for getting the path or location of the stored data.
        Subclasses must implement this method.

        Returns:
            str: The path or location of the stored data.
        """
        pass


In [33]:
class MyRealtyApartmentScraper(ApartmentScraper):
    
    def __init__(self, webpage):
        
        # Send a GET request to the website
        response = requests.get(webpage)

        # Check if the page is empty or not found, and break the loop if so
        if response.status_code != 200 or not response.text.strip():
            print("Failed", response.status_code)

        # Parse the HTML content of the page with BeautifulSoup
        self.soup = BeautifulSoup(response.text, 'html.parser')
        
    # must through some errors
    def scrape(self):
        success = self.__scrape_id()
        
        if success:
            self.__scrape_price()
            self.__scrape_facilities()
            self.__scrape_location()
            self.__scrape_misc()
            self.__scrape_images()
        else:
            raise Exception
            
    def values(self):
        return {
            "id": self.id,
            "price" : self.price,
            "facilities" : self.facilities,
            "location" : self.address,
            "area" : self.area,
            "room" : self.room,
            "floor" : self.floor,
            "storeys" : self.storeys
        }
    
    def __scrape_id(self) -> bool:
        # Find the div with the specific class and extract the ID
        id_div = self.soup.find('div', class_='item-view-id')
        id_text = id_div.get_text(strip=True)  # Get the text content of the div
        id_number = id_text.split()[-1]  # Split the text and get the last part, which is the ID number
        
        if id_number is None:
            return False
        else:
            self.id = id_number
            return True
        
    def __scrape_facilities(self):
        facilities = [li.find('label').get_text() for li in self.soup.find_all('li', class_='col-sm-6 col-lg-4 col-xl-3 mb-1')]
        self.facilities = facilities

    def __scrape_price(self):
        price_element = self.soup.find('div', class_='pl-0')
        if price_element:
            price_element_text = price_element.get_text(strip=True)
            price_text_stripped = price_element_text.split("/")[0]
            price_text_stripped = price_text_stripped.replace(",", "")
            price = int(price_text_stripped)
            self.price = price
            
    def __scrape_location(self):
        # Find the div with the specific id
        div_tag = self.soup.find('div', id='yandex_map_item_view')

        # Extract the latitude and longitude from the data-lat and data-lng attributes
        latitude = div_tag['data-lat']
        longitude = div_tag['data-lng']
        self.latitude = latitude
        self.longitude = longitude
        
        address_div = self.soup.find('div', class_='col-auto item-view-address d-none d-xl-block mr-0')

        # Extract the text within the div element
        address = address_div.get_text(strip=True)
        self.address = address

        
    def __scrape_misc(self):
        # Find the parent div with the specific class
        parent_div = self.soup.find('div', class_='col-12 d-flex justify-content-between justify-content-sm-start item-view-price-params')

        # Extract the area, room, floor, and storeys
        area = parent_div.find('div', class_='pl-0').find('span').text
        room = parent_div.find_all('div')[1].find('span').text
        floor_storeys = parent_div.find_all('div')[2].find('span').text
        floor, storeys = floor_storeys.split('/')

        self.area = area
        self.room = room
        self.floor = floor
        self.storeys = storeys
        
    def __scrape_images(self):
        # Extract image URLs
        # Find img elements with the specific classes
        img_elements = self.soup.find_all('img', class_=['owl-lazy', 'lazy-loaded'])

        # Extract the src attribute of the img elements
        img_urls = set([img['data-src'] for img in img_elements if 'data-src' in img.attrs])

        async def download_image(session, url, image_index):
            async with session.get(url) as response:
                # Check if the request was successful
                if response.status != 200:
                    return

                apartment_images_path = f'images/{self.id}'
                # Create a directory to save the images if it doesn't exist
                if not os.path.exists(apartment_images_path):
                    os.makedirs(apartment_images_path)

                # Save the image to the local file system
                extension = url.split(".")[-1]
                file_name = os.path.join(apartment_images_path, f"{image_index}.{extension}")
                with open(file_name, 'wb') as f:
                    f.write(await response.read())
#                 print(f"Downloaded {url}")

        async def main():
            async with aiohttp.ClientSession() as session:
                tasks = [download_image(session, url, ind) for ind, url in enumerate(img_urls)]
                await asyncio.gather(*tasks)
#             print(f"All images for ID {self.id} have been downloaded.")

        # Check if there is a running event loop
        if asyncio.get_running_loop():
            # If there is a running event loop, use create_task to schedule the coroutine
            asyncio.create_task(main())
        else:
            # If there is no running event loop, use asyncio.run() to execute the coroutine
            asyncio.run(main())
            

In [34]:
scraper = MyRealtyApartmentScraper(webpage = "https://myrealty.am/en/bnakaran/3-senyakanoc/Bagratuniats+Ave/Shengavit/Yerevan/141101")

In [35]:
scraper.scrape()

In [36]:
print(scraper.values())

{'id': '141101', 'price': 150000, 'facilities': ['Heating', 'Internet', 'Hot water', 'Gas', 'Water', 'water 24/7', 'Furniture', 'Equipment', 'Euro windows', 'Iron door', 'Open balcony', 'Basement', 'Sunny', 'View', 'Close to the bus station', 'Garage'], 'location': 'Yerevan, Shengavit, Bagratuniats Ave', 'area': '83 SQ. M.', 'room': '3', 'floor': '4', 'storeys': '5'}


In [37]:
class MyRealtyScrapingPipeline(ApartmentScrapingPipeline):

    def __init__(self, base_url, storage):
        self.base_url = base_url
        self.page = 1
        self.storage = storage  # Assuming you have some storage mechanism

        self.__set_soup(base_url)
        super().__init__(MyRealtyApartmentScraper)

    def __set_soup(self, url):
        # Send a GET request to the website
        response = requests.get(url)

        # Check if the page is empty or not found, and break the loop if so
        if response.status_code != 200 or not response.text.strip():
            print("Failed", response.status_code)

        # Parse the HTML content of the page with BeautifulSoup
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def navigate_to_next_page(self):
        self.page += 1
        self.__set_soup(f"{self.base_url}?page={self.page}")

    def scrape_apartment(self, apartment_url):
        # Create an instance of ApartmentScraper with the provided URL
        apartment_scraper = self.apartment_scraper(apartment_url)

        # Call the scrape method of the ApartmentScraper
        apartment_scraper.scrape()
        apartment_data = apartment_scraper.values()

        # Store or process the scraped data as needed
        self.storage.append(apartment_data)  # Replace with your storage mechanism

    def get_apartment_links(self, page_url=None):
        if page_url is None:
            page_url = self.base_url

        # Find all 'a' elements with the specific class
        a_elements = self.soup.find_all('a', class_='btn btn-pink-transparent btn-cs text-uppercase item-more-btn ml-auto')

        # Iterate over the found 'a' elements and navigate to their links
        links = []
        for a_element in a_elements:
            link = a_element.get('href')
            links.append(link)

        return links

In [54]:
class CSVStorage(Storage):
    
    fieldnames = ["id", "price", "facilities", "location", "area", "room", "floor", "storeys"]
    
    def __init__(self, file_path):
        self.file_path = file_path
        self.initialize()

    def initialize(self):
        # Initialize the CSV file and write the header if the file doesn't exist
        with open(self.file_path, mode='a+', newline='') as file:
            file.seek(0)
            if not file.read(1):
                # File is empty, write the header
                writer = csv.DictWriter(file, fieldnames = self.fieldnames)
                writer.writeheader()

    def append(self, data_dict):
        # Append data in the CSV file
        with open(self.file_path, mode='a+', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self.fieldnames)
            writer.writerow(data_dict)

    def path(self):
        return self.file_path


In [55]:
storage = CSVStorage("apartments.csv")
storage.initialize()

In [56]:
scraper_pipeline = MyRealtyScrapingPipeline("https://myrealty.am/en/apartments-for-sale/7784", storage)

In [57]:
links = scraper_pipeline.get_apartment_links()
for link in links:
    scraper_pipeline.scrape_apartment(link)

{'id': '141351', 'price': 920000, 'facilities': ['Hot water', 'Gas', 'Heating'], 'location': 'Yerevan, Center, Amiryan St', 'area': '146 SQ. M.', 'room': '4', 'floor': '5', 'storeys': '19'}
{'id': '141350', 'price': 135000, 'facilities': ['Heating', 'Gas', 'Hot water', 'Furniture', 'Equipment'], 'location': 'Yerevan, Center, Argishti St', 'area': '52 SQ. M.', 'room': '2', 'floor': '5', 'storeys': '10'}
{'id': '141349', 'price': 570000, 'facilities': ['Heating'], 'location': 'Yerevan, Center, Amiryan St', 'area': '172 SQ. M.', 'room': '6', 'floor': '5', 'storeys': '5'}
{'id': '141348', 'price': 410000, 'facilities': ['Heating', 'Gas', 'Hot water', 'Electricity', 'water 24/7', 'Water', 'Furniture', 'Equipment'], 'location': 'Yerevan, Center, Tumanyan St', 'area': '98 SQ. M.', 'room': '3', 'floor': '4', 'storeys': '4'}
{'id': '141346', 'price': 155000, 'facilities': ['Heating', 'Internet', 'Hot water', 'Electricity', 'Air-conditioner', 'Gas', 'Water', 'water 24/7', 'Furniture', 'Equipment

{'id': '141285', 'price': 242000, 'facilities': ['water 24/7', 'Water', 'Open balcony', 'Euro windows', 'Sunny', 'View', 'Bilateral', 'Iron door'], 'location': 'Yerevan, Arabkir, Riga St', 'area': '112 SQ. M.', 'room': '3', 'floor': '5', 'storeys': '13'}
{'id': '141284', 'price': 620000, 'facilities': ['Electricity', 'water 24/7', 'Water', 'Sewerage, Canalization', 'Open balcony', 'Elevator', 'Euro windows', 'Sunny', 'View', 'Close to the bus station'], 'location': 'Yerevan, Center, Tsitsernakaberd Highway', 'area': '230 SQ. M.', 'room': '5', 'floor': '23', 'storeys': '23'}
{'id': '141271', 'price': 78000, 'facilities': ['Heating', 'Internet', 'Hot water', 'Electricity', 'Gas', 'Water', 'water 24/7', 'Furniture', 'Equipment', 'Euro windows', 'Iron door', 'Balcony', 'Tile', 'Laminate flooring', 'Sunny', 'View', 'Close to the bus station'], 'location': 'Yerevan, Nor Norq, Moldovakan St', 'area': '50 SQ. M.', 'room': '2', 'floor': '1', 'storeys': '5'}
{'id': '141268', 'price': 270000, 'fa

Task exception was never retrieved
future: <Task finished name='Task-121' coro=<MyRealtyApartmentScraper.__scrape_images.<locals>.main() done, defined at /var/folders/67/47xpns1s0zn6h8g8npsy5_q80000gn/T/ipykernel_81214/2720596733.py:123> exception=ServerDisconnectedError('Server disconnected')>
Traceback (most recent call last):
  File "/var/folders/67/47xpns1s0zn6h8g8npsy5_q80000gn/T/ipykernel_81214/2720596733.py", line 126, in main
    await asyncio.gather(*tasks)
  File "/var/folders/67/47xpns1s0zn6h8g8npsy5_q80000gn/T/ipykernel_81214/2720596733.py", line 106, in download_image
    async with session.get(url) as response:
  File "/Users/petrostepoyan/opt/anaconda3/lib/python3.9/site-packages/aiohttp/client.py", line 1141, in __aenter__
    self._resp = await self._coro
  File "/Users/petrostepoyan/opt/anaconda3/lib/python3.9/site-packages/aiohttp/client.py", line 560, in _request
    await resp.start(conn)
  File "/Users/petrostepoyan/opt/anaconda3/lib/python3.9/site-packages/aiohtt