In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def number_of_pages(url_page, timeout=30):
    """
    Return the number of result pages reported by a listing page.

    The page is downloaded and the text of the last pagination link
    (``<a class="css-1mi714g">``) is converted to an integer.

    Parameters
    ----------
    url_page : str
        URL of the listing page to inspect.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    int
        Number shown on the last pagination link, or 1 when the page has no
        pagination links at all (assumed to mean the results fit on one page).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the last pagination link does not contain an integer.
    """
    response = requests.get(url_page, timeout=timeout)
    # Fail loudly on an error response instead of mistaking it for a
    # single-page listing further down.
    response.raise_for_status()
    doc = BeautifulSoup(response.text, 'html.parser')
    pagination_links = doc.find_all('a', class_='css-1mi714g')
    if not pagination_links:
        # Indexing [-1] on an empty result used to raise IndexError here.
        return 1
    return int(pagination_links[-1].text)

In [3]:
# City names in the form data_scraper() inserts into the listing URL.
# NOTE(review): this list is not referenced by the code visible in this notebook.
cities = [
    'warszawa',
    'wroclaw',
    'poznan',
    'krakow',
    'gdansk',
]

# Offer categories iterated by data_scraper() (rent / sale).
categories = [
    'wynajem',
    'sprzedaz',
]

# Values for the `filter_enum_rooms` query parameter, iterated by data_scraper().
rooms_number = [
    'one',
    'two',
    'three',
    'four',
]

In [4]:
def property_info_scrapper(url, pages, city_name, poperty_type, category_type, timeout=30):
    """
    Collect offer details from every result page of one listing URL.

    For each page number from 1 to ``pages`` the listing is downloaded and
    every offer card (``<div class="css-1sw7q4x">``) is turned into one row.
    Cards that lack any of the expected elements are skipped.

    Parameters
    ----------
    url : str
        Listing URL containing a ``page=<number>`` query parameter; the number
        is replaced with each page index in turn.
    pages : int
        Number of pages to scrape, starting from page 1.
    city_name, poperty_type, category_type
        Values copied verbatim into every returned row.
    timeout : float, optional
        Seconds to wait for each page before giving up (default 30).

    Returns
    -------
    list of list
        One row per offer:
        ``[title, price, poperty_type, city_name, category_type, district,
        sq_meters, web_link]``.

    Raises
    ------
    ValueError
        If ``url`` does not contain ``page=``.
    """
    base_url = 'https://www.olx.pl'

    # Split the URL once around the page parameter. Stripping all leading
    # digits (not just one character) keeps multi-digit page numbers in the
    # input URL from leaking into the rebuilt one.
    prefix, marker, remainder = url.partition('page=')
    if not marker:
        raise ValueError(f"url must contain a 'page=' query parameter: {url!r}")
    suffix = remainder.lstrip('0123456789')

    page_offers = []
    for page in range(1, pages + 1):
        url_connected = f'{prefix}page={page}{suffix}'
        text_page = requests.get(url_connected, timeout=timeout).text
        doc = BeautifulSoup(text_page, 'html.parser')

        for offer in doc.find_all('div', class_='css-1sw7q4x'):
            try:
                title = offer.find('h6', class_='css-16v5mdi er34gjf0').text
                price = offer.find('p', class_='css-10b0gli er34gjf0').text
                district = offer.find('p', class_='css-veheph er34gjf0').text
                sq_meters = offer.find('span', class_='css-643j0o').text
                link = offer.find('a', class_='css-rc5s2u')['href']
            except (AttributeError, TypeError, KeyError):
                # AttributeError: a text element is missing (None.text).
                # TypeError: the link element is missing (None['href']).
                # KeyError: the link element has no href attribute.
                continue

            # urljoin keeps an already-absolute href intact and only prefixes
            # the site root to relative ones.
            web_link = urljoin(base_url, link)

            page_offers.append(
                [title, price, poperty_type, city_name, category_type, district, sq_meters, web_link]
            )

    return page_offers
    
    
def data_scraper(city):
    """
    Scrape flat offers for one city across every category and room count.

    For each entry of the module-level ``categories`` and ``rooms_number``
    lists a listing URL is built, its page count is read with
    ``number_of_pages`` and all pages are scraped with
    ``property_info_scrapper``. A confirmation line is printed when the city
    is finished.

    Parameters
    ----------
    city : str
        City name as used in the listing URL path (e.g. ``'wroclaw'``).

    Returns
    -------
    list of list
        All rows returned by ``property_info_scrapper`` for the city,
        in category-then-rooms order.
    """
    all_data = []
    for category in categories:
        for rooms in rooms_number:
            # Build the URL directly for each combination. Substituting a
            # placeholder and then replacing the rooms value back would also
            # rewrite any other part of the URL that contains the same text.
            start_url = (
                f'https://www.olx.pl/nieruchomosci/mieszkania/{category}/{city}/'
                f'?page=1&search%5Bfilter_enum_rooms%5D%5B0%5D={rooms}'
            )
            page_numbers = number_of_pages(start_url)
            propety_informations = property_info_scrapper(
                start_url, page_numbers, city, rooms, category
            )
            all_data.extend(propety_informations)

    print(f'{city}: data collected')
    return all_data
            

In [6]:
# Scrape every category / room-count combination for Wroclaw.
data_wroclaw = data_scraper(city='wroclaw')

wroclaw: data collected


In [7]:
# Scrape every category / room-count combination for Warsaw.
data_wawa = data_scraper(city='warszawa')

warszawa: data collected


In [8]:
# Scrape every category / room-count combination for Krakow.
data_krak = data_scraper(city='krakow')

krakow: data collected


In [9]:
# Scrape every category / room-count combination for Poznan.
data_poz = data_scraper(city='poznan')

poznan: data collected


In [10]:
# Scrape every category / room-count combination for Gdansk.
data_gda = data_scraper(city='gdansk')

gdansk: data collected


In [11]:
# Column order matches the row layout built by property_info_scrapper:
# title, price, rooms, city, category, district, square metres, link.
# NOTE(review): 'Amout' looks like a typo for 'Amount'; it is left unchanged
# because the header is written to the exported files and later readers of
# those files may rely on it.
columns = ['Title','Amout','Number_of_rooms','City','Category','District','Sqr_meters','WebLink']

# (output file, scraped rows) for every city, in export order.
exports = (
    ('wro.xlsx', data_wroclaw),
    ('wwa.xlsx', data_wawa),
    ('krk.xlsx', data_krak),
    ('poz.xlsx', data_poz),
    ('gda.xlsx', data_gda),
)

# Build all frames in one pass, then bind the per-city names so they remain
# available to any later cell.
frames = [pd.DataFrame(rows, columns=columns) for _, rows in exports]
df_wro, df_wwa, df_krk, df_poz, df_gda = frames

# Write one Excel file per city, without the index column.
for (excel_name, _), city_frame in zip(exports, frames):
    city_frame.to_excel(excel_name, index=False)
