In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
base_url = 'https://imoti.info/en/for-sale/grad-sofiya/apartments/page-{}?pubtype=1&pubtype=2&pubtype=3&pubtype=4&pubtype=6&pubtype=8'

locations = []
prices = []
parameters = []

for i in range(1, 500):
    url = base_url.format(i)
    
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Failed to retrieve page {i}")
        continue
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    location_elements = soup.find_all('a', class_='locationDesktop')
    price_elements = soup.find_all('div', class_='price')
    parameters_elements = soup.find_all('div', class_='parameters')

    max_length = max(len(location_elements), len(price_elements), len(parameters_elements))
    
    for index in range(max_length):
        if index < len(location_elements):
            locations.append(location_elements[index].text.strip())
        else:
            locations.append('nan')

        if index < len(price_elements):
            prices.append(price_elements[index].text.strip())
        else:
            prices.append('nan')

        if index < len(parameters_elements):
            parameters.append(parameters_elements[index].text.strip())
        else:
            parameters.append('nan')

df = pd.DataFrame({'Location': locations, 'Price': prices, 'Parameters': parameters})

df


Failed to retrieve page 298
Failed to retrieve page 348
Failed to retrieve page 423


Unnamed: 0,Location,Price,Parameters
0,"AtticSimeonovo, Sofia",106 100 €,"78 sq.m, 1st floor"
1,"AtticKrastova vada, Sofia",89 435 €,"57 sq.m, 2025 г., Ground Floor floor"
2,"AtticTsentar, Sofia",275 000 €,"120 sq.m, 1980 г., 9th floor"
3,"AtticSimeonovo, Sofia",62 800 €,"46 sq.m, 1st floor"
4,"AtticOborishte, Sofia",166 000 €,"70 sq.m, 2000 г., 6th floor"
...,...,...,...
9896,"2 bedroomOvcha kupel, Sofia",156 000 €,"104 sq.m, 2024 г., 4th floor"
9897,"2 bedroomReduta, Sofia",231 000 €,"110 sq.m, 2025 г., 3rd floor"
9898,"2 bedroomYavorov, Sofia",285 000 €,"145 sq.m, 17th floor"
9899,"2 bedroomMladost 1, Sofia",165 000 €,"106 sq.m, 1984 г., 17th floor"


In [3]:
data = df.copy()
data

Unnamed: 0,Location,Price,Parameters
0,"AtticSimeonovo, Sofia",106 100 €,"78 sq.m, 1st floor"
1,"AtticKrastova vada, Sofia",89 435 €,"57 sq.m, 2025 г., Ground Floor floor"
2,"AtticTsentar, Sofia",275 000 €,"120 sq.m, 1980 г., 9th floor"
3,"AtticSimeonovo, Sofia",62 800 €,"46 sq.m, 1st floor"
4,"AtticOborishte, Sofia",166 000 €,"70 sq.m, 2000 г., 6th floor"
...,...,...,...
9896,"2 bedroomOvcha kupel, Sofia",156 000 €,"104 sq.m, 2024 г., 4th floor"
9897,"2 bedroomReduta, Sofia",231 000 €,"110 sq.m, 2025 г., 3rd floor"
9898,"2 bedroomYavorov, Sofia",285 000 €,"145 sq.m, 17th floor"
9899,"2 bedroomMladost 1, Sofia",165 000 €,"106 sq.m, 1984 г., 17th floor"


In [4]:
def price_manipulation():
    data.Price = data.Price.str.replace('€' ,'')
    data.Price = data.Price.str.replace('nan', '')
    data['Price'] = data['Price'].str.replace(r'[^\d]', '', regex=True)
    data['Price'] = data['Price'].replace('', 'nan').astype(float).astype('Int64')
    return data

In [5]:
data = price_manipulation()
data

Unnamed: 0,Location,Price,Parameters
0,"AtticSimeonovo, Sofia",106100,"78 sq.m, 1st floor"
1,"AtticKrastova vada, Sofia",89435,"57 sq.m, 2025 г., Ground Floor floor"
2,"AtticTsentar, Sofia",275000,"120 sq.m, 1980 г., 9th floor"
3,"AtticSimeonovo, Sofia",62800,"46 sq.m, 1st floor"
4,"AtticOborishte, Sofia",166000,"70 sq.m, 2000 г., 6th floor"
...,...,...,...
9896,"2 bedroomOvcha kupel, Sofia",156000,"104 sq.m, 2024 г., 4th floor"
9897,"2 bedroomReduta, Sofia",231000,"110 sq.m, 2025 г., 3rd floor"
9898,"2 bedroomYavorov, Sofia",285000,"145 sq.m, 17th floor"
9899,"2 bedroomMladost 1, Sofia",165000,"106 sq.m, 1984 г., 17th floor"


In [6]:
def parameters_clean():

    data['square_metres'] = data.Parameters.str.split(',').str.get(0).str.split(' ').str.get(0)
    data['floor'] = data.Parameters.str.split(',').str.get(2).str.split(' ').str.get(1)
    data['floor'] = data.floor.str.split('').str.get(1)
    data['floor'] =data.floor.replace('G', 0)
    data['bedrooms'] = data.Location.str.split(' ').str.get(0)
    data.bedrooms.replace('Attic', '1') 
    data.bedrooms.str.split()
    data['floor'] = data.floor.fillna(data.floor.median())
    data['Price']=data.Price.fillna(data.Price.median())
    data['floor'] = data.floor.astype(int)
    data['square_metres'] = data.square_metres.astype(int)
    return data
data = parameters_clean()
data

Unnamed: 0,Location,Price,Parameters,square_metres,floor,bedrooms
0,"AtticSimeonovo, Sofia",106100,"78 sq.m, 1st floor",78,3,"AtticSimeonovo,"
1,"AtticKrastova vada, Sofia",89435,"57 sq.m, 2025 г., Ground Floor floor",57,0,AtticKrastova
2,"AtticTsentar, Sofia",275000,"120 sq.m, 1980 г., 9th floor",120,9,"AtticTsentar,"
3,"AtticSimeonovo, Sofia",62800,"46 sq.m, 1st floor",46,3,"AtticSimeonovo,"
4,"AtticOborishte, Sofia",166000,"70 sq.m, 2000 г., 6th floor",70,6,"AtticOborishte,"
...,...,...,...,...,...,...
9896,"2 bedroomOvcha kupel, Sofia",156000,"104 sq.m, 2024 г., 4th floor",104,4,2
9897,"2 bedroomReduta, Sofia",231000,"110 sq.m, 2025 г., 3rd floor",110,3,2
9898,"2 bedroomYavorov, Sofia",285000,"145 sq.m, 17th floor",145,3,2
9899,"2 bedroomMladost 1, Sofia",165000,"106 sq.m, 1984 г., 17th floor",106,1,2


In [7]:
data.dtypes

Location         object
Price             Int64
Parameters       object
square_metres     int32
floor             int32
bedrooms         object
dtype: object

In [8]:
data.drop(columns=('Parameters'),inplace=True)

In [14]:
def region_clean():
    data['bedrooms'] = data.bedrooms.str.replace('Levski','')
    data['bedrooms'] = data.bedrooms.str.replace('Suhata','')
    data['bedrooms'] = data.bedrooms.str.replace('Sveta','')
    data['bedrooms'] = data.bedrooms.str.replace('Obelya','')
    data['bedrooms'] = data.bedrooms.str.replace('Zaharna','')
    data['bedrooms'] = data.bedrooms.str.replace('Voenna','')
    data['bedrooms'] = data.bedrooms.str.replace('Poligona','')
    data['bedrooms'] = data.bedrooms.str.replace('Yavorov','')
    data['bedrooms'] = data.bedrooms.str.replace('Meditinska','')
    data['bedrooms'] = data.bedrooms.str.replace('Reduta','')
    data['bedrooms'] = data.bedrooms.str.replace('Gorna','')
    data['bedrooms'] = data.bedrooms.str.replace('Svoboda','')
    data['bedrooms'] = data.bedrooms.str.replace('.z.','')
    data['bedrooms'] = data.bedrooms.str.replace('Oborishte','')
    data['bedrooms'] = data.bedrooms.str.replace('Yavorov','')
    data['bedrooms'] = data.bedrooms.str.replace('Fondovi','')
    data['bedrooms'] = data.bedrooms.str.replace('Hai','')
    data['bedrooms'] = data.bedrooms.str.replace('Nadda','')
    data['bedrooms'] = data.bedrooms.str.replace('Meditsinska','')
    data['bedrooms'] = data.bedrooms.str.replace(',','')
    data['bedrooms'] = data.bedrooms.str.replace('m','')
    data['bedrooms'] = data.bedrooms.str.replace('Doktorski','')
    data['bedrooms'] = data.bedrooms.str.replace('v','')
    data['bedrooms'] = data.bedrooms.str.replace('Simeonovo','')
    data['bedrooms'] = data.bedrooms.str.replace('Pancharevo','')
    data['bedrooms'] = data.bedrooms.str.replace('Borovo','')
    data['bedrooms'] = data.bedrooms.str.replace('Orlandovtsi','')
    data['bedrooms'] = data.bedrooms.str.replace('Moderno','')
    data['bedrooms'] = data.bedrooms.str.replace('Dragalevtsi','')
    data['bedrooms'] = data.bedrooms.str.replace('Krasna','')
    data['bedrooms'] = data.bedrooms.str.replace('Triagalnika','')
    data['bedrooms'] = data.bedrooms.str.replace('Vrabnitsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Bankya','')
    data['bedrooms'] = data.bedrooms.str.replace('Ilinden','')
    data['bedrooms'] = data.bedrooms.str.replace('Gotse','')
    data['bedrooms'] = data.bedrooms.str.replace('Gadzhi','')
    data['bedrooms'] = data.bedrooms.str.replace('Studentski','')
    data['bedrooms'] = data.bedrooms.str.replace('Krasna','')
    data['bedrooms'] = data.bedrooms.str.replace('Serdika','')
    data['bedrooms'] = data.bedrooms.str.replace('Belite','')
    data['bedrooms'] = data.bedrooms.str.replace('Banishora','')
    data['bedrooms'] = data.bedrooms.str.replace('Dragalevtsi','')
    data['bedrooms'] = data.bedrooms.str.replace('Tsentar','')
    data['bedrooms'] = data.bedrooms.str.replace('Lozenets','')
    data['bedrooms'] = data.bedrooms.str.replace('Ovcha','')
    data['bedrooms'] = data.bedrooms.str.replace('Malinova','')
    data['bedrooms'] = data.bedrooms.str.replace('Krastova','')
    data['bedrooms'] = data.bedrooms.str.replace('Lyulin','')
    data['bedrooms'] = data.bedrooms.str.replace('Boyana','')
    data['bedrooms'] = data.bedrooms.str.replace('Mladost','')
    data['bedrooms'] = data.bedrooms.str.replace('Lagera','')
    data['bedrooms'] = data.bedrooms.str.replace('Darvenitsa','')
    data['bedrooms'] = data.bedrooms.str.replace('-t','')
    data['bedrooms'] = data.bedrooms.str.replace('Bakston','')
    data['bedrooms'] = data.bedrooms.str.replace('Krasno','')
    data['bedrooms'] = data.bedrooms.str.replace('Gorublyane','')
    data['bedrooms'] = data.bedrooms.str.replace('Razsadnika','')
    data['bedrooms'] = data.bedrooms.str.replace('Dianabad','')
    data['bedrooms'] = data.bedrooms.str.replace('Karpuzitsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Geo','')
    data['bedrooms'] = data.bedrooms.str.replace('Zona','')
    data['bedrooms'] = data.bedrooms.str.replace('Geo','')
    data['bedrooms'] = data.bedrooms.str.replace('Hladilnika','')
    data['bedrooms'] = data.bedrooms.str.replace('Bistritsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Pavlovo','')
    data['bedrooms'] = data.bedrooms.str.replace('Slatina','')
    data['bedrooms'] = data.bedrooms.str.replace('Strelbishte','')
    data['bedrooms'] = data.bedrooms.str.replace('Manastirski','')
    data['bedrooms'] = data.bedrooms.str.replace('Poduyane','')
    data['bedrooms'] = data.bedrooms.str.replace('Ivan','')
    data['bedrooms'] = data.bedrooms.str.replace('.z.Kinotsentara','')
    data['bedrooms'] = data.bedrooms.str.replace('Geo','')
    data['bedrooms'] = data.bedrooms.str.replace('Druzhba','')
    data['bedrooms'] = data.bedrooms.str.replace('Izgrev','')
    data['bedrooms'] = data.bedrooms.str.replace('Vitosha','')
    data['bedrooms'] = data.bedrooms.str.replace('Ocha','')
    data['bedrooms'] = data.bedrooms.str.replace('Lnets','')
    data['bedrooms'] = data.bedrooms.str.replace('Malinoa','')
    data['bedrooms'] = data.bedrooms.str.replace('Krastoa','')
    data['bedrooms'] = data.bedrooms.str.replace('Dragaletsi','')
    data['bedrooms'] = data.bedrooms.str.replace('Karptsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Panchareo','')
    data['bedrooms'] = data.bedrooms.str.replace('Kinotsentara','')
    data['bedrooms'] = data.bedrooms.str.replace('Darenitsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Paloo','')
    data['bedrooms'] = data.bedrooms.str.replace('Darenitsa','')
    data['bedrooms'] = data.bedrooms.str.replace('Boroo','')
    data['bedrooms'] = data.bedrooms.str.replace('Drba','')
    data['bedrooms'] = data.bedrooms.str.replace('Sieonoo','')
    data['bedrooms'] = data.bedrooms.str.replace('Radnika','')
    data['bedrooms'] = data.bedrooms.str.replace('Orlandotsi','')
    data['bedrooms'] = data.bedrooms.str.replace('Ian','')
    data['bedrooms'] = data.bedrooms.str.replace('re','')
    data['bedrooms'] = data.bedrooms.str.replace('Tolstoy','')
    data['bedrooms'] = data.bedrooms.str.replace('Dobroslatsi','')

    
    return data
data  = region_clean()
data



# Another way to do it :
#......

# bedroom_manipulation = ['Simeonovo', 'Krastova vada'....]

# for locations in bedroom_manipulation:
#     data['bedrooms']=data.Location.str.replace(locations,'')

  data['bedrooms'] = data.bedrooms.str.replace('.z.','')
  data['bedrooms'] = data.bedrooms.str.replace('.z.Kinotsentara','')


Unnamed: 0,Location,Price,square_metres,floor,bedrooms
0,"AtticSimeonovo, Sofia",106100,78,3,Attic
1,"AtticKrastova vada, Sofia",89435,57,0,Attic
2,"AtticTsentar, Sofia",275000,120,9,Attic
3,"AtticSimeonovo, Sofia",62800,46,3,Attic
4,"AtticOborishte, Sofia",166000,70,6,Attic
...,...,...,...,...,...
9896,"2 bedroomOvcha kupel, Sofia",156000,104,4,2
9897,"2 bedroomReduta, Sofia",231000,110,3,2
9898,"2 bedroomYavorov, Sofia",285000,145,3,2
9899,"2 bedroomMladost 1, Sofia",165000,106,1,2


In [16]:
data.bedrooms.value_counts()

2             5153
3             2219
1             1783
Maisonette     454
Studio         223
Attic           69
Name: bedrooms, dtype: int64

In [17]:
location_manipulation = ['Attic','Maisonette','2 bedroom', '1 bedroom', '3 bedroom', 'Studio']

for location in location_manipulation:
    data['Location']=data.Location.str.replace(location,'')
    data['Location'] = data.Location.str.split(',').str.get(0)
data

Unnamed: 0,Location,Price,square_metres,floor,bedrooms
0,Simeonovo,106100,78,3,Attic
1,Krastova vada,89435,57,0,Attic
2,Tsentar,275000,120,9,Attic
3,Simeonovo,62800,46,3,Attic
4,Oborishte,166000,70,6,Attic
...,...,...,...,...,...
9896,Ovcha kupel,156000,104,4,2
9897,Reduta,231000,110,3,2
9898,Yavorov,285000,145,3,2
9899,Mladost 1,165000,106,1,2


In [18]:
from sqlalchemy import create_engine

engine = create_engine('sqlite:///Imot_bg.db')

table_name = 'Imoti'

data.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

engine.dispose()