In [13]:
import requests
import pandas
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import sqlite3
import re
from datetime import date

In [14]:
# Site root; listing paths and the relative hrefs found on result pages are appended to it.
GLOBAL_PART = 'https://www.gipernn.ru/'

# Path fragments, presumably for the business-centre and retail-centre sections
# (not referenced by any other cell visible in this notebook).
BUSINES_CENTRE_PART = 'biznes-centry/'
RETAIL_CENTRE_PART = 'torgovye-centry/'

# Query string appended to listing URLs; page_counter() assumes the same page size of 50.
FIFTY_ON_PAGE_PART = '?per-page=50'

# Listing links gathered so far in this session; links_arr() skips anything already stored here.
all_links = []

In [15]:
def get_coordinates(city, district, address, timeout=60):
    """Ask the local geocoding service for the coordinates of an address.

    Sends a JSON POST to ``http://127.0.0.1:5000/get_coordinates``.

    Args:
        city, district, address: forwarded unchanged in the JSON payload.
        timeout: seconds to wait for the service before ``requests`` raises a
            timeout error; pass ``None`` to wait indefinitely (the previous
            behaviour).

    Returns:
        On HTTP 200, the ``coordinates`` field of the JSON reply (``None`` when
        the field is absent). On any other status, the ``error`` field of the
        JSON reply, or the string ``"error: HTTP <status>"`` when the body is
        not a JSON object.

    Raises:
        requests.RequestException: if the service cannot be reached or times out.
    """
    url = 'http://127.0.0.1:5000/get_coordinates'
    payload = {'city': city, 'district': district, 'address': address}
    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json().get('coordinates')

    # Error replies are not guaranteed to carry a JSON object (e.g. an HTML 500
    # page), so fall back to a plain message instead of raising from .json().
    try:
        return response.json().get('error')
    except (ValueError, AttributeError):
        return f'error: HTTP {response.status_code}'

In [16]:
def page_counter(LINK, per_page=50):
    """Return how many result pages a listing URL spans.

    Downloads ``LINK``, reads the total number of listings from the
    ``<div class="count">`` element (text of the form ``"<label>: <number>"``)
    and prints that total.

    Args:
        LINK: listing URL to inspect.
        per_page: listings shown per page; must match the ``per-page`` value
            used in ``LINK`` (default 50).

    Returns:
        int: ``ceil(total / per_page)``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the counter element is missing or holds no number.
    """
    html = requests.get(LINK)
    html.raise_for_status()
    soup = bs(html.text, 'html.parser')

    counter = soup.find('div', class_='count')
    if counter is None:
        raise ValueError(f"listing counter <div class='count'> not found at {LINK}")

    fields = counter.text.split(':')
    if len(fields) < 2:
        raise ValueError(f"unexpected counter text {counter.text!r} at {LINK}")

    # Drop any whitespace (including non-breaking spaces) inside the number.
    count = int(''.join(fields[1].split()))
    # Integer ceiling division; avoids a float round-trip.
    page_count = (count + per_page - 1) // per_page

    print(count)

    return page_count

In [17]:
def links_arr(LINK, page_count):
    """Collect the listing URLs found on every result page of ``LINK``.

    Requests ``{LINK}&page={n}`` for n in ``0..page_count`` and takes the first
    ``<a>`` of every row in the page's ``<tbody>``. Links already present in
    the module-level ``all_links`` are skipped. Prints the number of links
    collected.

    Args:
        LINK: listing URL that already contains a query string.
        page_count: number of result pages, as returned by ``page_counter``.

    Returns:
        list[str]: unique absolute URLs in the order they were first seen.
    """
    already_seen = set(all_links)   # O(1) membership instead of scanning the list
    collected = {}                  # dict keys act as an insertion-ordered set

    # NOTE(review): page 0 is requested as well as pages 1..page_count; it
    # looks like it duplicates page 1 (duplicates are dropped) -- confirm.
    for page in tqdm(range(page_count + 1), ascii=True):
        page_url = f"{LINK}&page={page}"
        html = requests.get(page_url)
        soup = bs(html.text, 'html.parser')

        tbody = soup.find('tbody')
        if tbody is None:
            # No results table on this page; skip it rather than crash.
            print(f'no listing table: {page_url}')
            continue

        for tr in tbody.find_all('tr'):
            anchor = tr.find('a')
            if not anchor:
                continue
            href = anchor.get('href')
            if not href:
                continue
            link = GLOBAL_PART[:-1] + href
            if link not in already_seen:
                collected[link] = None

    links = list(collected)
    print(len(links))
    return links

In [18]:
def _normalise_address(val):
    """Rewrite a site address string into the form sent to the geocoders."""
    # Applied in insertion order; later entries rely on earlier ones
    # (e.g. 'д.\xa0' must be removed before the bare '\xa0' is replaced).
    replacements = {
        'ул. ': '',
        'д.\xa0': '',
        '\xa0': ' ',
        'д.': '',
        ',': '',
        'б-р': 'бульвар',
        'просп.': 'проспект',
        'пер.': 'переулок',
        'пос.': 'посёлок',
        'ш.': 'шоссе',
        '  ': ' ',
        '-я': ''
    }
    for old, new in replacements.items():
        val = val.replace(old, new)

    if 'мкр.' in val:
        regex_replacements = [
            # 'мкр. <number>-й <number>'
            (r'\bмкр\.\s*(\d+)-й\s*(\d+)\b', r'\1 микрорайон \2'),
            # 'мкр. <area name> <number>-й <number>'
            (r'\bмкр\.\s*([\w\s]+)\s*(\d+)-й\s*(\d+)\b', r'\2 микрорайон \1 \3'),
        ]
        for pattern, repl in regex_replacements:
            val = re.sub(pattern, repl, val)
    return val


def _geocode_address(key, tmp):
    """Fill tmp['Широта'] / tmp['Долгота'] for the address stored in tmp[key].

    Queries Nominatim first; when it returns no match, falls back to the local
    service behind get_coordinates(). Network failures are reported and leave
    the coordinates unset, so the remaining table rows can still be parsed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # `params` lets requests URL-encode the query (spaces, '&', '#', ...).
        response = requests.get(
            'https://nominatim.openstreetmap.org/search.php',
            params={'q': f"{tmp['Город']} {tmp[key]}", 'format': 'jsonv2'},
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            return
        data_json = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Geocoding request failed: {tmp['Ссылка']} ({exc})")
        return

    if data_json:
        tmp['Широта'] = data_json[0]['lat']
        tmp['Долгота'] = data_json[0]['lon']
        return

    print(f"Cords error: {tmp['Ссылка']}")
    try:
        coords = get_coordinates(tmp['Город'], tmp['Район'], tmp[key])
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return

    # get_coordinates may return None (no 'coordinates' field) or an error text.
    if not isinstance(coords, str) or 'error' in coords:
        print(f"Error: {coords}")
        return
    parts = coords.split(',')
    if len(parts) >= 2:
        tmp['Широта'] = parts[0]
        tmp['Долгота'] = parts[1]


def kvartir_w2(key, val, tmp):
    """Store one row of an apartment listing's ``w2`` (building) table in ``tmp``.

    Args:
        key: row header text.
        val: row value text.
        tmp: dict of the listing being built; mutated in place. The 'Адрес'
            branch reads tmp['Город'], tmp['Район'] and tmp['Ссылка'].

    Behaviour per key:
        'Район', 'Технический план' -- ignored.
        'Адрес' -- normalised, stored, then geocoded into 'Широта'/'Долгота'.
        'Высота потолков' -- float (text before ' м', decimal comma accepted).
        'Средняя цена за м2' -- int (text before the first 'р').
        'Год постройки' -- int; for comma-separated values the first one.
        'Этажность', 'Подъездов', 'Квартир' -- int.
        anything else -- stored unchanged.

    Raises:
        ValueError: if a numeric field cannot be converted.
    """
    if key in ('Район', 'Технический план'):
        return

    if key == 'Адрес':
        tmp[key] = _normalise_address(val)
        _geocode_address(key, tmp)
    elif key == 'Высота потолков':
        height = val.split(' м')[0].replace(',', '.')
        try:
            tmp[key] = float(height)
        except ValueError:
            # Values carrying a prefix ending in '. ': keep the trailing number.
            tmp[key] = float(height.split('. ')[-1])
    elif key == 'Средняя цена за м2':
        tmp[key] = int(val.replace('\xa0', '').split('р')[0])
    elif key == 'Год постройки':
        try:
            tmp[key] = int(val)
        except ValueError:
            tmp[key] = int(val.split(',')[0])
    elif key in ('Этажность', 'Подъездов', 'Квартир'):
        tmp[key] = int(val)
    else:
        tmp[key] = val

In [19]:
def individual_w0(key, val, tmp):
    """Store one row of a residential listing's ``w0`` table in ``tmp``.

    ``key`` is the row header, ``val`` the row text, ``tmp`` the listing dict
    (mutated in place). Numeric rows are converted to int/float, the composite
    area row is split into up to three columns, and the building-year row is
    delegated to ``kvartir_w2``. Unrecognised rows are stored unchanged.
    """
    if key == 'Комнат':
        # A studio is recorded as zero rooms.
        tmp[key] = 0 if val == 'студия' else int(val.split(' ')[0])
        return

    if key == 'Этаж / этажность':
        floor_text = val.replace(' ', '').replace(',', '.').split('/')[0]
        tmp['Этаж'] = int(floor_text)
        return

    if key == 'Высота потолков':
        height = val.split(' м')[0].replace(',', '.')
        try:
            tmp[key] = float(height)
        except:
            tmp[key] = float(height.split('. ')[-1])
        return

    if key == 'Площадь':
        # "total / living / kitchen" -- later parts are optional.
        pieces = val.replace(' ', '').replace(',', '.').split('/')
        columns = ('Общая площадь', 'Жилая площадь', 'Площадь кухни')
        for column, piece in zip(columns, pieces):
            tmp[column] = float(piece)
        return

    if key == 'Площадь участка':
        tmp[key] = float(val.replace(' ', '').split('с')[0])
        return

    if key == 'Количество этажей':
        tmp['Этажность'] = int(val)
        return

    if key == 'Год постройки дома':
        kvartir_w2('Год постройки', val, tmp)
        return

    tmp[key] = val

In [20]:
def commertial_w0(key, val, tmp):
    """Store one row of a commercial listing's ``w0`` table in ``tmp``.

    Args:
        key: row header text.
        val: row value text.
        tmp: dict of the listing being built; mutated in place.

    Behaviour per key:
        'Комнат' -- int, 'студия' becomes 0.
        'Этаж / этажность' -- 'Этаж' (int) and, when a second part is present,
            'Этажность' (int).
        'Количество этажей' -- stored as 'Этажность' (int).
        'Высота потолков' -- float (text before ' м', decimal comma accepted).
        area fields -- float taken from the text before the unit marker.
        'Отдельный вход' -- 'есть'/'да' -> 1, 'нет' -> 0, other text unchanged
            (the same 1/0 encoding parse() applies to the properties block).
        anything else -- stored with non-breaking spaces removed.

    Raises:
        ValueError: if a numeric field cannot be converted.
    """
    # Area-like fields: key -> unit marker that terminates the number.
    area_units = {
        'Площадь': 'м',
        'Площадь участка': 'с',
        'Общая площадь': 'м',
        'Площадь земельного участка': 'га',
        'Общая площадь зданий': 'м',
    }
    yes_no = {'есть': 1, 'да': 1, 'нет': 0}

    if key == 'Комнат':
        tmp[key] = 0 if val == 'студия' else int(val.split(' ')[0])
    elif key == 'Этаж / этажность':
        parts = val.replace(' ', '').split('/')
        tmp['Этаж'] = int(parts[0])
        # The total-floors part may be missing entirely (no '/') or empty.
        if len(parts) > 1 and parts[1]:
            tmp['Этажность'] = int(parts[1])
    elif key == 'Количество этажей':
        tmp['Этажность'] = int(val)
    elif key == 'Высота потолков':
        height = val.split(' м')[0].replace(',', '.')
        try:
            tmp[key] = float(height)
        except ValueError:
            tmp[key] = float(height.split('. ')[-1])
    elif key in area_units:
        number = val.replace(' ', '').replace(',', '.').split(area_units[key])[0]
        tmp[key] = float(number)
    elif key == 'Отдельный вход':
        # str.replace() does not accept a mapping; look the value up instead.
        tmp[key] = yes_no.get(val, val)
    else:
        tmp[key] = val.replace('\xa0', '')

In [21]:
def parse(links, deal, tip):
    """Scrape every listing page in ``links`` into one DataFrame row each.

    Args:
        links: iterable of absolute listing URLs.
        deal: deal kind (e.g. 'prodazha'); accepted for call-site symmetry but
            not used by the parser itself.
        tip: listing category. It selects the table-row parsers and the value
            written to the 'Сегмент' column. Supported values: 'kvartir',
            'domov', 'ofisov', 'skladov', 'proizvodstvennyh-ploschadey',
            'pomescheniy'.

    Returns:
        pandas.DataFrame with one row per downloaded listing plus the constant
        columns 'Источник' and 'Дата парсинга'; an empty frame when no listing
        was downloaded.

    Raises:
        ValueError: if ``tip`` is not a supported category.
    """
    residential = "Жилая недвижимость"
    commercial = "Комерческая недвижимость"
    # tip -> (w2 row parser or None, w0 row parser, segment label)
    profiles = {
        'kvartir': (kvartir_w2, individual_w0, residential),
        'domov': (None, individual_w0, residential),
        'ofisov': (None, commertial_w0, commercial),
        'skladov': (None, commertial_w0, commercial),
        'proizvodstvennyh-ploschadey': (None, commertial_w0, commercial),
        'pomescheniy': (None, commertial_w0, commercial),
    }
    if tip not in profiles:
        # Fail before any download instead of with a NameError mid-run.
        raise ValueError(f"unsupported listing type {tip!r}; expected one of {sorted(profiles)}")
    w2_parse, w0_parse, segment = profiles[tip]

    records = []  # one dict per listing; turned into a DataFrame once at the end

    for link in tqdm(links, ascii=True):
        try:
            html = requests.get(link)
        except requests.RequestException as exc:
            # One unreachable page must not discard everything scraped so far.
            print(f'page PROBLEM: {link} ({exc})')
            continue
        soup = bs(html.text, 'html.parser')
        tmp = {}  # data of the listing currently being parsed

        tmp['Ссылка'] = link

        # The <h1> holds the title as its first node followed by the location;
        # removing the first node from the full text leaves the location.
        header = soup.find('h1')
        location = ''
        if header:
            location = header.text.replace(header.next, '')
            header = header.text.strip().replace(location, '').replace('\xa0', ' ')
        tmp['Заголовок'] = header

        tmp['Сегмент'] = segment

        # "<deal type> <sub-segment> на/в ..." -> the two leading parts.
        if isinstance(header, str):
            for separator in ('на', 'в'):
                words = header.split(separator)[0].split(' ', 1)
                if len(words) == 2:
                    tmp['Подсегмент'] = words[1]
                    tmp['Тип сделки'] = words[0]
                    break

        price = soup.find('div', class_='price')
        if price:
            price_text = price.text.replace('\xa0', '').strip().split('р')[0]
            try:
                price = int(price_text)
            except ValueError:
                print(f'price PROBLEM: {link}')
                price = None
        tmp['Цена'] = price

        description = soup.find('div', class_='formatted-text')
        if description:
            description = description.text.strip().replace('\xa0', ' ')
        tmp['Описание'] = description

        place = location.split(', ')
        if len(place) > 1:
            tmp['Город'] = place[1]
            tmp['Район'] = place[0].replace(' район', '')
        else:
            # Keep both keys present: the address geocoding step reads them.
            print(f'location PROBLEM: {link}')
            tmp['Город'] = ''
            tmp['Район'] = ''

        try:
            w2 = soup.find('table', id='w2')
            if w2:
                for tr in w2.find_all('tr'):
                    key = tr.find('th').text.strip()
                    val = tr.find('td').text.strip()
                    if val == 'Уточнить':
                        tmp[key] = ''
                    elif w2_parse is not None:
                        w2_parse(key, val, tmp)
        except Exception:
            print(f'w2 PROBLEM: {link}')

        try:
            w0 = soup.find('table', id='w0')
            if w0:
                for tr in w0.find_all('tr'):
                    key = tr.find('th').text.strip()
                    val = tr.find('td').text.strip()
                    if val == 'Уточнить':
                        tmp[key] = np.nan
                    elif key not in tmp:
                        # Both tables may carry the same field; keep the first.
                        w0_parse(key, val, tmp)
        except Exception:
            print(f'w0 PROBLEM: {link}')

        try:
            properties = soup.find('div', class_='properties m-t-2')
            if properties:
                for prop in properties.find_all('div', class_='property'):
                    key = prop.find('div', class_='property-name').text.strip()
                    val = prop.find('div', class_='property-value').text.strip()
                    if key in tmp:
                        continue  # already filled from one of the tables
                    if val == 'есть' or val == 'да':
                        tmp[key] = 1
                    elif val == 'нет':
                        tmp[key] = 0
                    else:
                        tmp[key] = val
        except Exception:
            print(f'props PROBLEM: {link}')

        # Nearby objects are optional; best effort, stops at the first
        # malformed group.
        try:
            buildings = soup.find('div', 'around').find_all('div', 'group')
            for building in buildings:
                name = building.find('div', 'group-body').find('span').text
                building = building.find('div', 'group-body').find('div', 'company')
                dist = building.find('span', 'company__distance').text.split('м')[0]
                if dist == "в здании":
                    tmp[name + '(расстояние в метрах)'] = 0
                else:
                    tmp[name + '(расстояние в метрах)'] = dist
        except Exception:
            pass

        records.append(tmp)

    if not records:
        return pandas.DataFrame()

    df = pandas.DataFrame(records)
    df['Источник'] = 'gipernn'
    df['Дата парсинга'] = str(date.today())

    return df

In [22]:
# Accumulator that the parse cell below appends each parse() result to.
dataframe = pandas.DataFrame()

In [23]:
# Deal kind and listing category for this run, and the listing URL built from them.
deal, tip = 'prodazha', 'kvartir'
LINK = f"{GLOBAL_PART}{deal}-{tip}{FIFTY_ON_PAGE_PART}"

In [24]:
# Number of result pages for LINK; page_counter() also prints the total listing count.
page_count = page_counter(LINK)

4536


In [25]:
# Gather the listing URLs not collected before, then remember them in all_links.
links = links_arr(LINK, page_count)
all_links.extend(links)

  0%|          | 0/92 [00:00<?, ?it/s]

 79%|#######9  | 73/92 [00:32<00:11,  1.72it/s]

In [None]:
# Parse the collected listings, append them to the session accumulator and
# write the result to a per-day SQLite file (table named after the category).
tmp_df = parse(links, deal, tip)
dataframe = pandas.concat([dataframe, tmp_df], axis=0)
conn = sqlite3.connect(f'DF_{date.today()}.db')
try:
    dataframe.to_sql(f'{tip}', conn, if_exists='replace')
finally:
    # Release the database file even when the write fails.
    conn.close()

  0%|          | 1/4049 [01:04<72:57:36, 64.89s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/1-komnatnaya-ul-novokuznechihinskaya-d-7-id2923228


  0%|          | 2/4049 [01:05<30:27:29, 27.09s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/1-komnatnaya-ul-proletarskaya-d-4-k1-id2916559


  0%|          | 3/4049 [01:05<16:44:58, 14.90s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/2-komnatnaya-per-bakinskiy-d-8-id2915985


  0%|          | 4/4049 [01:06<10:17:04,  9.15s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/1-komnatnaya-ul-burnakovskaya-d-83-id2923090


  0%|          | 5/4049 [01:06<6:44:23,  6.00s/it] 

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/3-komnatnaya-ul-vasileva-d-1-id2919189


  0%|          | 6/4049 [01:07<4:36:04,  4.10s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/kvartira-studiya-ul-yanki-kupaly-d-16a-id2922982


  0%|          | 7/4049 [01:07<3:13:18,  2.87s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/kvartira-studiya-ul-sputnika-d-40-id2920843


  0%|          | 8/4049 [01:07<9:32:19,  8.50s/it]

w2 PROBLEM: https://www.gipernn.ru/prodazha-kvartir/2-komnatnaya-ul-karla-marksa-d-16-id2909923





KeyboardInterrupt: 

In [None]:
# df = pandas.DataFrame()
# for deal in ['arenda', 'prodazha']:
#     for tip in ['kvartir', 'domov', 'ofisov', 'skladov', 'zdaniy', 'pomesheniy', 'torgovych-ploshadey']:
#         LINK = GLOBAL_PART + deal + '-' + tip + FIFTY_ON_PAGE_PART
#         print(deal + '-' + tip)
#         page_count = page_counter(LINK)
#         links = links_arr(LINK, page_count)
#         all_links += links

#         tmp_df = parse(links, deal, tip)
#         df = pandas.concat([df, tmp_df], axis=0)
#     conn = sqlite3.connect(f'{date.today()}.db')
#     df.to_sql(f'DF_{date.today()}_{tip}_{deal}', conn, if_exists='replace')
#     conn.close()