In [1]:
import re
import warnings
warnings.filterwarnings('ignore')

import os
import json
import time
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

In [2]:
data_dir = "../data"
train_data = pd.read_csv(f"{data_dir}/new/new_train_ver3.csv")
test_data = pd.read_csv(f"{data_dir}/new/new_test_ver3.csv")

In [3]:
train_data['is_test'] = 0
test_data['is_test'] = 1

data = pd.concat([train_data, test_data])
data['is_test'].value_counts() 

0    1118822
1       9272
Name: is_test, dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1128094 entries, 0 to 9271
Data columns (total 35 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   본번                      1128094 non-null  float64
 1   부번                      1128094 non-null  float64
 2   아파트명                    1128094 non-null  object 
 3   전용면적                    1128094 non-null  float64
 4   층                       1128094 non-null  int64  
 5   건축년도                    1128094 non-null  int64  
 6   도로명                     1128094 non-null  object 
 7   부동산유형                   1128094 non-null  object 
 8   분양형태                    1128094 non-null  object 
 9   k-관리방식                  1128094 non-null  object 
 10  k-복도유형                  1128094 non-null  object 
 11  k-난방방식                  1128094 non-null  object 
 12  k-전체동수                  250887 non-null   float64
 13  k-전체세대수                 251969 non-null   float64
 14  k-건설사

In [5]:
# '주소1' 컬럼 생성
data['주소1'] = data.apply(lambda row: f"{row['구']} {row['동']} {int(row['본번'])}-{int(row['부번'])}" if row['부번'] != 0 else f"{row['구']} {row['동']} {int(row['본번'])}", axis=1)

# '주소2' 컬럼 생성
data['주소2'] = data.apply(lambda row: f"{row['구']} {row['도로명']}", axis=1)

# 1.부동산 이름 찾기

In [6]:
# address_type = '주소1'
# unique_address = data[address_type].unique()
# print(len(unique_address))

In [7]:
# url = "https://land.seoul.go.kr/land/wskras/generalInfo.do"
# browser = webdriver.Chrome()
# browser.get(url)

# # 자치구 선택 요소 찾기
# select_element_gu = browser.find_element(By.ID, 'selSgg')
# select_gu = Select(select_element_gu)

# # 자치구의 이름과 value를 저장할 딕셔너리
# gu_dict = {}
# for option in select_gu.options:
#     gu_name = option.text
#     if gu_name != "자치구 선택":
#         value = option.get_attribute('value')
#         gu_dict[gu_name] = value

# # 자치구 별로 동의 옵션 값 가져오기
# gd_dict = {}
# for gu_name, gu_value in gu_dict.items():
#     # 자치구 선택
#     select_gu.select_by_value(gu_value)
#     time.sleep(1)  # 페이지 업데이트를 기다림
    
#     # 동 선택 요소 찾기
#     select_element_dong = browser.find_element(By.ID, 'selBjdong')
#     select_dong = Select(select_element_dong)
    
#     dong_dict = {}
#     for option in select_dong.options:
#         dong_name = option.text
#         if dong_name != "동 선택":
#             value = option.get_attribute('value')
#             dong_dict[dong_name] = value
#     gd_dict[gu_name] = dong_dict

# browser.quit()

# with open(f'{data_dir}/gu_dict.json', 'w', encoding='utf-8') as f:
#     json.dump(gu_dict, f, ensure_ascii=False, indent=4)

# with open(f'{data_dir}/gu_dong_dict.json', 'w', encoding='utf-8') as f:
#     json.dump(gd_dict, f, ensure_ascii=False, indent=4)

In [8]:
# with open(f'{data_dir}/gu_dict.json', 'r', encoding='utf-8') as f:
#     gu_dict = json.load(f)

# with open(f'{data_dir}/gu_dong_dict.json', 'r', encoding='utf-8') as f:
#     gu_dong_dict = json.load(f)

# print(gu_dict)
# print(gu_dong_dict)

In [9]:
address_type = '주소1'
unique_address = data[address_type].unique()
print(len(unique_address))

8956


In [10]:
def extract_info(text):
    # Regular expression to match the pattern
    pattern = re.compile(r'(\d+)세대.*총(\d+)개동')
    
    # Find matches
    matches = pattern.findall(text)
    
    if matches:
        # Since we are looking for the first match
        households, buildings = matches[0]
        return int(households), int(buildings)
    return 0, 0

In [11]:
def search_by_title(sub_url):
    sub_browser = webdriver.Chrome()
    sub_browser.get(sub_url)
    time.sleep(2)

    summary_info = sub_browser.find_element(By.ID, "summaryInfo")
    build_name = summary_info.find_element(By.ID, "complexTitle").text
    build_type = summary_info.find_element(By.CLASS_NAME, "label--category").text

    tables = sub_browser.find_elements(By.CLASS_NAME, "info_table_wrap")
    if len(tables) != 2:
        tables = sub_browser.find_elements(By.CLASS_NAME, "info_table_wrap")

    table1_data = tables[0].find_elements(By.CLASS_NAME, "table_td")
    table2_data = tables[1].find_elements(By.CLASS_NAME, "table_td")

    total_householdes, total_buildings = extract_info(table1_data[0].text)
    total_parkings = table1_data[3].text.split('(')[0][:-1]
    construct = table1_data[6].text
    heat_type = table1_data[7].text
    corr_type = table2_data[3].text

    pyeong_tab_list = sub_browser.find_element(By.ID, "complex_pyeong_tab_list")
    anchors = pyeong_tab_list.find_elements(By.TAG_NAME, "a")
    excl_areas = []
    for anchor in anchors:
        excl_area = anchor.get_attribute("title").split(" ")[-1][:-1]
        excl_areas.append(float(excl_area))

    sub_browser.quit()
    return [
        build_name, 
        build_type, 
        total_buildings, 
        total_householdes, 
        total_parkings, 
        construct, 
        heat_type, 
        corr_type, 
        excl_areas
    ]

In [12]:
def search_by_name(name_query):
    ## 네이버 부동산
    sub_url = "https://new.land.naver.com/complexes?ms=37.476763,127.05721,17&a=APT:ABYG:JGC:PRE&e=RETAIL"
    sub_browser = webdriver.Chrome()
    sub_browser.get(sub_url)
    time.sleep(1.5)

    ## 검색어 입력
    inp = sub_browser.find_element(By.ID, "land_search")
    inp.send_keys(name_query)
    inp.send_keys(Keys.RETURN)
    time.sleep(1.5)

    ## 부동산 정보 구역
    if sub_browser.find_elements(By.ID, "summaryInfo"):
        summary_info = sub_browser.find_element(By.ID, "summaryInfo")
        build_name = summary_info.find_element(By.ID, "complexTitle").text
        build_type = summary_info.find_element(By.CLASS_NAME, "label--category").text

        btn_box = summary_info.find_element(By.CLASS_NAME, "complex_detail_link")
        btns = btn_box.find_elements(By.TAG_NAME, "button")
        btns[0].click()
        time.sleep(2)

        tables = sub_browser.find_elements(By.CLASS_NAME, "info_table_wrap")
        if len(tables) != 2:
            tables = sub_browser.find_elements(By.CLASS_NAME, "info_table_wrap")
            
        table1_data = tables[0].find_elements(By.CLASS_NAME, "table_td")
        table2_data = tables[1].find_elements(By.CLASS_NAME, "table_td")

        total_householdes, total_buildings = extract_info(table1_data[0].text)
        total_parkings = table1_data[3].text.split('(')[0][:-1]
        construct = table1_data[6].text
        heat_type = table1_data[7].text
        corr_type = table2_data[3].text

        pyeong_tab_list = sub_browser.find_element(By.ID, "complex_pyeong_tab_list")
        anchors = pyeong_tab_list.find_elements(By.TAG_NAME, "a")
        excl_areas = []
        for anchor in anchors:
            excl_area = anchor.get_attribute("title").split(" ")[-1][:-1]
            excl_areas.append(float(excl_area))

    else:
        return None


    return [
        build_name, 
        build_type, 
        total_buildings, 
        total_householdes, 
        total_parkings, 
        construct, 
        heat_type, 
        corr_type, 
        excl_areas
    ]

In [13]:
no_results = []

In [28]:
import random
# random.shuffle(unique_address)

url = "https://www.naver.com"
browser = webdriver.Chrome()
browser.get(url)

apt_name_filter = {
    "월드메르디앙B" : "월드메르디앙",
    "탑건위너스빌" : "탑건위너빌",
    "삼성동대유리채" : "삼상동유리채아파트",
    "임탑-B" : "임탑B",
    "개포자이" : "개포자이아파트",
    "개포주공5단지" : "개포주공5단지아파트",
    "이태원" : "이태원아파트",
    "뉴서울" : "뉴서울아파트"
}

for idx, address in enumerate(unique_address[(2251+2926+237+248+37+22+1282):]):
    ## "관악구 봉천동 7-312" 
    ## "동대문구 장안동 365-4"
    ## "강남구 수서동 738"
    ## "금천구 독산동 1006-132"
    ## "서초구 서초동 1633-1"
    ## "금천구 독산동 1162"
    ## "중구 장충동1가 104"
    ## "구로구 구로동 806-9"
    ## "중구 신당동 313-8"
    ## "서대문구 창천동 416"
    ## "강남구 청담동 104-13"
    # address = 

    parts = address.split()
    gu_name = parts[0].strip()
    dong_name = parts[1].strip()
    bonbun_bubun = parts[2].strip()
    
    apt_exc_areas = data.loc[data['주소1'] == address, '전용면적'].unique()
    apt_name = data.loc[data['주소1'] == address, '아파트명'].values[0]
    if apt_name in apt_name_filter:
        apt_name = apt_name_filter[apt_name]

    print(f"\n{idx:>08}")
    print(f"[원본] : 서울특별시 {gu_name} {dong_name} {bonbun_bubun} {apt_name}")

    if dong_name == "신사동" and bonbun_bubun in ["277-2", "277-19"]:
        continue

    ### Input Box
    if idx == 0:
        search_box = browser.find_element(By.CLASS_NAME, 'search_input_box')
    else:
        search_box = browser.find_element(By.CLASS_NAME, 'greenbox')
    search_inp = search_box.find_element(By.TAG_NAME, 'input')
    if idx != 0: 
        search_inp.clear()

    ### Input Query
    if apt_name != 'unknown':
        query = f"{dong_name} {bonbun_bubun} {apt_name}"
    else:
        query = f"{dong_name} {bonbun_bubun}"

    search_inp.send_keys(query)
    search_inp.send_keys(Keys.RETURN)
    time.sleep(1.5)

    ## 부동산이 바로 검색되었을 때.
    place_type = None
    search_result = False
    if browser.find_elements(By.CLASS_NAME, "_au_nland_apart_single"):
        search_result = True
        print(f"OPT 1 - Search Result : {search_result}")

        title =  browser.find_element(By.CLASS_NAME, "title_area")
        strong = title.find_element(By.CLASS_NAME, "name")
        anchor = strong.find_element(By.TAG_NAME, "a")
        place_url = anchor.get_attribute("href")
        place_name = anchor.text

        search_data = search_by_title(place_url)

    ## 부동산 리스트로 검색되었을 때
    elif search_result == False and browser.find_elements(By.CLASS_NAME, "_au_nland_apart_list"):
        search_result = True
        print(f"OPT 2 - Search Result : {search_result}")

        place_list = browser.find_element(By.CLASS_NAME, "realty_list")
        place_items = place_list.find_elements(By.CLASS_NAME, "realty_area")
        for item in place_items:
            info_box = item.find_element(By.CLASS_NAME, "info_box")
            anchor = info_box.find_element(By.TAG_NAME, 'a')
            place_name = anchor.get_attribute("title")
            place_url = anchor.get_attribute("href")

            search_data = search_by_title(place_url)
            if set(apt_exc_areas).issubset(set(search_data[-1])):
                break

    ## 이 주소의 장소
    elif search_result == False and browser.find_elements(By.CLASS_NAME, "uOjIX"):
        section = browser.find_element(By.CLASS_NAME, "uOjIX")
        if browser.find_elements(By.CLASS_NAME, "dmLin"):
            search_result = True
            print(f"OPT 3 - Search Result : {search_result}")
            place_name = browser.find_element(By.CLASS_NAME, "dmLin").text

            if "동" in place_name:
                query = f"{place_name}"
            else:
                query = f"{dong_name} {place_name}"
            print(query)
            search_data = search_by_name(query)

            if search_data is None and apt_name != "unknown":
                search_data = search_by_name(f"{dong_name} {apt_name}")

            if search_data is None:
                print(f"OPT 3 - No Result")
                no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
                continue
        else:
            print(f"OPT 3 - No Result")
            no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
            continue

    ## 부동산이 아닌 해당 주소에 대한 정보 카드가 나왔을 때
    elif search_result == False and browser.find_elements(By.CLASS_NAME, "Lhcly"):
        place_list = browser.find_element(By.CLASS_NAME, "Lhcly")
        place_items = place_list.find_elements(By.TAG_NAME, "li")
        for item in place_items:
            place_type = item.find_element(By.CLASS_NAME, "JP8Ar").text
            if place_type == "아파트" or place_type == "주택" or place_type == "오피스텔":
                search_result = True
                print(f"OPT 4 -Search Result : {search_result}")

                anchor = item.find_element(By.CLASS_NAME, "place_bluelink")
                place_name = anchor.find_element(By.TAG_NAME, "span").text
                place_url = anchor.get_attribute("href")
        
                if "동" in place_name:
                    query = f"{place_name}"
                else:
                    query = f"{dong_name} {place_name}"
                print(query)
                search_data = search_by_name(query)

                if search_data is None and apt_name != "unknown":
                    search_data = search_by_name(f"{dong_name} {apt_name}")

                if not search_data is None and len(apt_exc_areas) > 0 and len(search_data[-1]) > 0:
                    if set(apt_exc_areas).issubset(set(search_data[-1])):
                        break
                else:
                    print(f"OPT 4 - No Result")
                    no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
        else:
            no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
            continue

    ## 플레이스
    elif search_result == False and browser.find_elements(By.ID, "loc-main-section-root"):
        if browser.find_elements(By.CLASS_NAME, "R1Lzz"):
            place_list = browser.find_element(By.CLASS_NAME, "R1Lzz")
            place_items = place_list.find_elements(By.TAG_NAME, "li")
            for item in place_items:
                place_type = item.find_element(By.CLASS_NAME, "YzBgS").text
                if place_type == "아파트" or place_type == "주택" or place_type == "오피스텔":
                    search_result = True
                    print(f"OPT 5 - Search Result : {search_result}")

                    apcpt = item.find_element(By.CLASS_NAME, "place_bluelink")
                    place_name = apcpt.find_element(By.CLASS_NAME, "YwYLL").text

                    anchor_box = browser.find_element(By.CLASS_NAME, "ouxiq")
                    anchor = anchor_box.find_element(By.CLASS_NAME, "P7gyV")
                    place_url = anchor.get_attribute("href")

                    if "동" in place_name:
                        query = f"{place_name}"
                    else:
                        query = f"{dong_name} {place_name}"
                    print(query)
                    search_data = search_by_name(query)

                    if search_data is None and apt_name != "unknown":
                        search_data = search_by_name(f"{dong_name} {apt_name}")

                    if not search_data is None and len(apt_exc_areas) > 0 and len(search_data[-1]) > 0:
                        if set(apt_exc_areas).issubset(set(search_data[-1])):
                            break
                    else:
                        print(f"OPT 5 - No Result")
                        no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
            else:
                no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
                continue

        elif search_result == False and browser.find_elements(By.CLASS_NAME, "XJdTz"):
            place_list = browser.find_element(By.CLASS_NAME, "XJdTz")
            place_items = place_list.find_elements(By.CLASS_NAME, "IgAxB")
            for item in place_items:
                place_type = item.find_element(By.CLASS_NAME, "zAeFj").text
                if place_type == "아파트" or place_type == "주택" or place_type == "오피스텔":
                    search_result = True
                    print(f"OPT 5 - Search Result : {search_result}")

                    apcpt = item.find_element(By.CLASS_NAME, "place_bluelink")
                    place_name = apcpt.find_element(By.TAG_NAME, "span").text

                    if "동" in place_name:
                        query = f"{place_name}"
                    else:
                        query = f"{dong_name} {place_name}"
                    print(query)
                    search_data = search_by_name(query)

                    if search_data is None and apt_name != "unknown":
                        search_data = search_by_name(f"{dong_name} {apt_name}")

                    if not search_data is None and len(apt_exc_areas) > 0 and len(search_data[-1]) > 0:
                        if set(apt_exc_areas).issubset(set(search_data[-1])):
                            break
                    else:
                        no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
                        print(f"OPT 5 - No Result")
            else:
                no_results.append(f"{dong_name} {bonbun_bubun} {apt_name}")
                continue

    else:
        print(f"No Result")
        continue
    

    if "(주상복합)" in search_data[0]:
        search_data[0] = search_data[0].split('(')[0]
        search_data[1] = "주상복합"
    elif not place_type is None:
        search_data[1] = place_type

    print(f" -아파트명 : {search_data[0]}")
    print(f" -부동산유형 : {search_data[1]}")
    print(f" -전체동수 : {search_data[2]}")
    print(f" -전체세대수 : {search_data[3]}")
    print(f" -전체주차수 : {search_data[4]}")
    print(f" -건설사 : {search_data[5]}")
    print(f" -난방방식 : {search_data[6]}")
    print(f" -복도유형 : {search_data[7]}")
    print(f" -전용면적 : {search_data[8]}")

    data.loc[data[address_type] == address, '아파트명'] = search_data[0]
    data.loc[data[address_type] == address, '부동산유형'] = search_data[1]
    data.loc[data[address_type] == address, 'k-전체동수'] = search_data[2]
    data.loc[data[address_type] == address, 'k-전체세대수'] = search_data[3]
    data.loc[data[address_type] == address, '주차대수'] = search_data[4]
    data.loc[data[address_type] == address, 'k-건설사'] = search_data[5]
    data.loc[data[address_type] == address, 'k-난방방식'] = search_data[6]
    data.loc[data[address_type] == address, 'k-복도유형'] = search_data[7]


00000000
[원본] : 서울특별시 강남구 역삼동 783-30 공간쉐르빌
OPT 3 - Search Result : True
역삼동 공간쉐르빌아파트
 -아파트명 : 공간쉐르빌
 -부동산유형 : 아파트
 -전체동수 : 1
 -전체세대수 : 17
 -전체주차수 : 15
 -건설사 : 두성종합건설(주)
 -난방방식 : 개별난방, 도시가스
 -복도유형 : 계단식
 -전용면적 : [41.61, 43.27, 48.95, 71.68, 72.84, 75.45, 76.26, 78.08, 79.48]

00000001
[원본] : 서울특별시 강남구 역삼동 787-4 리츠빌
OPT 3 - Search Result : True
역삼동 리츠빌아파트
OPT 3 - No Result

00000002
[원본] : 서울특별시 강남구 역삼동 729-37 명인갤러리
OPT 3 - Search Result : True
역삼동 명인갤러리
OPT 3 - No Result

00000003
[원본] : 서울특별시 강남구 역삼동 761-3 상지리츠빌역삼2차
OPT 3 - Search Result : True
역삼동 상지리츠빌 역삼2차
 -아파트명 : 상지리츠빌역삼2차
 -부동산유형 : 아파트
 -전체동수 : 1
 -전체세대수 : 19
 -전체주차수 : 39
 -건설사 : 상지건영(주)
 -난방방식 : 개별난방, 도시가스
 -복도유형 : 계단식
 -전용면적 : [132.02, 141.53, 156.33, 162.0, 165.06, 174.83]

00000004
[원본] : 서울특별시 강남구 역삼동 727-11 프리마빌
OPT 3 - Search Result : True
역삼동 역삼프리마빌
 -아파트명 : 역삼프리마빌
 -부동산유형 : 아파트
 -전체동수 : 1
 -전체세대수 : 19
 -전체주차수 : 29
 -건설사 : 백두건설산업(주)
 -난방방식 : 개별난방, 도시가스
 -복도유형 : 계단식
 -전용면적 : [82.33, 92.35, 102.79, 106.13, 124.8, 130.7, 13

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1128094 entries, 0 to 9271
Data columns (total 37 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   본번                      1128094 non-null  float64
 1   부번                      1128094 non-null  float64
 2   아파트명                    1128094 non-null  object 
 3   전용면적                    1128094 non-null  float64
 4   층                       1128094 non-null  int64  
 5   건축년도                    1128094 non-null  int64  
 6   도로명                     1128094 non-null  object 
 7   부동산유형                   1128094 non-null  object 
 8   분양형태                    1128094 non-null  object 
 9   k-관리방식                  1128094 non-null  object 
 10  k-복도유형                  1128094 non-null  object 
 11  k-난방방식                  1128094 non-null  object 
 12  k-전체동수                  916825 non-null   float64
 13  k-전체세대수                 916825 non-null   float64
 14  k-건설사

In [30]:
print(len(no_results))

2869


In [31]:
train_df = data[data['is_test'] == 0]
test_df = data[data['is_test'] == 1]
print(train_df.shape, test_df.shape)

train_df = train_df.drop(columns=['is_test'])
test_df = test_df.drop(columns=['is_test', 'target'])

train_df.to_csv(f"{data_dir}/new/new_train_ver4.csv", index=False)
test_df.to_csv(f"{data_dir}/new/new_test_ver4.csv", index=False)

(1118822, 37) (9272, 37)


In [32]:
with open("./no_searched.txt", "w") as f:
    for nr in set(no_results):
        f.write(f"{nr}\n")