### 국토교통부_상업업무용 부동산 매매 신고 자료

In [1]:
# 데이터 프레임 관련
import numpy  as np
import pandas as pd

# XML 관련
import bs4
from os import name

# API 호출 관련
import json
import requests

# 기타
from tqdm import tqdm
pd.set_option('display.width',1000)

In [None]:
## getRTMSDataSvcNrgTrade : 상업업무용 부동산 매매 실거래가

## 각 컬럼 값 (참고 : https://www.data.go.kr/tcs/dss/selectApiDataDetailView.do?publicDataPk=15057267)
"""
resultCode : 결과코드
resultMsg : 결과메시지
Deal Amount : 거래금액
Building Area : 건물 전용/연면적
Building Use : 건물주용도
Build Year : 건축년도
Classification of Share Dealing : 지분거래 구분
Deal Year : 계약년도
Plottage : 대지권면적
Dong : 물건 소재 법정동
Sigungu : 물건 소재 시군구
Land Use : 물건 소재 토지 용도지역
Deal Month : 계약월
Building Type : 일반/집합 건물 구분
Deal Day : 계약일
Regional Code : 지역코드
Floor : 거래물건 층(집합건물)
""" 

### DB connect

In [None]:
# module
import psycopg2
import datetime
from tqdm import tqdm

# table
table = 'm1.rtms_trade'

# values 설정
val = '(' + ('%s,' * len(df.columns))[:-1] + ')'

# execute_mogrify
def execute_mogrify(conn, df, table, val):
    # Create a list of tuples from the dataframe values
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # SQL query to execute
    cursor = conn.cursor()
    values = [cursor.mogrify(val, tup).decode('utf8') for tup in
              tuples]
    query = "INSERT INTO %s(%s) VALUES " % (table, cols) + ",".join(values)
    try:
        cursor.execute(query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()

# insert
for j in tqdm(range(0, len(df), 10000)):

    # DB Connect
    conn = psycopg2.connect(
        host='redshift-cluster-1.ctvbwnnvbdkl.ap-northeast-2.redshift.amazonaws.com',
        port=5439,
        dbname='dev',
        user='awsuser',
        password='cremaoAdmin1234qwer!!'
    )

    # 10000개 단위로 나누기
    tmp = df[j:j + 10000]

    # insert
    execute_mogrify(conn, tmp, table, val)
    print(datetime.datetime.now(), ' : ', j)
    tmp = pd.DataFrame()

# row count
print(len(df))

1. 현재 데이터 수집

In [2]:
# API 호출
url        = 'http://openapi.molit.go.kr/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/getRTMSDataSvcNrgTrade'
serviceKey = 'C94PKnigawlH2iISVWz1Itw9tVzyuOCbOvKgzCrGOUZWvs3XqrkhH+/ntypJ7kK2P2g16vLYCJif3TNfO1i1KA=='
params = {
      'serviceKey' : serviceKey
    , 'LAWD_CD'  : '11110'  # 구 단위
    , 'DEAL_YMD' : '202207' # 날짜
}
response = requests.get(url, params=params)
content  = response.text

# item 태그 분리
xml_obj = bs4.BeautifulSoup(content,'lxml-xml')
rows    = xml_obj.findAll('item')

# ver 4 (한 줄씩 데이터 프레임으로 만들어서 concat)
# 리스트에 값 담기
row_list   = [] # 행 값
name_list  = [] # 열이름 값
value_list = [] # 데이터 값
DF = pd.DataFrame() # 최종 값

# 데이터 수집
for i in tqdm(range(0, len(rows))):
    columns = rows[i].find_all()
    
    for j in range(0,len(columns)):
        name_list.append(columns[j].name)
        value_list.append(columns[j].text)
    
    row_list = value_list
    df_each  = pd.DataFrame([row_list], columns=name_list)
    DF = pd.concat([DF, df_each])
    
    # 초기화
    name_list  = []
    value_list = []

# nan처리
df = df.replace(' ', np.nan)
df = df.drop(columns=['해제사유발생일','해제여부'])
df.columns = ['deal_amount', 'trade_type', 'building_area','building_use','build_year','deal_year','plottage','dong','sigungu','land_use','deal_month','building_type','deal_day','broker_location','regional_code','floor_']
df['build_year'] = df['build_year'].astype(str).str.split('.').str[0]

# CSV 파일로 저장
DF.to_excel('./RTMSDataSvcNrgTrade.xlsx', encoding='utf-8')

100%|█████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 547.70it/s]


2. 과거 데이터 수집

In [None]:
# 법정동코드 앞 5자리 추출
df = pd.read_csv('./법정동코드_앞5자리.csv').drop_duplicates().reset_index(drop=True)
df['법정동코드'] = df['법정동코드'].astype(str)

# 날짜 생성(2019.01 ~ 2022.07)
month_list = pd.date_range(start='2019.01', end='2022.07', freq='M').strftime('%Y%m')

# API 호출
url        = 'http://openapi.molit.go.kr/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/getRTMSDataSvcNrgTrade'
serviceKey = 'C94PKnigawlH2iISVWz1Itw9tVzyuOCbOvKgzCrGOUZWvs3XqrkhH+/ntypJ7kK2P2g16vLYCJif3TNfO1i1KA=='

row_list   = [] # 행 값
name_list  = [] # 열이름 값
value_list = [] # 데이터 값
DF = pd.DataFrame() # 최종 값

for month in tqdm(month_list):
    for code in df['법정동코드']:
        params = {
              'serviceKey' : serviceKey
            , 'LAWD_CD'  : code  # 구 단위
            , 'DEAL_YMD' : month # 월 단위
        }
        response = requests.get(url, params=params)
        content  = response.text
        
        # item 태그 분리
        xml_obj = bs4.BeautifulSoup(content,'lxml-xml')
        rows    = xml_obj.findAll('item')
        
        # 데이터 수집
        for i in range(0, len(rows)):
            columns = rows[i].find_all()
            
            for j in range(0,len(columns)):
                name_list.append(columns[j].name)
                value_list.append(columns[j].text)
            
            row_list = value_list
            df_each  = pd.DataFrame([row_list], columns=name_list)
            DF = pd.concat([DF, df_each])

            # 초기화
            name_list  = []
            value_list = []

# 공백 처리
DF = DF.replace(' ', None)
DF = DF.replace(np.nan, None)
DF.columns = ['deal_amount', 'trade_type', 'building_area','building_use','build_year','deal_year','plottage','dong','sigungu','land_use','deal_month','building_type','deal_day','broker_location','regional_code','unregister_day','unregister_status','floor_','share_dealing']
DF['build_year'] = DF['build_year'].astype(str).str.split('.').str[0]

# CSV 파일로 저장
DF.to_csv('./RTMSDataSvcNrgTrade.csv', encoding='utf-8', index=False)

 67%|███████████████████████████████████████████████████▎                         | 28/42 [2:05:21<2:15:22, 580.15s/it]

###### test

In [2]:
DF = pd.read_csv('./rtms_trade.csv')

  DF = pd.read_csv('./rtms_trade.csv')


In [10]:
DF['unregister_status'].value_counts()

O    19766
Name: unregister_status, dtype: int64

In [23]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315494 entries, 0 to 315493
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   deal_amount        315494 non-null  object 
 1   trade_type         62099 non-null   object 
 2   building_area      315494 non-null  float64
 3   building_use       315494 non-null  object 
 4   build_year         311427 non-null  float64
 5   deal_year          315494 non-null  int64  
 6   plottage           91281 non-null   float64
 7   dong               315494 non-null  object 
 8   sigungu            313436 non-null  object 
 9   land_use           315452 non-null  object 
 10  deal_month         315494 non-null  int64  
 11  building_type      315494 non-null  object 
 12  deal_day           315494 non-null  int64  
 13  broker_location    33480 non-null   object 
 14  regional_code      315494 non-null  int64  
 15  unregister_day     19766 non-null   object 
 16  un

In [12]:
# 공백 처리
DF = DF.replace(' ', None)
DF = DF.replace(np.nan, None)

In [18]:
DF[DF['해제여부'].notnull()]

Unnamed: 0,거래금액,거래유형,건물면적,건물주용도,건축년도,년,대지면적,법정동,시군구,용도지역,월,유형,일,중개사소재지,지역코드,해제사유발생일,해제여부,층,구분
79880,10000,,14.91,판매,2006.0,2020,,인의동,종로구,일반상업,2,집합,24,,11110,20.02.28,O,,
79881,10000,,10.08,판매,2006.0,2020,,인의동,종로구,일반상업,2,집합,24,,11110,20.02.28,O,,
79882,10000,,7.45,판매,2006.0,2020,,인의동,종로구,일반상업,2,집합,24,,11110,20.02.28,O,,
79944,220000,,461.82,제2종근린생활,1970.0,2020,74.4,충무로3가,중구,일반상업,2,일반,21,,11140,20.05.11,O,,
80066,19300,,16.53,제1종근린생활,2018.0,2020,,성수동2가,성동구,준공업,2,집합,22,,11200,20.03.17,O,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315193,40577,직거래,180.49,숙박,2001.0,2022,658,원동면 대리,양산시,생산관리,6,일반,22,,48330,22.06.30,O,,
315203,76000,중개거래,478.32,제2종근린생활,2015.0,2022,1270,칠원읍 용산리,함안군,계획관리,6,일반,8,경남 창원마산회원구,48730,22.07.28,O,,
315376,70000,중개거래,66.15,제1종근린생활,2015.0,2022,,노형동,제주시,일반상업,6,집합,23,제주 제주시,50110,22.07.19,O,,
315377,100000,중개거래,81.27,제1종근린생활,2015.0,2022,,노형동,제주시,일반상업,6,집합,23,제주 제주시,50110,22.07.19,O,,


In [19]:
DF.columns = ['deal_amount', 'trade_type', 'building_area','building_use','build_year','deal_year','plottage','dong','sigungu','land_use','deal_month','building_type','deal_day','broker_location','regional_code','unregister_day','unregister_status','floor_','share_dealing']

In [20]:
DF.to_csv('./RTMSDataSvcNrgTrade.csv', encoding='utf-8', index=False)

In [15]:
df = pd.read_csv('./getRTMSDataSvcNrgTrade(201901~202207).csv')
df

Unnamed: 0,거래금액,거래유형,건물면적,건물주용도,건축년도,년,대지면적,법정동,시군구,용도지역,월,유형,일,중개사소재지,지역코드,해제사유발생일,해제여부,층,구분
0,24000,,76.33,판매,1983.0,2019,,당주동,종로구,일반상업,1,집합,29,,11110,,,,
1,52749,,51.62,기타,2004.0,2019,,내수동,종로구,일반상업,1,집합,30,,11110,,,,
2,3650000,,2848.11,교육연구,1994.0,2019,1791.1,신문로2가,종로구,제1종일반주거,1,일반,25,,11110,,,,
3,38512,,39.67,숙박,2005.0,2019,,수송동,종로구,일반상업,1,집합,15,,11110,,,,
4,21000,,75.67,업무,1991.0,2019,,관훈동,종로구,일반상업,1,집합,23,,11110,,,13.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315489,22000,직거래,60.68,숙박,2020.0,2022,,안덕면 사계리,서귀포시,계획관리,6,집합,17,,50130,,,2.0,
315490,13570,직거래,12.13,숙박,2002.0,2022,,표선면 토산리,서귀포시,계획관리,6,집합,13,,50130,,,24.0,
315491,14280,직거래,12.13,숙박,2002.0,2022,,표선면 토산리,서귀포시,계획관리,6,집합,15,,50130,,,26.0,
315492,17850,직거래,12.13,숙박,2002.0,2022,,표선면 토산리,서귀포시,계획관리,6,집합,16,,50130,,,24.0,


In [None]:
df['구분'].value_counts()

지분    51
Name: 구분, dtype: int64

In [8]:
# 공백 처리
df = df.replace(' ', None)
df = df.replace(np.nan, None)
df = df.drop(columns=['해제사유발생일','해제여부'])
df.columns = ['deal_amount', 'trade_type', 'building_area','building_use','build_year','deal_year','plottage','dong','sigungu','land_use','deal_month','building_type','deal_day','broker_location','regional_code','floor_','classification_share_dealing']

ValueError: Length mismatch: Expected axis has 17 elements, new values have 16 elements