In [1]:
import numpy as np
import pandas as pd
import cx_Oracle

In [2]:
# 월별 테이블
mon = pd.period_range('2012-08-01', '2012-10-31', freq='M')
# mon = pd.period_range('2012-08-01', '2020-07-31', freq='M')
mon_table_raw = list(mon.astype(str))
mon_table = []
for string in mon_table_raw:
    new_string = string.replace("-", "")
    mon_table.append(new_string)
print(mon_table)

['201208', '201209', '201210']


In [3]:
# 연별 테이블
year = pd.period_range('2012-08-01', '2014-07-31', freq='Y')
# year = pd.period_range('2012-08-01', '2020-07-31', freq='M')
year_table = list(year.astype(str))
print(year_table)

['2012', '2013', '2014']


In [4]:
def get_data_from_db(query):
    conn = cx_Oracle.connect('hoseo', 'hoseo', 'localhost:1521/xe')
    df = pd.read_sql(query, conn)
    conn.close()
    return df

### 월별 지역별 매매건수

In [5]:
# 월별 지역별 매매건수
def make_region_deal_count():
    
    for yearmon in mon_table:
        query = f"""
            SELECT region, COUNT(*) AS "{yearmon}"
            FROM apt_deal_price
            WHERE TO_CHAR(contract_date, 'YYYYMM') = '{yearmon}' 
            GROUP BY region 
            ORDER BY region ASC
            """
        queD = get_data_from_db(query)

        if yearmon == '201208':
            dataD = queD.copy()
        else:
            dataD = dataD.merge(queD)
    dataD.info()
    dataD.to_csv('apt-data-cache/deal_count_monthly.csv', index=False, encoding='utf-8-sig')  

In [6]:
make_region_deal_count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   REGION  17 non-null     object
 1   201208  17 non-null     int64 
 2   201209  17 non-null     int64 
 3   201210  17 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 680.0+ bytes


In [7]:
testD = pd.read_csv('apt-data-cache/deal_count_monthly.csv')
testD

Unnamed: 0,REGION,201208,201209,201210
0,강원도,933,1191,1495
1,경기도,5631,8083,10645
2,경상남도,1623,2113,3168
3,경상북도,1781,2087,2701
4,광주광역시,1204,1483,2012
5,대구광역시,2515,3259,4283
6,대전광역시,763,1140,1582
7,부산광역시,1725,2235,3498
8,서울특별시,2222,3388,4918
9,세종특별자치시,84,112,151


### test 중

### 월별 지역별 평형별 매매

In [8]:
# 월별 지역별 매매건수
def make_deal_count_monthly_region_sizecat():
    
    for yearmon in mon_table:
        query = f"""
            SELECT region AS 지역, apt_size_cat AS 평형, count(*) AS 건수
            FROM 
                (
                    SELECT region,
                       CASE
                           WHEN  apt_size < 50 THEN '국임'
                           WHEN  apt_size >= 50 AND apt_size <= 60 THEN '초소형'
                           WHEN  apt_size > 60 AND apt_size <= 85 THEN '소형'
                           WHEN  apt_size > 85 AND apt_size <= 100 THEN '중소형'
                           WHEN  apt_size > 100 AND apt_size <= 135 THEN '중대형'
                           WHEN  apt_size > 135 THEN '대형'
                       ELSE '초대형'       
                       END AS apt_size_cat
                    FROM APT_DEAL_PRICE
                 )
             GROUP BY region, apt_size_cat
             ORDER BY region ASC, apt_size_cat DESC
             """
        queD = get_data_from_db(query)

        if yearmon == '201208':
            dataD = queD.copy()
        else:
            dataD = dataD.merge(queD)
    dataD.info()
    dataD.to_csv('apt-data-cache/deal_count_monthly_region_sizecat.csv', index=False, encoding='utf-8-sig')

In [9]:
make_deal_count_monthly_region_sizecat()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 101
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   지역      102 non-null    object
 1   평형      102 non-null    object
 2   건수      102 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ KB


In [88]:
dataD = get_data_from_db(query)
dataD.info()
dataD.head(12)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   지역      102 non-null    object
 1   평형      102 non-null    object
 2   건수      102 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


Unnamed: 0,지역,평형,건수
0,강원도,초소형,45311
1,강원도,중소형,1674
2,강원도,중대형,8086
3,강원도,소형,50540
4,강원도,대형,2181
5,강원도,국임,34561
6,경기도,초소형,349483
7,경기도,중소형,24901
8,경기도,중대형,169442
9,경기도,소형,584984


### 주택 규모별 매매건수

In [49]:
# 국민임대주택 규모: 전용면적 50m²(15평) 미만, 무주택 세대주 월평균 소득 50% 이하
#                   전용면적 50m²(15평)이상∼60m²(18평) 이하, 무주택 세대주 월평균 소득 70% 이하
# 국민주택 규모: 전용면적 85m²(25.7평) 이하
#               전용면적 100m²(30.2평) 이하, 수도권을 제외한 도시지역이 아닌 읍·면 지역
#               전용면적 100m²(30.2평)초과~135m²(40.8평)이하
#               전용면적 135m²(40.8평)초과

# 한국감정원 기중 아파트 규모 분류
# ~ 60m² 초소형
# 60m² ~ 85m² 소형
# 85m² ~ 100m² 중소형
# 100m² ~ 135m² 중대형
# 135m² ~ 대형

In [50]:
# 지역 정보
region_table = [
    '강원도', '경기도', '경상남도', '경상북도', '광주광역시', '대구광역시',
    '대전광역시', '부산광역시', '서울특별시', '세종특별자치시', '울산광역시',
    '인천광역시', '전라남도', '전라북도', '제주특별자치도', '충청남도', '충청북도']

In [51]:
size_conditions = [
    ('국임', 'apt_size < 50'),
    ('초소형', 'apt_size >= 50 AND apt_size <= 60'),
    ('소형', 'apt_size > 60 AND apt_size <= 85'),
    ('중소형', 'apt_size > 85 AND apt_size <= 100'),
    ('중대형', 'apt_size > 100 AND apt_size <= 135'),
    ('대형', 'apt_size > 135')]

col_apt_size_data = [
    'region', 'apt_size', 'y2012', 'y2013', 'y2014',
    'y2015', 'y2016', 'y2017', 'y2018', 'y2019', 'y2020']    

In [55]:
# 지역내 사이즈별 연간 매매건수
def make_region_size_count(year = 2012):
    for cond in size_conditions:

        query = f"""
            SELECT region, COUNT(apt_size) AS "{cond[0]}"
            FROM apt_deal_price
            WHERE to_char(contract_date, 'YYYY') = '{year}' AND {cond[1]}
            GROUP BY region
            ORDER BY region ASC
            """

        queD = get_data_from_db(query)

        if cond[0] == '국임':
            dataD = queD.copy()
        else:
            dataD = dataD.merge(queD, how='left')
        
    dataD.fillna(0, inplace=True)
    dataD = dataD.astype( \
        {'국임':'int64', '초소형':'int64', '소형':'int64',
        '중소형':'int64', '중대형':'int64', '대형':'int64'})
    
    # dataD.info()

    dataD.set_index('REGION', inplace=True)
    data_list = dataD.values.reshape(len(region_table) * len(size_conditions), )

    return data_list


In [56]:
# 년도별 지역별 사이즈별 매매건수 데이터 만들기
def complete_region_size_count():

    # 최종 데이터 테이블 형태 만들기
    row_count = len(region_table) * len(size_conditions)
    col_count = len(col_apt_size_data)
    allD = pd.DataFrame(np.zeros((row_count, col_count), dtype='int64'), columns=col_apt_size_data)

    size_list = [ x[0] for x in size_conditions ]
    allD['apt_size'] = size_list * len(region_table)

    region_list = []
    for r in region_table:
        region_list.extend([r] * len(size_conditions))
    allD['region'] = region_list

    # 년도별 데이터 수집
    for year in year_table:
        data_list = make_region_size_count(year)
        allD['y' + year] = data_list
        
    allD.info()
    allD.to_csv('apt-data-cache/deal_count_region_size_yearly.csv', index=False, encoding='utf-8-sig')

In [57]:
complete_region_size_count()

KeyboardInterrupt: 

In [58]:
testD = pd.read_csv('apt-data-cache/deal_count_region_size_yearly.csv')
testD.head(12)

Unnamed: 0,region,apt_size,y2012,y2013,y2014,y2015,y2016,y2017,y2018,y2019,y2020
0,강원도,국임,1745,5230,5375,0,0,0,0,0,0
1,강원도,초소형,2164,5709,6467,0,0,0,0,0,0
2,강원도,소형,2063,5477,6316,0,0,0,0,0,0
3,강원도,중소형,75,194,218,0,0,0,0,0,0
4,강원도,중대형,432,894,992,0,0,0,0,0,0
5,강원도,대형,151,326,239,0,0,0,0,0,0
6,경기도,국임,5283,17442,23048,0,0,0,0,0,0
7,경기도,초소형,11330,40630,46031,0,0,0,0,0,0
8,경기도,소형,17987,63772,73604,0,0,0,0,0,0
9,경기도,중소형,638,2393,3079,0,0,0,0,0,0


## 아래 내용은 작업중..

In [19]:
# 월별 지역별 사이즈별 매매건수 데이터 수집
def make_region_size_count_month(ym = 201208):
    for cond in size_conditions:

        query = f"""
            SELECT region AS region, COUNT(NVL(apt_size, 0)) AS "{cond[0]}"
            FROM apt_deal_price
            WHERE to_char(contract_date, 'YYYYMM') = '{ym}' AND {cond[1]}
            GROUP BY region
            ORDER BY region ASC
            """

        queD = get_data_from_db(query)

        if cond[0] == 'under_50':
            dataD = queD.copy()
        else:
            dataD = dataD.merge(queD, how='left')
    
    # dataD empty 처리

    dataD.fillna(0, inplace=True)
    dataD = dataD.astype( \
        {'under_50':'int64', '50_60':'int64', '60_85':'int64',
        '85_100':'int64', '100_135':'int64', '135_over':'int64'})
    
    # dataD.info()

    dataD.set_index('REGION', inplace=True)
    data_list = dataD.values.reshape(len(region_table) * len(size_conditions), )

    return data_list

In [20]:
# 월별 지역별 사이즈별 매매건수 데이터 만들기
def complete_region_size_count():
    yyyymm = get_yyyymm()
    col_ym = [ 'ym' + y for y in yyyymm ]
    col_apt_size_month = ['region', 'apt_size']
    col_apt_size_month += col_ym

    # 최종 데이터 테이블 형태 만들기
    row_count = len(region_table) * len(size_conditions)
    col_count = len(col_apt_size_month)
    allD = pd.DataFrame(np.zeros((row_count, col_count), dtype='int64'), columns=col_apt_size_month)

    size_list = [ x[0] for x in size_conditions ]
    allD['apt_size'] = size_list * len(region_table)

    region_list = []
    for r in region_table:
        region_list.extend([r] * len(size_conditions))
    allD['region'] = region_list

    # 년도별 데이터 수집
    for ym in yyyymm:
        data_list = make_region_size_count(ym)
        allD['ym' + ym] = data_list
        
    allD.info()
    allD.to_csv('sub-data-files/region_size_deal_count_month.csv', index=False, encoding='utf-8-sig')