In [1]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sqlalchemy import create_engine
import json
import pymysql

In [2]:
def read_data(filename):
    """Load the API key and the (category, URL) pairs from a tab-separated file.

    The first non-blank line supplies the API key in its second column.
    Every following non-blank line supplies a category name in its first
    column and a URL template in its second column.

    Blank or whitespace-only lines are ignored, so a stray empty line in the
    file no longer aborts the load with an ``IndexError``.

    Note: the file is opened with the platform's default text encoding.

    Parameters
    ----------
    filename : str
        Path of the tab-separated configuration file.

    Returns
    -------
    tuple
        ``(api_key, category, urls)`` where ``category`` and ``urls`` are
        lists of equal length, in file order.

    Raises
    ------
    IndexError
        If the file has no usable line, or a non-blank line has fewer than
        two tab-separated fields.
    """
    with open(filename, 'r') as f:
        # Skip lines that carry no fields at all (empty / whitespace only).
        data = [line.split('\t') for line in f.read().splitlines() if line.strip()]

    api_key = data[0][1]
    entries = data[1:]
    category = [row[0] for row in entries]
    urls = [row[1] for row in entries]

    return api_key, category, urls

In [9]:
def make_frame(api_key, category, urls, startnumber, endnumber, timeout=30):
    """Download every facility feed and store the classified data as CSV files.

    Three files are written under ``./data`` (created if missing), all
    encoded as cp949 without header or index:

    * ``region.csv``   - static district code -> district name table.
    * ``facility.csv`` - static feature code -> facility name table.
    * ``info.csv``     - one row per record found in the downloaded feeds.

    ``info.csv`` is opened in append mode when it already exists, so calling
    this function again adds rows to the previous contents instead of
    replacing them.

    Parameters
    ----------
    api_key : str
        Key substituted for the ``(인증키)`` placeholder in each URL template.
    category : list of str
        Category name for each URL; ``category[i]`` belongs to ``urls[i]``.
    urls : list of str
        URL templates; ``"<startnumber>/<endnumber>"`` is appended to each.
    startnumber, endnumber
        Record range requested from every feed.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Raises
    ------
    requests.HTTPError
        If a feed answers with an HTTP error status.
    requests.Timeout
        If a feed does not answer within ``timeout`` seconds.
    """
    # A fresh checkout may not have the output directory yet.
    os.makedirs("./data", exist_ok=True)

    region = {'지구구분 코드':['GIGU001','GIGU002','GIGU003','GIGU004','GIGU005','GIGU006','GIGU007','GIGU009','GIGU010','GIGU011', 'GIGU012'],
         '이름':['잠실지구', '광나루지구', '뚝섬지구', '잠원지구', '반포지구', '이촌지구', '여의도지구', '양화지구', '난지지구', '망원지구','강서지구']}
    region_df = pd.DataFrame(region)[['지구구분 코드', '이름']]
    region_df.to_csv("./data/region.csv", mode='w', encoding='cp949', header=False, index=False)

    # NOTE(review): 'AB003' is the only code without the three-letter prefix - confirm it is not a typo.
    facilities = {'지형지물 코드':['ABB003', 'AB003', 'ABB100', 'ABB109', 'AAB104', 'ABB103', 'ABB101', 'ABB102', 'ABB201', 'ABB012', 'AAB670', 'ADA037', 'ABB112'],
                  '이름':['Tennis1', 'Tennis2', 'Soccer', 'Baseball', 'Gateball', 'Basketball', 'Volleyball', 'Badminton', 'WaterLeisure', 'Pool', 'InlineSkate1', 'InlineSkate2', 'Jokgu']
    }
    facility_df = pd.DataFrame(facilities)[['지형지물 코드', '이름']]
    facility_df.to_csv("./data/facility.csv", mode='w', encoding='cp949', header=False, index=False)

    # col_name[0] holds the category; col_name[1:] pair up positionally with the tags.
    col_name = ['종목명', '지형지물 코드', '고유번호', '지구구분 코드', '이름', '전화번호', '요금안내', '위도', '경도']
    info_columns = ['지구구분 코드','지형지물 코드', '종목명', '이름', '전화번호', '요금안내', '위도', '경도']

    for i, url_template in enumerate(urls):
        sport = category[i]
        url = url_template.replace("(인증키)", api_key) + str(startnumber) + '/' + str(endnumber)

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()          # fail early instead of parsing an error page
        soup = BeautifulSoup(response.text, 'html.parser')
        tag_list = preprocess_tag_list(get_tag_list(soup))

        # One tag is dropped from the normalised list; the Baseball feed drops
        # index 4, every other feed index 3 (presumably so the remaining tags
        # line up with col_name[1:] - verify against the feed layout).
        del tag_list[4 if sport == 'Baseball' else 3]

        # Number of records in this feed, measured on the first tag; every
        # column is padded up to this length. Loop-invariant, so computed once.
        row_count = len(soup.find_all(tag_list[0]))

        total_info = {col_name[0]: sport}
        for j, tag in enumerate(tag_list):
            # A single-space text is stored as the literal string 'NULL'.
            values = ['NULL' if node.text == ' ' else node.text
                      for node in soup.find_all(tag)]
            # Tags absent from some (or all) records yield fewer values: pad with 'NULL'.
            values.extend(['NULL'] * (row_count - len(values)))
            total_info[col_name[j+1]] = values

        info_df = pd.DataFrame(total_info)[info_columns]

        # Append to an existing file so that successive feeds accumulate in one CSV.
        write_mode = 'a' if os.path.isfile("./data/info.csv") else 'w'
        info_df.to_csv("./data/info.csv", mode=write_mode, encoding='cp949', header=False, index=False)

In [4]:
def get_tag_list(soup):                   # select the tags we need
    """Return the child tag names of the first ``row`` element, minus the ignored ones.

    Children whose ``name`` is ``None`` are skipped, as are the tags
    ``objectid``, ``h_org_code``, ``mge_nam``, ``pic`` and ``note``.
    The remaining names are returned in document order.

    Parameters
    ----------
    soup
        Parsed document exposing ``find('row')``; the result is iterated and
        each item's ``name`` attribute is read.

    Returns
    -------
    list
        Names of the kept child tags.
    """
    ignored = {'objectid', 'h_org_code', 'mge_nam', 'pic', 'note'}
    first_row = soup.find('row')

    return [
        child.name
        for child in first_row
        if child.name is not None and child.name not in ignored
    ]

In [5]:
def preprocess_tag_list(tag_list):             # unify the tag list
    """Normalise a tag list in place so that every feed exposes the same tags.

    Missing ``rmk``, ``tel`` and ``idn`` entries are inserted (in that order)
    at positions -2, -3 and 1 respectively. If the list then does not hold
    exactly nine entries, a placeholder named after the current fourth tag
    with a ``'2'`` suffix is inserted at index 3.

    Parameters
    ----------
    tag_list : list of str
        Tag names; the list is modified in place.

    Returns
    -------
    list of str
        The same list object that was passed in.
    """
    required = (('rmk', -2), ('tel', -3), ('idn', 1))
    for tag, position in required:
        if tag not in tag_list:
            tag_list.insert(position, tag)

    if len(tag_list) != 9:
        placeholder = tag_list[3] + '2'
        tag_list.insert(3, placeholder)

    return tag_list

In [6]:
# Load the API key and the parallel category / URL-template lists from the
# tab-separated configuration file (path is relative to the notebook's working directory).
api_key, category, urls = read_data('../url_data/info.txt')

In [7]:
# Record range requested from each feed; make_frame appends it to every URL
# as "<startnumber>/<endnumber>".
startnumber, endnumber = 1, 50

In [10]:
# Download every configured feed for the chosen record range and write
# region.csv, facility.csv and info.csv under ./data.
# Note: info.csv is appended to when it already exists, so re-running this cell adds rows again.
make_frame(api_key, category, urls, startnumber, endnumber)