In [1]:
import json
import time

import requests

# Crawl every result page of the restaurant listing API and dump the combined
# records to restaurants.json.

restaurants = []  # accumulated restaurant records from all pages

# Total number of pages to request.
total_pages = 576
url_template = "https://www.bluer.co.kr/api/v1/restaurants?page={page}&size=30&query=&foodType=&foodTypeDetail=&feature=&location=&locationDetail=&area=&areaDetail=&priceRange=&ribbonType=&recommended=false&isSearchName=false&tabMode=single&searchMode=ribbonType&zone1=&zone2=&zone2Lat=&zone2Lng="

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/hal+json",
    "x-requested-with": "XMLHttpRequest"
}

# (connect, read) timeouts in seconds. Without a timeout a stalled connection
# would block the whole crawl forever.
request_timeout = (5, 30)

# Zero-based indices of pages that could not be collected, so they can be
# retried later instead of silently going missing.
failed_pages = []

# Run the crawl.
# NOTE(review): pages are requested as 0 .. total_pages - 1 and reported as
# page + 1, i.e. the API's `page` parameter is presumably 0-based — confirm.
for page in range(total_pages):
    url = url_template.format(page=page)
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code == 200:
            data = response.json()
            # A page without results may omit "_embedded"; treat that as an
            # empty page instead of aborting and losing everything collected.
            page_items = data.get("_embedded", {}).get("restaurants", [])
            restaurants.extend(page_items)
            print(f"Page {page + 1}/{total_pages} collected successfully.")
        else:
            failed_pages.append(page)
            print(f"Failed to fetch page {page + 1}. Status code: {response.status_code}")
    except (requests.RequestException, ValueError) as exc:
        # RequestException: connection errors / timeouts.
        # ValueError: response body was not valid JSON.
        failed_pages.append(page)
        print(f"Failed to fetch page {page + 1}. Error: {exc}")

    # Pause between requests to avoid hammering the server.
    time.sleep(2)  # 2-second interval

if failed_pages:
    print(f"{len(failed_pages)} page(s) failed (0-based indices): {failed_pages}")

# Save everything that was collected.
with open("restaurants.json", "w", encoding="utf-8") as f:
    json.dump(restaurants, f, ensure_ascii=False, indent=4)

print("전체 페이지 크롤링 완료. JSON 파일로 저장됨.")


Page 1/576 collected successfully.
Page 2/576 collected successfully.
Page 3/576 collected successfully.
Page 4/576 collected successfully.
Page 5/576 collected successfully.
Page 6/576 collected successfully.
Page 7/576 collected successfully.
Page 8/576 collected successfully.
Page 9/576 collected successfully.
Page 10/576 collected successfully.
Page 11/576 collected successfully.
Page 12/576 collected successfully.
Page 13/576 collected successfully.
Page 14/576 collected successfully.
Page 15/576 collected successfully.
Page 16/576 collected successfully.
Page 17/576 collected successfully.
Page 18/576 collected successfully.
Page 19/576 collected successfully.
Page 20/576 collected successfully.
Page 21/576 collected successfully.
Page 22/576 collected successfully.
Page 23/576 collected successfully.
Page 24/576 collected successfully.
Page 25/576 collected successfully.
Page 26/576 collected successfully.
Page 27/576 collected successfully.
Page 28/576 collected successfully.
P

In [2]:
#### 1. Preprocessing

# `json` is imported here as well so this cell can be run on a fresh kernel
# straight from the saved restaurants.json, without repeating the crawl.
import json

import pandas as pd
from pandas import json_normalize

# Read the crawled records back from the JSON file.
with open("restaurants.json", "r", encoding="utf-8") as f:
    restaurants = json.load(f)

# Convert the list of records to a pandas DataFrame (default RangeIndex).
df = pd.DataFrame(restaurants)

# Drop top-level columns that are not needed; names absent from the frame
# are skipped so a changed API payload does not raise.
columns_to_drop = ["createdDate", "id", "timeInfo", "gps", "tags", "status", "bookStatus",
                   "buzUsername", "business", "pageView", "brandMatchStatus", "brandRejectReason",
                   "orderDescending", "foodTypeDetails", "countEvaluate", "bookmark", "features",
                   "feature107", "brandBranches", "foodTypes", "brandHead", "firstImage", "firstLogoImage",
                   "_links"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Flatten nested JSON columns: each one is expanded into separate columns
# named "<column>_<subkey>" and the original nested column is removed.
nested_columns = ["headerInfo", "defaultInfo", "statusInfo", "juso", "review", "etcInfo"]
for column in nested_columns:
    if column in df.columns:
        normalized = pd.json_normalize(df[column])  # flatten the nested values
        normalized.columns = [f"{column}_{subcol}" for subcol in normalized.columns]  # prefix with parent name
        df = pd.concat([df.drop(columns=[column]), normalized], axis=1)  # swap nested column for flat ones

# Drop flattened sub-columns that are not needed (missing names are skipped).
sub_columns_to_drop = ["headerInfo_nickname", "headerInfo_year", "headerInfo_ribbonTypeByOrdinal",
                       "defaultInfo_websiteFacebook", "statusInfo_storeType", "statusInfo_openEra",
                       "statusInfo_newOpenDate", "juso_roadAddrPart2", "juso_jibunAddr", "juso_zipNo",
                       "juso_admCd", "juso_detBdNmList", "juso_zone2_1", "juso_zone2_2", "juso_map_1",
                       "juso_map_2", "review_readerReview", "review_businessReview", "review_editorReview",
                       "etcInfo_toilet", "etcInfo_toiletEtc", "etcInfo_chain", "etcInfo_close", "etcInfo_renewal",
                       "etcInfo_appYn", "etcInfo_projectNo", "etcInfo_reviewerRecommend", "etcInfo_onlySiteView",
                       "etcInfo_history", "etcInfo_mainMemo"]
df = df.drop(columns=[col for col in sub_columns_to_drop if col in df.columns])

# Save the flattened table.
df.to_csv("nested_all_restaurants.csv", index=False)
print("전처리 테스트 완료. CSV 파일 저장됨.")


전처리 테스트 완료. CSV 파일 저장됨.


In [3]:
import numpy as np
import re

# Turn empty strings and the literal "없음" placeholder into None.
# The dict form is used on purpose: with a list and value=None, older pandas
# releases interpret the call as a pad-fill rather than "replace with None".
df = df.replace({"": None, "없음": None})

# Un-list foodDetailTypes: where a row holds a list, merge it into one
# comma-separated string; other values are left untouched.
if "foodDetailTypes" in df.columns:
    df["foodDetailTypes"] = df["foodDetailTypes"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

# Map ribbonType labels to numeric ribbon counts.
ribbon_mapping = {
    "RIBBON_ONE": 1,
    "RIBBON_TWO": 2,
    "RIBBON_THREE": 3,
    "RIBBON_NONE": 0
}

if "headerInfo_ribbonType" in df.columns:
    df["headerInfo_ribbonType"] = df["headerInfo_ribbonType"].map(ribbon_mapping)

# Merge the two website columns into a single "defaultInfo_website" column.
if "defaultInfo_website" in df.columns and "defaultInfo_websiteInstagram" in df.columns:
    def merge_websites(row):
        """Join the row's website and Instagram URLs with ", ".

        Only non-empty strings are kept. Missing values may be None or NaN;
        NaN is truthy, so a plain truthiness filter would pass a float on to
        str.join and raise TypeError. Returns None when neither is present.
        """
        candidates = (row.get("defaultInfo_website"), row.get("defaultInfo_websiteInstagram"))
        websites = [site for site in candidates if isinstance(site, str) and site]
        return ", ".join(websites) if websites else None

    # Build the combined column, then drop the sources and rename it. The
    # merged column therefore ends up as the last column of the frame.
    df["defaultInfo_website_combined"] = df.apply(merge_websites, axis=1)
    df = df.drop(columns=["defaultInfo_website", "defaultInfo_websiteInstagram"])
    df = df.rename(columns={"defaultInfo_website_combined": "defaultInfo_website"})

# Reformat statusInfo_openDate: pull out a 4-digit year and render it as "2024년".
def extract_year_with_suffix(date):
    """Return the first four-digit run in ``date`` followed by "년".

    Non-string input, or a string without four consecutive digits, yields
    None. The digits pass through ``int`` first, so leading zeros are dropped.
    """
    if not isinstance(date, str):
        return None

    # First run of four consecutive digits anywhere in the text.
    found = re.search(r'\d{4}', date)
    if found is None:
        return None

    year = int(found.group(0))
    return f"{year}년"

# Rewrite the opening-date column through extract_year_with_suffix, but only
# when the column is present in the frame.
open_date_col = "statusInfo_openDate"
if open_date_col in df.columns:
    df[open_date_col] = df[open_date_col].apply(extract_year_with_suffix)

# Write the final cleaned table to CSV and report completion.
df.to_csv("cleaned_all_restaurants.csv", index=False)
print("수정된 최종 CSV 파일 저장 완료!")


수정된 최종 CSV 파일 저장 완료!
