In [None]:
# summary

# web : server-client : url
# requests : 동적페이지 : URL 변화 없이 페이지의 데이터 수정 : json(str) > response.json() > DataFrame
# requests : 정적페이지 : URL 변화와 함께 페이지의 데이터 수정 : html(str) > BeautifulSoup > css-selector > DataFrame
# selenium : 웹브라우저를 python 코드로 컨트롤해서 데이터 수집
# requests(동적페이지,API) > requests(정적페이지) > selenium

# 웹크롤링 절차
# 1. 웹서비스분석(개발자도구), API문서 : URL
# 2. request(URL) > response(data) : data(json(str), html(str))
# 3. data(json(str), html(str)) > response.json(), BeautifulSoup(css-selector) > DataFrame

# request 할때 401,403,500 등등의 에러가 발생하는 경우 > headers 수정해서 데이터 요청(user-agent, referer)
# API 이용 : request token 수집후 크롤링

### Zigbang 원룸 매물 데이터 수집
- 동이름 > 매물 데이터

In [2]:
import pandas as pd
import requests

#### Process
- 동이름으로 위도 경도 구하기
- 위도 경도로 geohash 알아내기
- geohash로 매물 아이디 가져오기
- 매물 아이디로 매물 정보 가져오기

#### 1. 동이름으로 위도 경도 구하기

In [11]:
# 1. URL
# The endpoint and its query are kept separate so that requests performs the
# percent-encoding of the non-ASCII values instead of a hand-built string.
address = '망원동'
url = 'https://apis.zigbang.com/v2/search'
search_params = {'leaseYn': 'N', 'q': address, 'serviceType': '원룸'}

# 2. request > response : json(str)
# Without a timeout the cell can hang indefinitely; raise_for_status() turns
# a 4xx/5xx answer into an exception here instead of a confusing failure
# when the body is parsed below.
response = requests.get(url, params=search_params, timeout=10)
response.raise_for_status()

# 3. json(str) > lat, lng
# The first search hit supplies the coordinates. An empty result list is
# reported explicitly rather than surfacing as a bare IndexError.
items = response.json()['items']
if not items:
    raise ValueError(f'no search result for address: {address!r}')
data = items[0]
lat, lng = data['lat'], data['lng']
lat, lng

(37.556785583496094, 126.9013442993164)

#### 2. 위도 경도로 geohash 알아내기
- geohash2 : pip install geohash2

In [12]:
import geohash2

In [15]:
# Turn the coordinates into a geohash string.
# precision: the larger the value, the smaller the covered area.
GEOHASH_PRECISION = 5
geohash = geohash2.encode(lat, lng, precision=GEOHASH_PRECISION)
geohash

'wydjx'

#### 3. geohash로 매물 아이디 가져오기

In [17]:
# Ask the listing index for every item inside the geohash cell.
# The filters are passed as a dict: the previous backslash-continued f-string
# would silently embed any indentation of the continued line into the URL,
# and requests takes care of percent-encoding the '|' and Korean values.
url = 'https://apis.zigbang.com/v2/items'
item_params = {
    'deposit_gteq': 0,
    'domain': 'zigbang',
    'geohash': geohash,
    'needHasNoFiltered': 'true',
    'rent_gteq': 0,
    'sales_type_in': '전세|월세',
    'service_type_eq': '원룸',
}
# The timeout prevents an indefinite hang; raise_for_status() makes a failed
# request stop the notebook here instead of relying on reading the repr.
response = requests.get(url, params=item_params, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [21]:
# Keep only the identifier of every listing in the response payload.
data = response.json()['items']
ids = [listing['item_id'] for listing in data]

# Display the number of ids together with a small sample.
id_count, id_sample = len(ids), ids[:5]
id_count, id_sample

(670, [35346672, 35365643, 35070248, 35265682, 35411116])

#### 4. 매물 아이디로 매물 정보 가져오기

In [22]:
# Fetch the detail records for the collected listing ids.
url = 'https://apis.zigbang.com/v2/items/list'
params = {
    'domain': 'zigbang',
    'withCoalition': 'true',
    # The original note says up to 999 ids can be sent in one request; the
    # slice stops at 900 (presumably as headroom - TODO confirm).
    # NOTE: ids beyond the first 900 are dropped, not requested separately.
    'item_ids': ids[:900],
}
# The second positional argument of requests.post is `data` (a form-encoded
# body), not query parameters, so it is now passed by keyword to make that
# explicit. The timeout and raise_for_status() stop the cell on a hang or an
# HTTP error instead of continuing with an unusable response.
response = requests.post(url, data=params, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [26]:
# Let pandas render up to 50 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 50)

In [28]:
# Load the detail records (one dict per listing) into a DataFrame.
data = response.json()['items']
df = pd.DataFrame(data)

# Peek at the last row only.
df.iloc[-1:]

Unnamed: 0,section_type,item_id,images_thumbnail,sales_type,sales_title,deposit,rent,size_m2,공급면적,전용면적,계약면적,room_type_title,floor,floor_string,building_floor,title,is_first_movein,room_type,address,random_location,is_zzim,status,service_type,tags,address1,address2,address3,manage_cost,reg_date,is_new,contract
669,,35442758,https://ic.zigbang.com/ic/items/35442758/1.jpg,전세,전세,21000,0,59.93,"{'m2': 59.93, 'p': '18.1'}","{'m2': 51.29, 'p': '15.5'}",,,2,2,2,"⭐대출,보증보험OK⭐올리모델링⭐역세권⭐투룸⭐",,4,마포구 중동,"{'lat': 37.57022264132613, 'lng': 126.90537466...",False,True,빌라,[],서울시 마포구 중동,,,1,2023-02-16T11:23:15+09:00,True,


In [29]:
# List every column of the listings frame before picking a subset.
df.columns

Index(['section_type', 'item_id', 'images_thumbnail', 'sales_type',
       'sales_title', 'deposit', 'rent', 'size_m2', '공급면적', '전용면적', '계약면적',
       'room_type_title', 'floor', 'floor_string', 'building_floor', 'title',
       'is_first_movein', 'room_type', 'address', 'random_location', 'is_zzim',
       'status', 'service_type', 'tags', 'address1', 'address2', 'address3',
       'manage_cost', 'reg_date', 'is_new', 'contract'],
      dtype='object')

In [32]:
# Show the search keyword currently held in `address`.
address

'망원동'

In [36]:
# Columns kept for the analysis; everything else from the API is dropped.
colums = ['item_id', 'sales_type', 'deposit', 'rent', 'size_m2',
          'floor', 'building_floor', 'title', 'address', 'status', 'service_type', 'tags', 'address1', 'manage_cost']
df = df[colums]

# A geohash cell also covers neighbouring districts, so keep only the rows
# whose address mentions the searched keyword.
# regex=False: the keyword is matched literally, not as a regular expression.
# na=False: rows without an address are excluded instead of producing a NaN
# in the mask, which cannot be used for boolean indexing.
is_target_address = df['address'].str.contains(address, regex=False, na=False)
df = df[is_target_address].reset_index(drop=True)
df.tail(2)

Unnamed: 0,item_id,sales_type,deposit,rent,size_m2,floor,building_floor,title,address,status,service_type,tags,address1,manage_cost
89,35408930,전세,10390,0,54.64,3,3,🌸중기청80/버팀목🌸마포구청역5분🌸깔끔한방🌸,마포구 망원동,True,원룸,[추천],서울시 마포구 망원동,10
90,35197484,월세,200,60,16.53,3,3,💖즉시입주💖나오면바로계약💖실매물💖선착순매물💖,마포구 망원동,True,원룸,[],서울시 마포구 망원동,6


In [None]:
# pep documents
# pep20, pep8
# flake8 : 코드 스타일(PEP 8 컨벤션), 문법 오류, 복잡도 체크

In [39]:
# Syntax (violation raises an error, the code does not run) vs.
# convention (violation raises no error, the code still runs).
# The invalid statement is compiled from a string so that the SyntaxError can
# be caught and displayed; a raw invalid cell would abort "Run All".
invalid_source = '1data = 1'  # an identifier must not start with a digit
try:
    compile(invalid_source, '<syntax-demo>', 'exec')
except SyntaxError as err:
    syntax_error = err
    print(f'{type(err).__name__}: {err.msg}')

SyntaxError: invalid syntax (<ipython-input-39-4c11d7547b80>, line 2)

In [40]:
# A valid identifier (it does not start with a digit), so this assignment runs.
data = 1

In [37]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
