# Handling XML Format Data
- Local weather forecast RSS: https://www.kma.go.kr/wid/queryDFSRSS.jsp?zone=4182025000

In [9]:
import requests

# RSS feed for a single forecast zone (the zone code is the query parameter).
url = "https://www.kma.go.kr/wid/queryDFSRSS.jsp?zone=4182025000"
# Bound the wait with a timeout (requests waits indefinitely by default) and
# fail right here on an HTTP error status, so that an error page is never
# handed to the parsing cells further down.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
# NOTE: despite its name, `response` holds the decoded body text (a str),
# not the Response object.
response = http_response.text
# Peek at the first 100 characters to identify the payload format.
response[:100]

# The "<?xml ..." declaration at the start of the returned text shows that the payload is an XML document.

'<?xml version="1.0" encoding="UTF-8" ?>\n<rss version="2.0">\n<channel>\n<title>기상청 동네예보 웹서비스 - 경기도 가평군'

In [19]:
# Convert the XML text into a dictionary (XML -> dict)
import xmltodict

# Parsing: interpreting the structure of the data and converting its format
#          without changing the information it carries.
# Parse the text stored in the variable `response`.

data = xmltodict.parse(response)

In [11]:
# List the top-level keys of the parsed dictionary.
# Only 'rss' is returned, so the next step is to check which
# sub-keys sit underneath 'rss'.

data.keys()

odict_keys(['rss'])

In [12]:
# Go one level down: the keys stored under 'rss'.
data['rss'].keys()

odict_keys(['@version', 'channel'])

In [13]:
# One more level down: the keys stored under 'rss' -> 'channel'.
data['rss']['channel'].keys()

odict_keys(['title', 'link', 'description', 'language', 'generator', 'pubDate', 'item'])

In [14]:
# 'title' holds a plain string rather than another nested dictionary.
data['rss']['channel']['title']

'기상청 동네예보 웹서비스 - 경기도 가평군 가평읍 도표예보'

In [15]:
data['rss']

# The displayed value is an OrderedDict, one of the dictionary types
# OrderedDict: a dictionary that keeps its entries in order
#              (plain dicts also preserve insertion order since Python 3.7)
# Order is usually important in XML files

OrderedDict([('@version', '2.0'),
             ('channel',
              OrderedDict([('title', '기상청 동네예보 웹서비스 - 경기도 가평군 가평읍 도표예보'),
                           ('link', 'http://www.kma.go.kr/weather/main.jsp'),
                           ('description', '동네예보 웹서비스'),
                           ('language', 'ko'),
                           ('generator', '동네예보'),
                           ('pubDate', '2022년 03월 10일 (목)요일 17:00'),
                           ('item',
                            OrderedDict([('author', '기상청'),
                                         ('category', '경기도 가평군 가평읍'),
                                         ('title',
                                          '동네예보(도표) : 경기도 가평군 가평읍 [X=69,Y=132]'),
                                         ('link',
                                          'http://www.kma.go.kr/weather/forecast/timeseries.jsp?searchType=INTEREST&dongCode=4182025000'),
                                         ('guid',
                            

In [17]:
# Under rss - channel - item - description - body - data, look for
# hour, day, temp, ..., wfKor

data['rss']['channel']['item']['description']['body']['data'].keys()

# AttributeError: a list has no keys() method
# data['rss']['channel']['item']['description']['body']['data']  --> It is a list (a list of dictionaries to be exact)
# When this note was written the list reportedly held 14 entries

AttributeError: 'list' object has no attribute 'keys'

In [18]:
# Keep the list of forecast entries under a shorter name, then
# inspect the keys of its first element (each element is a dictionary).
weather_list = data['rss']['channel']['item']['description']['body']['data']
weather_list[0].keys()

odict_keys(['@seq', 'hour', 'day', 'temp', 'tmx', 'tmn', 'sky', 'pty', 'wfKor', 'wfEn', 'pop', 'r12', 's12', 'ws', 'wd', 'wdKor', 'wdEn', 'reh', 'r06', 's06'])

In [22]:
weather_list[0]['wfKor']

# weather_list is a list with dictionaries inside -> it can be converted into a DataFrame

'맑음'

In [20]:
import pandas as pd
# Each dictionary in weather_list becomes one row; its keys become the columns.
pd.DataFrame(weather_list)

Unnamed: 0,@seq,hour,day,temp,tmx,tmn,sky,pty,wfKor,wfEn,pop,r12,s12,ws,wd,wdKor,wdEn,reh,r06,s06
0,0,21,0,6.0,-999.0,-999.0,1,0,맑음,Clear,0,0.0,0.0,1.5,6,서,W,60,0.0,0.0
1,1,24,0,3.0,-999.0,-999.0,1,0,맑음,Clear,0,0.0,0.0,1.3,5,남서,SW,70,0.0,0.0
2,2,3,1,1.0,19.0,0.0,1,0,맑음,Clear,0,0.0,0.0,0.9,5,남서,SW,80,0.0,0.0
3,3,6,1,0.0,19.0,0.0,3,0,구름 많음,Mostly Cloudy,20,0.0,0.0,0.8,4,남,S,85,0.0,0.0
4,4,9,1,4.0,19.0,0.0,3,0,구름 많음,Mostly Cloudy,20,0.0,0.0,0.8,5,남서,SW,75,0.0,0.0
5,5,12,1,13.0,19.0,0.0,4,0,흐림,Cloudy,30,0.0,0.0,2.4000000000000004,6,서,W,55,0.0,0.0
6,6,15,1,18.0,19.0,0.0,4,0,흐림,Cloudy,30,0.0,0.0,3.0,6,서,W,40,0.0,0.0
7,7,18,1,16.0,19.0,0.0,1,0,맑음,Clear,0,0.0,0.0,1.2000000000000002,6,서,W,45,0.0,0.0
8,8,21,1,9.0,19.0,0.0,1,0,맑음,Clear,0,0.0,0.0,1.5,6,서,W,75,0.0,0.0
9,9,24,1,6.0,19.0,0.0,1,0,맑음,Clear,0,0.0,0.0,0.6000000000000001,6,서,W,85,0.0,0.0


In [21]:
# Write the table to weather.csv without the index column (no encoding argument is given here).
pd.DataFrame(weather_list).to_csv('weather.csv', index=False)

In [23]:
pd.DataFrame(weather_list).to_csv('weather.csv', index=False, encoding='utf-8-sig')

# Korean text is saved garbled
# NOTE(review): presumably this describes the earlier save made without encoding=...; confirm
# It looks broken when opened in Excel but fine when opened in Notepad -> so the data itself is not damaged
# Garbled characters appear when the encoding used for saving differs from the one used for reading
# Making the saving and reading encodings match solves the problem!
# Common encodings: euc-kr, latin, cp949, iso.., utf-8
# https://docs.python.org/ko/3/library/codecs.html

In [31]:
# Fetch the values that can be passed to the 'encoding=' option

path = 'https://docs.python.org/ko/3/library/codecs.html'


# pd.read_html returns a list of the tables found in the HTML page

tables = pd.read_html(path)

In [25]:
# Number of tables that pd.read_html found in the HTML page

len(tables)

8

In [30]:
# Fifth table in the imported HTML (index 4, because list indices start at 0)

tables[4]

Unnamed: 0,코덱,별칭,언어
0,ascii,"646, us-ascii",영어
1,big5,"big5-tw, csbig5",중국어 번체
2,big5hkscs,"big5-hkscs, hkscs",중국어 번체
3,cp037,"IBM037, IBM039",영어
4,cp273,"273, IBM273, csIBM273",독일어 버전 3.4에 추가.
...,...,...,...
92,utf_16_be,UTF-16BE,모든 언어
93,utf_16_le,UTF-16LE,모든 언어
94,utf_7,"U7, unicode-1-1-utf-7",모든 언어
95,utf_8,"U8, UTF, utf8, cp65001",모든 언어


In [34]:
# Select the values of the table's first column, '코덱' (codec)

tables[4]['코덱']

0         ascii
1          big5
2     big5hkscs
3         cp037
4         cp273
        ...    
92    utf_16_be
93    utf_16_le
94        utf_7
95        utf_8
96    utf_8_sig
Name: 코덱, Length: 97, dtype: object

# Google News: https://news.google.com/rss/search?q=펭수&hl=ko&gl=KR&ceid=KR%3Ako

In [3]:
import requests
import xmltodict

url = "https://news.google.com/rss/search?q=펭수&hl=ko&gl=KR&ceid=KR%3Ako"
# Bound the wait with a timeout (requests waits indefinitely by default) and
# fail right here on an HTTP error status, so that an error page is never
# handed to the XML parser.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
# NOTE: `response` holds the decoded body text (a str), not the Response object.
response = http_response.text
gnews_data = xmltodict.parse(response)

# The individual news entries live under rss -> channel -> item.
gnews_list = gnews_data['rss']['channel']['item']
gnews_list[2].keys()

# gnews_list is a list with one dictionary per news item

odict_keys(['title', 'link', 'guid', 'pubDate', 'description', 'source'])

In [6]:
import pandas as pd

# One row per news item; nested elements such as 'guid' and 'source' stay as dictionaries inside their cells.
pd.DataFrame(gnews_list)

Unnamed: 0,title,link,guid,pubDate,description,source
0,EBS 사장 퇴임식서 포착된 ‘존재감 폭발’ 펭수 “김명중 가지마” - 세계일보,https://m.segye.com/view/20220313507298,"{'@isPermaLink': 'false', '#text': 'CBMiJ2h0dH...","Sun, 13 Mar 2022 06:49:03 GMT","<a href=""https://m.segye.com/view/202203135072...","{'@url': 'https://m.segye.com', '#text': '세계일보'}"
1,"이재명, 홍대서 '펭수 성대모사'하며 청년 격려 [쿡정치 포토] - 쿠키뉴스",https://www.kukinews.com/newsView/kuk202202170231,"{'@isPermaLink': 'false', '#text': 'CBMiMWh0dH...","Fri, 18 Feb 2022 08:00:00 GMT","<a href=""https://www.kukinews.com/newsView/kuk...","{'@url': 'https://www.kukinews.com', '#text': ..."
2,펭수를 성공시킨 '퍼스트 펭귄' 정신 - 단비뉴스,http://m.danbinews.com/news/articleView.html?i...,"{'@isPermaLink': 'false', '#text': 'CBMiOGh0dH...","Wed, 27 Oct 2021 07:00:00 GMT","<a href=""http://m.danbinews.com/news/articleVi...","{'@url': 'http://m.danbinews.com', '#text': '단..."
3,"펭수, 권익위 국민콜110 특별상담사로 활약…고충·보람 느껴 - 아주경제",https://m.ajunews.com/view/20211126092436394,"{'@isPermaLink': 'false', '#text': 'CBMiLGh0dH...","Fri, 26 Nov 2021 08:00:00 GMT","<a href=""https://m.ajunews.com/view/2021112609...","{'@url': 'https://m.ajunews.com', '#text': '아주..."
4,"붱철 조교, 펭수 잇는 EBS 간판 스타 되나? - 머니투데이",https://news.mt.co.kr/mtview.php?no=2021120909...,"{'@isPermaLink': 'false', '#text': 'CBMiN2h0dH...","Thu, 09 Dec 2021 08:00:00 GMT","<a href=""https://news.mt.co.kr/mtview.php?no=2...","{'@url': 'https://news.mt.co.kr', '#text': '머니..."
...,...,...,...,...,...,...
95,"CU, 펭수 이미지 무단사용 논란… ""바로 삭제, 소통 부재로 오해"" - 조선비즈 ...",https://biz.chosun.com/site/data/html_dir/2020...,"{'@isPermaLink': 'false', '#text': 'CBMiR2h0dH...","Mon, 13 Jan 2020 08:00:00 GMT","<a href=""https://biz.chosun.com/site/data/html...","{'@url': 'https://biz.chosun.com', '#text': '조..."
96,"펭수, 카카오 이적? `찐경규`-`펭TV` 이색 트레이드…모르모트PD와 케미 기대 ...",https://www.mk.co.kr/star/hot-issues/view/2020...,"{'@isPermaLink': 'false', '#text': 'CBMiOmh0dH...","Mon, 23 Nov 2020 08:00:00 GMT","<a href=""https://www.mk.co.kr/star/hot-issues/...","{'@url': 'https://www.mk.co.kr', '#text': '매일경제'}"
97,"펭수, 이번엔 힙합 가수 된다 - 조선비즈 - 조선비즈",https://biz.chosun.com/site/data/html_dir/2020...,"{'@isPermaLink': 'false', '#text': 'CBMiR2h0dH...","Fri, 17 Apr 2020 07:00:00 GMT","<a href=""https://biz.chosun.com/site/data/html...","{'@url': 'https://biz.chosun.com', '#text': '조..."
98,"[뮤지컬] 비틀쥬스 X 펭수, 100만 뷰…화제의 특급 콜라보 공개 - 조선일보 -...",https://www.chosun.com/culture-life/culture_ge...,"{'@isPermaLink': 'false', '#text': 'CBMiWmh0dH...","Tue, 01 Jun 2021 07:00:00 GMT","<a href=""https://www.chosun.com/culture-life/c...","{'@url': 'https://www.chosun.com', '#text': 'T..."


In [8]:
# Q) Save the news information for a keyword entered by the user to a file.
# A)

def news_get(arg):
    """Download the Google News RSS search results for a keyword and save them as CSV.

    Args:
        arg: Search keyword (str). It is also used as the stem of the output
            file name: the results are written to ``<arg>.csv`` in the current
            working directory, encoded as utf-8-sig.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        KeyError: If the returned feed contains no ``item`` element.
    """
    url = 'https://news.google.com/rss/search'
    # Let requests build the query string so that keywords containing spaces,
    # '&', '#' or '+' are percent-encoded instead of corrupting the URL.
    # 'KR:ko' is encoded to the same 'KR%3Ako' that the hand-written URL used.
    query = {'q': arg, 'hl': 'ko', 'gl': 'KR', 'ceid': 'KR:ko'}
    # Bound the wait with a timeout (requests waits indefinitely by default)
    # and stop on an HTTP error status instead of parsing an error page as XML.
    http_response = requests.get(url, params=query, timeout=10)
    http_response.raise_for_status()
    # force_list keeps 'item' a list even when the feed holds exactly one
    # article; without it xmltodict returns a bare dict for a single element,
    # which pandas would not turn into a one-row table.
    news_data = xmltodict.parse(http_response.text, force_list=('item',))
    news_list = news_data['rss']['channel']['item']
    pd.DataFrame(news_list).to_csv(arg+'.csv', index=False, encoding='utf-8-sig')

news_get('아기상어') # writes 아기상어.csv

## BeautifulSoup
#### - A module that extracts and analyzes the data you need from HTML and XML documents
#### - Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
#### - Installation: pip install beautifulsoup4

#### This time we use BeautifulSoup instead of xmltodict: rather than going through dictionary keys and values as above, we access the data through tags (<...>)
#### Information in HTML is stored in tags, not in dictionaries, so BeautifulSoup is the tool to use when crawling data from HTML

## Local Weather Report rss - BeautifulSoup Application
#### weather report rss: http://www.kma.go.kr/wid/queryDFSRSS.jsp?zone=1111061500

In [9]:
# xmltodict method

import requests
import xmltodict

url = 'http://www.kma.go.kr/wid/queryDFSRSS.jsp?zone=1111061500'
# Bound the wait with a timeout (requests waits indefinitely by default) and
# fail right here on an HTTP error status, so that an error page is never
# handed to the XML parser.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
# NOTE: `response` holds the decoded body text (a str), not the Response object.
response = http_response.text
data = xmltodict.parse(response)

# The forecast entries live under rss -> channel -> item -> description -> body -> data.
weather_list = data['rss']['channel']['item']['description']['body']['data']
weather_list[0]['wfKor']

'흐림'

In [10]:
# BeautifulSoup method

import requests
import bs4

url = 'http://www.kma.go.kr/wid/queryDFSRSS.jsp?zone=1111061500'
# Bound the wait with a timeout (requests waits indefinitely by default) and
# fail right here on an HTTP error status, so that an error page is never
# handed to the parser below.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
# NOTE: `response` holds the decoded body text (a str), not the Response object.
response = http_response.text

# Convert the XML document into a BeautifulSoup object.
# The parser (the program that analyses the document's structure) has to be
# named explicitly; for XML input that is 'xml'
# (Beautiful Soup relies on the lxml package for this parser).
data = bs4.BeautifulSoup(response, features='xml')

# find() returns the first tag with the given name (use find_all() to get every match).
# The result is a Tag object, not a dictionary entry.
# .text drops the surrounding tag and keeps only the text inside it.
print(data.find('wfKor'))
print(data.find('wfKor').text)
print(type(data))


<wfKor>흐림</wfKor>
흐림
<class 'bs4.BeautifulSoup'>


In [11]:
# Finding the data in the desired position
# ex) body - first data - wfKor

# .find('body') -> finds one body tag in the whole document, which is interpreted as tags
# .find_all('data') -> finds every data tag inside that body tag
# find_all() finds all matches, so there can be several results -> they are returned as a list
# [0].find('wfKor') -> finds the wfKor tag inside the first data tag (index 0);
#                      use [1] instead to read the second data tag
# .text -> removes the tag and keeps only its text
data.find('body').find_all('data')[0].find('wfKor').text

'흐림'

In [12]:
# Collect every <data> tag inside <body> (these are the <data> elements of the XML
# document, not the Python variable named `data`).

weather_list = data.find('body').find_all('data')
weather_list


[<data seq="0">
 <hour>18</hour>
 <day>0</day>
 <temp>11.0</temp>
 <tmx>-999.0</tmx>
 <tmn>-999.0</tmn>
 <sky>4</sky>
 <pty>0</pty>
 <wfKor>흐림</wfKor>
 <wfEn>Cloudy</wfEn>
 <pop>30</pop>
 <r12>0.0</r12>
 <s12>0.0</s12>
 <ws>1.6</ws>
 <wd>5</wd>
 <wdKor>남서</wdKor>
 <wdEn>SW</wdEn>
 <reh>65</reh>
 <r06>0.0</r06>
 <s06>0.0</s06>
 </data>,
 <data seq="1">
 <hour>21</hour>
 <day>0</day>
 <temp>10.0</temp>
 <tmx>-999.0</tmx>
 <tmn>-999.0</tmn>
 <sky>3</sky>
 <pty>0</pty>
 <wfKor>구름 많음</wfKor>
 <wfEn>Mostly Cloudy</wfEn>
 <pop>20</pop>
 <r12>0.0</r12>
 <s12>0.0</s12>
 <ws>1.2000000000000002</ws>
 <wd>4</wd>
 <wdKor>남</wdKor>
 <wdEn>S</wdEn>
 <reh>70</reh>
 <r06>0.0</r06>
 <s06>0.0</s06>
 </data>,
 <data seq="2">
 <hour>24</hour>
 <day>0</day>
 <temp>9.0</temp>
 <tmx>-999.0</tmx>
 <tmn>-999.0</tmn>
 <sky>3</sky>
 <pty>0</pty>
 <wfKor>구름 많음</wfKor>
 <wfEn>Mostly Cloudy</wfEn>
 <pop>20</pop>
 <r12>0.0</r12>
 <s12>0.0</s12>
 <ws>1.3</ws>
 <wd>4</wd>
 <wdKor>남</wdKor>
 <wdEn>S</wdEn>
 <reh>70</reh

In [15]:
import pandas as pd
# One dictionary per <data> tag, holding only the text of the day, hour and
# wfKor child tags (in that order, which is also the column order).
weather_data = [
    {tag: entry.find(tag).text for tag in ('day', 'hour', 'wfKor')}
    for entry in weather_list
]

# Build the table once, then reuse it for both the CSV file and the display.
df = pd.DataFrame(weather_data)
df.to_csv('weather.csv', index=False, encoding='utf-8-sig')

# Last expression of the cell: shows the table in the notebook.
df

Unnamed: 0,day,hour,wfKor
0,0,18,흐림
1,0,21,구름 많음
2,0,24,구름 많음
3,1,3,구름 많음
4,1,6,구름 많음
5,1,9,구름 많음
6,1,12,구름 많음
7,1,15,구름 많음
8,1,18,흐림
9,1,21,흐림
