In [32]:
import datetime
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from ebooklib import epub
# Download the Magic Story column index page; the response is closed as soon
# as its bytes have been read.
with urllib.request.urlopen(
    'http://magic.wizards.com/en/articles/columns/magic-story'
) as response:
    html_doc = response.read()

# Parse the raw page with Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, features='html.parser')

In [41]:
def getDate(string):
    """Extract the first ISO-style date (YYYY-MM-DD) embedded in ``string``.

    Args:
        string: Text expected to contain a date such as ``2016-03-02``.

    Returns:
        datetime.date: The first date found in ``string``.

    Raises:
        ValueError: If ``string`` contains no ``YYYY-MM-DD`` pattern, or if
            the matched digits do not form a valid calendar date.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', string)
    if match is None:
        # Fail with the same exception type strptime raises for bad dates,
        # rather than an opaque AttributeError from calling .group() on None.
        raise ValueError('no YYYY-MM-DD date found in %r' % (string,))
    return datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()

In [50]:
# Collect the href of every anchor on the index page. Anchors without an href
# yield the string 'None' (because of str()), which the filter below discards.
links = [str(link.get('href')) for link in soup.find_all('a')]

# Keep only article-archive links.
# NOTE(review): a "'story' in x" filter used to be computed here, but it was
# immediately overwritten by this filter (both read from `links`), so it never
# influenced the result. It has been removed; re-add it chained on
# `links_filtered` if narrowing to story URLs was actually intended.
links_filtered = [x for x in links if 'en/articles/archive/' in x]

# De-duplicate the links.
links_filtered = list(set(links_filtered))

# Order chronologically by the date embedded in each URL. The URL itself is the
# tie-breaker so same-day articles come out in a reproducible order instead of
# depending on set iteration order.
links_sorted = sorted(links_filtered, key=lambda url: (getDate(url), url))

In [3]:
def makeTitlePretty(soupTitle):
    """Return the readable page title without markup or the part after '|'.

    Args:
        soupTitle: The page's ``<title>`` element (anything whose ``str()`` is
            the title markup, e.g. ``'<title>Story | Site</title>'``).

    Returns:
        str: The text before the first ``'|'`` with surrounding whitespace
        removed. Presumably the part after ``'|'`` is the site name suffix;
        confirm against real pages.

    Raises:
        ValueError: If ``soupTitle`` is ``None`` (page without a title).
    """
    if soupTitle is None:
        raise ValueError('page has no <title> element')
    # Remove the opening and closing tags. Splitting on '>' alone left a
    # dangling '</title' whenever the title contained no '|'.
    text = re.sub(r'<[^>]*>', '', str(soupTitle))
    # Keep the part before the first '|' and drop the padding around it.
    return text.split('|')[0].strip()

In [66]:
def createChapter(url):
    """Download one article page and return its title and body markup.

    Args:
        url: Absolute URL of the article page to fetch.

    Returns:
        tuple: ``(c1_title, c1_content)``: the page title cleaned by
        ``makeTitlePretty`` and the markup of every ``<body>`` tag found inside
        the article container, concatenated into one string.

    Raises:
        ValueError: If the page has no
            ``<div id="content-detail-page-of-an-article">`` container.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        temp_doc = response.read()
    sub_soup = BeautifulSoup(temp_doc, 'html.parser')
    c1_title = str(makeTitlePretty(sub_soup.title))

    content = sub_soup.find_all("div", id="content-detail-page-of-an-article")
    if not content:
        raise ValueError('no article container found at ' + url)

    # Serialize each tag individually. Stringifying the result list and then
    # deleting '[' and ']' also removed every square bracket from the story
    # text itself and left ', ' separators between tags.
    c1_content = ''.join(str(tag) for tag in content[0].find_all('body'))
    return c1_title, c1_content

# Full crawler

In [None]:
# Fetch every story in chronological order and keep its markup keyed by title.
# NOTE: two articles with the same title overwrite each other in this dict.
chapters = dict()
for link in links_sorted:
    # urljoin copes with root-relative, relative and already-absolute hrefs;
    # plain string concatenation only produced a valid URL for hrefs that
    # start with '/'.
    c_title, c_content = createChapter(urllib.parse.urljoin('http://magic.wizards.com', link))
    print(c_title + ' done.')
    chapters[c_title] = c_content
    


# Create Epub

In [108]:
def create_and_add_chapter(c_title, c_content, book):
    """Build an EPUB chapter from a title and body markup and add it to ``book``.

    Args:
        c_title: Chapter title; used for the ``<h1>`` heading, the chapter's
            title metadata and (sanitized) its file name inside the EPUB.
        c_content: Chapter body markup; converted with ``str()`` and appended
            after the heading.
        book: The ``epub.EpubBook`` that receives the chapter.

    Returns:
        tuple: ``(chapter, book)``: the new ``epub.EpubHtml`` item and the
        same ``book`` object that was passed in.
    """
    # Titles may contain characters such as '/', '?' or '#' that break archive
    # paths and hrefs, so collapse everything except word characters and '-'
    # to '_'. Distinct titles that differ only in such characters map to the
    # same file name.
    safe_name = re.sub(r'[^\w\-]+', '_', c_title).strip('_') or 'chapter'
    c_xhtml = safe_name + '.xhtml'

    # 'en' matches the language set on the book; 'hr' was a leftover from the
    # ebooklib sample code.
    chapter = epub.EpubHtml(title=c_title, file_name=c_xhtml, lang='en')
    chapter.content = u'<h1>' + c_title + '</h1>' + str(c_content)

    ### add chapter
    # Add to the book that was passed in; previously the notebook-global
    # `magic_stories_epub` was used and the `book` argument was ignored.
    book.add_item(chapter)
    return chapter, book

In [109]:
# Create the empty EPUB container that all chapters are added to, then set
# its book-level metadata (display title and content language).
magic_stories_epub = epub.EpubBook()
magic_stories_epub.set_title('Magic the Gathering Stories') 
magic_stories_epub.set_language('en')

In [110]:
# Convert every crawled story into an EPUB chapter and remember the chapter
# objects so the table of contents and the spine can reference them.
ebook_chapters = []

for c_title, c_content in chapters.items():
    chap, magic_stories_epub = create_and_add_chapter(c_title, c_content, magic_stories_epub)
    ebook_chapters.append(chap)


In [111]:
### define Table Of Contents
# A single "Stories" section listing every chapter. The former placeholder
# entry epub.Link('Test1', 'Test2', 'Test3') pointed at a file ('Test1') that
# is not part of the book, so it was removed to keep the TOC free of dead links.
magic_stories_epub.toc = ((epub.Section('Stories'), ebook_chapters),)

### add default NCX and Nav file
magic_stories_epub.add_item(epub.EpubNcx())
magic_stories_epub.add_item(epub.EpubNav())

### define CSS style
# NOTE(review): this stylesheet (white body text) is added to the book, but
# nothing visible here links it to the nav page or the chapters; confirm it
# is actually wanted.
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

### add CSS file
magic_stories_epub.add_item(nav_css)

### basic spine
# Reading order: the generated nav page first, then the chapters in the order
# they were created.
spine = ['nav']
spine.extend(ebook_chapters)
magic_stories_epub.spine = spine

### write to the file
epub.write_epub('test.epub', magic_stories_epub, {})

## Notes

http://magic.wizards.com/en/articles/archive - all articles

http://magic.wizards.com/en/articles/columns/magic-story - story listing page


from ebooklib import epub

### ebooklib sample: build a one-chapter book and write it to disk.
### NOTE: it writes to 'test.epub', the same path the crawler output uses.
### The book must be bound to `book`, because every statement below uses that
### name (it was previously assigned to `magic_stories_epub`, leaving `book`
### undefined).
book = epub.EpubBook()

### set metadata
book.set_identifier('id123456')
book.set_title('Sample book')
book.set_language('en')

book.add_author('Author Authorowski')
book.add_author('Danko Bananko', file_as='Gospodin Danko Bananko', role='ill', uid='coauthor')

### create chapter
c1 = epub.EpubHtml(title='Intro', file_name='chap_01.xhtml', lang='hr')
c1.content=u'<h1>Intro heading</h1><p>Žaba je skočila u baru.</p>'

### add chapter
book.add_item(c1)

### define Table Of Contents
### one direct link to the chapter plus a section that contains the chapter
book.toc = (
    epub.Link('chap_01.xhtml', 'Introduction', 'intro'),
    (epub.Section('Simple book'), (c1, )),
)

### add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

### define CSS style
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

### add CSS file
book.add_item(nav_css)

### basic spine
book.spine = ['nav', c1]

### write to the file
epub.write_epub('test.epub', book, {})