In [1]:
from bs4 import BeautifulSoup
from ebooklib import epub
import re
import datetime
import pandas as pd
import requests

# Fetch the Magic Story index page from the MTG wiki.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the story index.
response = requests.get('https://mtg.gamepedia.com/Magic_Story', timeout=30)
response.raise_for_status()
html_doc = response.text

# 'html.parser' is Python's built-in parser, so no extra parser package is needed.
soup = BeautifulSoup(html_doc, 'html.parser')

In [2]:
def getDate(string):
    """Extract the first ``YYYY-MM-DD`` date found in ``string``.

    Parameters
    ----------
    string : str
        Text to search, for example an article URL ending in ``-2012-06-26``.

    Returns
    -------
    datetime.date
        The first ``YYYY-MM-DD`` occurrence, parsed as a calendar date.

    Raises
    ------
    ValueError
        If ``string`` contains no ``YYYY-MM-DD`` pattern, or if the matched
        digits are not a valid calendar date (e.g. month 13).
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', string)
    if match is None:
        # Without this guard the failure surfaced as an opaque
        # "'NoneType' object has no attribute 'group'" AttributeError.
        raise ValueError('no YYYY-MM-DD date found in %r' % (string,))
    return datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()

In [3]:
# Locate the story table on the index page.
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("no 'wikitable sortable' table found on the index page")

# Column names come from the header cells; 'Link' is an extra column that
# holds the href of the anchor in each row's first cell.
colNames = [ele.text.strip() for ele in table.find_all('th')]
colNames.append('Link')

data = []
rows = table.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    if not cols:
        # Rows without <td> cells (e.g. the header row) carry no story data.
        continue
    anchor = cols[0].find('a')
    if anchor is None:
        # No link in the first cell: nothing to crawl for this row.
        continue
    link = anchor.get('href')
    col_data = [ele.text.strip() for ele in cols]
    col_data.append(link)
    data.append(col_data)

In [4]:
# One row per story; the columns are the scraped table headers plus 'Link'.
data_table = pd.DataFrame(data, columns=colNames)

# Group view used to pull out all stories that belong to one set.
grouped_table = data_table.groupby(by="Set")

# Distinct set names, used to drive the per-set loops further down.
groups = data_table["Set"].unique()

In [5]:
def makeTitlePretty(soupTitle):
    """Return the article name from a page ``<title>`` element.

    The element is converted with ``str()`` and the text before the first
    ``|`` is kept, so ``<title>Name | SITE</title>`` becomes ``Name``.

    Parameters
    ----------
    soupTitle : object
        The ``<title>`` element (anything whose ``str()`` is the title markup).

    Returns
    -------
    str
        The title text before the first ``|``, without surrounding whitespace
        and without any part of the closing tag.

    Raises
    ------
    ValueError
        If ``soupTitle`` is ``None`` (the page has no ``<title>``).
    """
    if soupTitle is None:
        raise ValueError('page has no <title> element')

    markup = str(soupTitle)
    # Drop the opening tag; tolerate input that is already bare text.
    inner = markup.split('>', 1)[1] if '>' in markup else markup
    # Cut at the closing tag so a title without '|' does not end in '</title'.
    inner = inner.split('<', 1)[0]
    # Keep only the part before the site suffix and trim the space around it.
    return inner.split('|')[0].strip()

In [6]:
def createChapter(url, timeout=30):
    """Download one story article and return its title and HTML content.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple of (str, str)
        ``(title, content)`` where ``title`` comes from ``makeTitlePretty`` on
        the page ``<title>`` and ``content`` is the serialized markup of the
        ``<body>`` elements found inside the article container.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``div#content-detail-page-of-an-article``.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of parsing an error page as if it were a story.
    response.raise_for_status()
    sub_soup = BeautifulSoup(response.text, 'html.parser')
    c1_title = str(makeTitlePretty(sub_soup.title))

    article = sub_soup.find("div", id="content-detail-page-of-an-article")
    if article is None:
        raise ValueError('no article container found at %s' % (url,))

    # Serialize each tag and join them directly. Stringifying the whole result
    # list and deleting '[' / ']' also removed every square bracket that was
    # part of the story text and left ', ' between the tags.
    c1_content = ''.join(str(tag) for tag in article.find_all('body'))
    return c1_title, c1_content

# Create Epub

In [31]:
# Helpers for the full crawler

def create_chapter(c_title, c_content):
    """Build one EPUB chapter from a title and its HTML content.

    Parameters
    ----------
    c_title : str
        Chapter title; used for the heading, the chapter metadata and (after
        sanitizing) the file name inside the EPUB.
    c_content : object
        Chapter body markup; converted with ``str()`` and placed after the
        ``<h1>`` heading.

    Returns
    -------
    epub.EpubHtml
        The chapter item, ready to be added to a book.
    """
    # The file name doubles as the link target inside the book, so replace
    # characters that are invalid in file names or would be read as URL
    # syntax (e.g. '?' or '#'), and trim stray whitespace around the name.
    safe_name = re.sub(r'[\\/:*?"<>|#%]', '_', c_title).strip() or 'chapter'
    c_xhtml = safe_name + '.xhtml'

    # 'en' matches the language create_ebook sets on the book; the previous
    # 'hr' was left over from the ebooklib sample code.
    chapter = epub.EpubHtml(title=c_title, file_name=c_xhtml, lang='en')
    chapter.content = u'<h1>' + c_title + '</h1>' + str(c_content)

    return chapter

In [32]:
def create_ebook(title, chapters):
    """Assemble the given chapters into an EPUB and write it to disk.

    Parameters
    ----------
    title : str
        Book title; also the stem of the output file (``title + '.epub'``,
        relative to the current working directory).
    chapters : dict
        Maps chapter title to chapter HTML content. Chapters are added in the
        dict's iteration order.

    Returns
    -------
    epub.EpubBook
        The book object that was written.
    """
    magic_stories_epub = epub.EpubBook()
    magic_stories_epub.set_title(title)
    magic_stories_epub.set_language('en')

    ebook_chapters = []
    for c_title, c_content in chapters.items():
        chap = create_chapter(c_title, c_content)
        ### add chapter
        magic_stories_epub.add_item(chap)
        ebook_chapters.append(chap)

    ### define Table Of Contents
    # A single 'Stories' section holding every chapter. The placeholder entry
    # epub.Link('Test1', 'Test2', 'Test3') that used to precede it pointed at
    # a file that is not part of the book and showed up as a dead TOC link.
    magic_stories_epub.toc = ((epub.Section('Stories'), ebook_chapters),)

    ### add default NCX and Nav file
    magic_stories_epub.add_item(epub.EpubNcx())
    magic_stories_epub.add_item(epub.EpubNav())

    ### define CSS style
    # NOTE(review): this stylesheet is added to the book but nothing in this
    # function links it to a chapter or to the nav document.
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    ### add CSS file
    magic_stories_epub.add_item(nav_css)

    ### basic spine
    # Reading order: the generated nav page first, then the chapters.
    spine = ['nav']
    spine.extend(ebook_chapters)
    magic_stories_epub.spine = spine

    ### write to the file
    epub.write_epub(title + '.epub', magic_stories_epub, {})
    print(title + '.epub exported.')
    return magic_stories_epub

# Full crawler

In [31]:
# Crawl every story of every set and export one EPUB per entry in `groups`.
for groupName in groups:
    groupData = grouped_table.get_group(groupName)

    # Chapter title -> chapter HTML for the current set.
    chapters = {}
    for link in groupData["Link"]:
        c_title, c_content = createChapter(link)
        print('{} - created.'.format(c_title))
        chapters[c_title] = c_content

    create_ebook(groupName, chapters)

Odric, Master Tactician  - created.
Xathrid Gorgon  - created.
Chronomaton  - created.
Krenko, Mob Boss  - created.
Threadbare  - created.
Talrand, Sky Summoner  - created.
The Stonekiller  - created.
The Stonekiller, Part 2  - created.
Magic 2013.epub done.
The Shadows of Prahv, Part 1  - created.
The Shadows of Prahv, Part 2  - created.
Epic Experiment  - created.
In Praise of the Worldsoul, Part 1  - created.
In Praise of the Worldsoul, Part 2  - created.
In Praise of the Worldsoul, Part 3  - created.
Slaughter Games  - created.
The Great Concourse  - created.
The Azorius Ten Most Wanted  - created.
The Seven Bells, Part 1  - created.
The Seven Bells, Part 2  - created.
Rogue's Passage  - created.
Return to Ravnica.epub done.
Gruul Ingenuity  - created.
The Fathom Edict  - created.
The Absolution of the Guildpact  - created.
Persistence of Memory  - created.
The Burying, Part 1  - created.
The Greater Good  - created.
The Guild of Deals  - created.
Experiment One  - created.
Fblthp 

'http://magic.wizards.com/en/articles/archive/magic-story/odric-master-tactician-2012-06-26'

In [None]:
def saveImages(url, timeout=30):
    """Collect the image URLs embedded in one story article.

    NOTE(review): despite its name, this function does not write any file
    yet; it only locates the images. Downloading them is still to be done.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The ``src`` attribute of every ``<img>`` inside the article container
        that has one; empty if the page has no such container.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Bind the result to `response`: a typo (`esponse`) previously left this
    # function reading the notebook-level `response`, i.e. a different page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    sub_soup = BeautifulSoup(response.text, 'html.parser')

    # The id belongs to the article <div>, not to the images, so find the
    # container first and then the <img> tags inside it.
    article = sub_soup.find("div", id="content-detail-page-of-an-article")
    if article is None:
        return []
    return [img.get('src') for img in article.find_all("img") if img.get('src')]

In [None]:
# Run the image step for every story of every set.
for groupName in groups:
    groupData = grouped_table.get_group(groupName)
    for link in groupData.Link:
        saveImages(link)
        # Report the link that was just processed. The message used to print
        # c_title, which this loop never assigns, so it showed a stale title
        # left over from another cell (or raised NameError without one).
        print(link + ' - saved.')

In [50]:
# Exploration: list the <img> tags of the first article of the first set.
groupName = groups[0]
groupData = grouped_table.get_group(groupName)
# .iloc[0] selects the first row by position. Plain [0] is a label lookup and
# only works when this group happens to contain the row labelled 0.
link = groupData.Link.iloc[0]
response = requests.get(link)
temp_doc = response.text
sub_soup = BeautifulSoup(temp_doc, 'html.parser')
c1_title = str(makeTitlePretty(sub_soup.title))

content = sub_soup.find_all("div", id="content-detail-page-of-an-article")
test = content[0]
# Last expression of the cell, so the result is shown as the cell output.
test.find_all("img")

[<img align="right" cap="" src="http://media.wizards.com/images/magic/tcg/products/m13/1l1lk1mylk_en.jpg" width="225"><p><img style="magic">A town crier hollered the evening news on the cobblestone street below the open window. "Execution at Bloodless Wall! Tomorrow at sunrise! The Healers of Heron are at Child's Wall tomorrow..."</img></p>
 <p>When Odric last looked out the window, it had been early afternoon. Now a cold mist had settled on Thraben, and the city was cloaked in evening shadows. <i>Where is the moon?</i> Odric's arm jerked involuntarily, nearly upending his ink jar. <i>No</i>, he reminded himself. <i>It no longer matters</i>. The phases of the moon were no longer predictors of life and death now that Avacyn had returned and cleansed the world. <i>Or at least begun to...</i></p>
 <p>He glanced across the oak table at Grete, his lieutenant, who looked surprised by his sudden movement. Sir Odric, Master Tactician, Commander of the Gavony Riders and Recipient of the Moonsil

## Notes

http://magic.wizards.com/en/articles/archive - all articles

http://magic.wizards.com/en/articles/columns/magic-story - story listing page


from ebooklib import epub

### ebooklib sample: build a one-chapter book and write it to test.epub
# The book must be bound to `book`, the name every line below uses;
# it was previously assigned to `magic_stories_epub`, giving a NameError.
book = epub.EpubBook()

### set metadata
book.set_identifier('id123456')
book.set_title('Sample book')
book.set_language('en')

book.add_author('Author Authorowski')
book.add_author('Danko Bananko', file_as='Gospodin Danko Bananko', role='ill', uid='coauthor')

### create chapter
c1 = epub.EpubHtml(title='Intro', file_name='chap_01.xhtml', lang='hr')
c1.content=u'<h1>Intro heading</h1><p>Žaba je skočila u baru.</p>'

### add chapter
book.add_item(c1)

### define Table Of Contents
book.toc = (epub.Link('chap_01.xhtml', 'Introduction', 'intro'),
(epub.Section('Simple book'),
(c1, ))
)

### add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

### define CSS style
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

### add CSS file
book.add_item(nav_css)

### basic spine
book.spine = ['nav', c1]

### write to the file
epub.write_epub('test.epub', book, {})