|
| 1 | +import json |
| 2 | +import sys |
| 3 | +import os |
| 4 | +import io |
| 5 | +import re |
| 6 | + |
def join(lines):
    """Concatenate a sequence of strings into one string, adding nothing between them."""
    glue = ''
    return glue.join(lines)
| 9 | + |
def in_prompt(prompt_number):
    """Return the input prompt that precedes a code cell's source.

    prompt_number is the cell's execution count. The notebook format
    allows it to be null (``None`` after loading) for a cell that was
    never run; such cells get the blank prompt ``In [ ]: `` instead of
    failing with a ``TypeError`` in the ``%d`` conversion.
    """
    if prompt_number is None:
        return 'In [ ]: '
    return 'In [%d]: ' % prompt_number
| 12 | + |
def out_prompt(prompt_number):
    """Return the output prompt that precedes a code cell's outputs.

    prompt_number is the cell's execution count. A missing count
    (``None``) yields the blank prompt ``Out[ ]: `` rather than a
    ``TypeError`` from the ``%d`` conversion, mirroring the input prompt.
    """
    if prompt_number is None:
        return 'Out[ ]: '
    return 'Out[%d]: ' % prompt_number
| 15 | + |
def add_prompt(lines, prompt):
    """Put the prompt in front of the first line and pad the others to line up.

    Every line after the first is prefixed with as many spaces as the
    prompt is long. Returns a new list; lines must not be empty.
    """
    padding = ' ' * len(prompt)
    first, rest = lines[0], lines[1:]
    prompted = [prompt + first]
    prompted.extend(padding + line for line in rest)
    return prompted
| 20 | + |
def indent(lines):
    """Indent each line by four spaces, as Markdown requires for code samples.

    Markdown only treats a line as part of a code block when it is
    indented by at least four spaces (or a tab), so a shorter prefix
    would render the sample as ordinary text.
    """
    return ['    ' + line for line in lines]
| 24 | + |
def code(lines):
    """Return the lines as one string, indented as a Markdown code sample."""
    indented = indent(lines)
    return join(indented)
| 27 | + |
| 28 | + |
formulas = re.compile(r'(\$\$?)([^\$]+)(\$\$?)')
def replace_formulas(text):
    """Rewrite ``$...$`` / ``$$...$$`` formulas with Leanpub's ``{$$}...{/$$}`` delimiters.

    Only the text between the dollar signs (group 2 of the pattern) is
    kept; the original delimiters are dropped.
    """
    leanpub_form = r'{$$}\2{/$$}'
    return formulas.sub(leanpub_form, text)
| 33 | + |
def text(lines):
    """Join the lines of a text cell and convert its formulas to Leanpub form."""
    joined = join(lines)
    return replace_formulas(joined)
| 36 | + |
| 37 | + |
def convert_markdown(cell, out):
    """Write a Markdown cell to out, followed by a blank line.

    When the cell text starts with '#' (a heading), an extra newline is
    written before it.
    """
    body = text(cell['source'])
    is_heading = body.startswith('#')
    if is_heading:
        out.write(u'\n')
    for chunk in (body, u'\n\n'):
        out.write(chunk)
| 45 | + |
| 46 | + |
def convert_raw(cell, out):
    """Copy a raw cell's source to out unchanged, followed by a blank line."""
    raw_text = join(cell['source'])
    out.write(raw_text)
    out.write(u'\n\n')
| 50 | + |
| 51 | + |
def convert_code(cell, out, base_name, output_dir):
    """Write a code cell to out: its source with an In prompt, then its outputs.

    cell is a code-cell dict with 'execution_count', 'source' and
    'outputs' keys. base_name and output_dir are only passed on to
    convert_image for 'display_data' outputs.

    Raises Exception for an output type other than 'execute_result',
    'stream', 'error' or 'display_data'.
    """
    # NOTE(review): the nesting in this function was reconstructed from a
    # copy that had lost its indentation -- confirm against the original,
    # in particular which writes belong inside the `if` and the `for`.
    prompt_number = cell['execution_count']
    if cell['source']:
        out.write(code(add_prompt(cell['source'],
                                  in_prompt(prompt_number))))
        out.write(u'\n')
    # Type of the previous output; an output of the same type as its
    # predecessor is rendered as a continuation (without a new prompt).
    last_output_type = None
    for output in cell['outputs']:
        output_type = output['output_type']
        if output_type == 'execute_result':
            convert_result(output, out, prompt_number,
                           continued = (output_type == last_output_type))
        elif output_type == 'stream':
            convert_stream(output, out, prompt_number,
                           continued = (output_type == last_output_type))
        elif output_type == 'error':
            convert_error(output, out, prompt_number)
        elif output_type == 'display_data':
            # close the preceding text block before the image
            if last_output_type in ['execute_result', 'stream']:
                out.write(u'\n\n')
            convert_image(output, out, base_name, output_dir, prompt_number)
        else:
            raise Exception('unknown output type: %s' % output_type)
        last_output_type = output_type
    # `output` is the last item of the loop above. When there were no
    # outputs, last_output_type is still None and the short-circuit keeps
    # `output` from being evaluated.
    if last_output_type in ['execute_result', 'stream'] and not (
            'data' in output and 'text/html' in output['data']):
        out.write(u'\n\n')
    out.write(u'\n')
| 80 | + |
def convert_result(output, out, prompt_number, continued=False):
    """Write an 'execute_result' output to out.

    An output carrying 'text/html' data is handed to convert_html;
    otherwise its 'text/plain' lines are written as a code sample behind
    the Out prompt. When continued is true the prompt is not shown: the
    HTML branch skips it, the plain-text branch replaces it with spaces
    of the same length.
    """
    # NOTE(review): nesting reconstructed from a copy that had lost its
    # indentation -- confirm that the u'\n' write belongs inside
    # `if not continued`.
    out.write(u' \n')
    if 'data' in output and 'text/html' in output['data']:
        if not continued:
            # a line holding only the Out prompt, ahead of the table
            out.write(code(add_prompt([u''],
                                      out_prompt(prompt_number))))
            out.write(u'\n')
        convert_html(join(output['data']['text/html']), out)
    else:
        prompt = out_prompt(prompt_number)
        if continued:
            # we don't want the prompt, but we need to indent as if it
            # was there.
            prompt = ' '*len(prompt)
        out.write(code(add_prompt(output['data']['text/plain'], prompt)))
| 96 | + |
def convert_stream(output, out, prompt_number, continued=False):
    """Write a 'stream' output's text to out as a code sample.

    The first line carries the Out prompt; when continued is true the
    prompt is replaced by spaces of the same length so the text still
    lines up with the preceding output.
    """
    out.write(u' \n')
    label = out_prompt(prompt_number)
    # A continuation keeps the alignment but hides the prompt itself.
    leader = ' '*len(label) if continued else label
    prompted = add_prompt(output['text'], leader)
    out.write(code(prompted))
| 105 | + |
table_html = re.compile(r'<table.*?>(.*)</table>', re.DOTALL)
def convert_html(html, out):
    """Convert an HTML output to Markdown; only a <table> element is understood.

    The inner markup of the table is passed to convert_table.
    Raises Exception when html contains no <table> element.
    """
    found = table_html.search(html)
    if found is None:
        raise Exception('Unknown html: %s' % html)
    convert_table(found.group(1), out)
| 113 | + |
row_html = re.compile(r'<tr.*?>(.*?)</tr>', re.DOTALL)
cell_html = re.compile(r'<t[dh].*?>(.*?)</t[dh]>', re.DOTALL)
def convert_table(table, out):
    """Write the inner markup of an HTML table to out as a Leanpub Markdown table.

    table is the text between <table> and </table>. The first row is
    used as the header, followed by a dashed separator line and the
    remaining rows; cells are right-aligned to the widest cell of their
    column. A {width="..."} attribute line ("narrow", "default" or
    "wide") is chosen from the rendered line length.

    A table without any row writes nothing, and rows with fewer cells
    than the widest row are padded with empty cells, instead of failing
    with an IndexError/TypeError.
    """
    data = [[x.strip() for x in cell_html.findall(r)]
            for r in row_html.findall(table)]
    if not data:
        # nothing to render
        return

    # Normalise ragged rows so every row has one cell per column.
    n_columns = max(len(row) for row in data)
    data = [row + [u'']*(n_columns - len(row)) for row in data]
    widths = [max(len(row[i]) for row in data) for i in range(n_columns)]

    def render(row):
        # one padded, right-aligned cell per column, wrapped in pipes
        cells = [' ' + cell.rjust(w) + ' ' for cell, w in zip(row, widths)]
        return '|' + '|'.join(cells) + '|\n'

    separator = '|' + '|'.join([u'-'*(w+2) for w in widths]) + '|\n'

    # Every rendered line has the same length as the separator
    # (trailing newline included).
    total_width = len(separator)
    if total_width <= 60:
        width = "narrow"
    elif total_width >= 80:
        width = "wide"
    else:
        width = "default"
    out.write(u'\n{width="%s"}\n' % width)

    out.write(render(data[0]))
    out.write(separator)
    for row in data[1:]:
        out.write(render(row))
    out.write(u'\n\n')
| 140 | + |
| 141 | + |
# ANSI "select graphic rendition" sequences: ESC [ <digits and ';'> m.
# Matching the escape character explicitly (rather than any character)
# keeps ordinary text such as 'a[0m' intact, and accepting any parameter
# list also strips codes like ESC[31m or ESC[38;5;196m.
terminal_codes = re.compile(r'\x1b\[[0-9;]*m')
def convert_error(output, out, prompt_number):
    """Write an 'error' output's traceback to out as a code sample.

    Each traceback entry may contain embedded newlines, so entries are
    split into individual lines first; terminal colour codes are removed
    from every line. The first line carries the Out prompt.
    """
    out.write(u' \n')
    # There are embedded \n in the entries, and control codes for the
    # terminal that must not end up in the Markdown.
    lines = [terminal_codes.sub('', piece) + '\n'
             for entry in output['traceback']
             for piece in entry.split('\n')]
    out.write(code(add_prompt(lines, out_prompt(prompt_number))))
    out.write(u'\n\n')
| 153 | + |
def convert_image(output, out, base_name, output_dir, prompt_number):
    """Extract the image of a 'display_data' output and reference it from out.

    The base64 payload is decoded and saved as
    output_dir/images/<base_name>-<prompt_number>.<ext> (spaces in
    base_name replaced by underscores; the images folder is created when
    missing), and a Markdown image reference to that file is written to
    out.
    """
    # Function-scope import so the block does not depend on the
    # file-level import list.
    import base64

    ext = extension(output)
    images_dir = os.path.join(output_dir, 'images')
    if not os.path.exists(images_dir):
        os.mkdir(images_dir)
    # NOTE(review): several images produced by the same cell share this
    # name and overwrite each other -- confirm whether that can happen.
    image_name = '%s-%d.%s' % (base_name.replace(' ', '_'),
                               prompt_number, ext)
    image_path = os.path.join(images_dir, image_name)

    payload = output['data']['image/%s' % ext]
    if isinstance(payload, list):
        # the notebook format may store a multi-line string as a list
        payload = ''.join(payload)
    # Decoded image data is binary: use base64.b64decode (str.decode('base64')
    # only exists on Python 2) and write it in binary mode.
    with open(image_path, 'wb') as image:
        image.write(base64.b64decode(payload))

    out.write(u'\n')
    # NOTE(review): the image markup was missing from the format string
    # (formatting an empty string raises TypeError); a plain Markdown
    # image reference relative to the images folder is assumed -- confirm.
    out.write(u'![](images/%s)' % image_name)
    out.write(u'\n\n')
| 167 | + |
def extension(output):
    """Return the image sub-type (e.g. 'png') of a display output.

    The output's 'data' keys, minus 'text/plain', must leave exactly one
    'image/<ext>' MIME type; the part after 'image/' is returned.

    Raises Exception when no key remains, when several remain, or when
    the remaining key is not an image type.
    """
    candidates = set(output['data'].keys()) - {'text/plain'}
    if not candidates:
        # report the problem explicitly instead of letting set.pop() fail
        # with "pop from an empty set"
        raise Exception('no image data found in output')
    # whatever key remains should be the extension
    if len(candidates) > 1:
        raise Exception('multiple extensions found: %s' % candidates)
    candidate = str(candidates.pop())
    if not candidate.startswith('image/'):
        raise Exception('not an image type: %s' % candidate)
    return candidate[len('image/'):]
| 177 | + |
def convert(path, output_dir):
    """Convert the notebook at path into a Markdown file in output_dir.

    The output file is named after the notebook: lower-cased base name
    plus '.md'. Cells are dispatched by their 'cell_type' ('markdown',
    'code' or 'raw').

    Both files are opened explicitly as UTF-8 so that non-ASCII notebook
    content does not depend on the platform's default encoding.

    Raises Exception for any other cell type.
    """
    filename = os.path.basename(path)
    base_name = os.path.splitext(filename)[0].lower()
    md_name = base_name + '.md'

    with io.open(path, encoding='utf-8') as f:
        data = json.load(f)
    cells = data['cells']

    md_path = os.path.join(output_dir, md_name)
    with io.open(md_path, 'w', encoding='utf-8') as out:
        for cell in cells:
            cell_type = cell['cell_type']
            if cell_type == 'markdown':
                convert_markdown(cell, out)
            elif cell_type == 'code':
                convert_code(cell, out, base_name, output_dir)
            elif cell_type == 'raw':
                convert_raw(cell, out)
            else:
                raise Exception('unknown cell type: %s' % cell_type)
| 199 | + |
| 200 | + |
# Command-line entry point: expects the notebook path and the output
# directory as the only two arguments.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Wrong number of arguments: show the usage text and exit with a
        # non-zero status.
        # NOTE(review): the whitespace inside the usage text was
        # reconstructed from a copy that had lost its indentation.
        print("""
    Usage: %s notebook.ipynb output_dir

    The file notebook.md will be created in output_dir; if the
    notebook contains images, they will be extracted and stored
    in the output_dir/images folder.
    """ % sys.argv[0])
        sys.exit(1)

    # The notebook path is made absolute; the output directory is used
    # as given.
    convert(os.path.abspath(sys.argv[1]), sys.argv[2])
| 213 | + |
0 commit comments