# Import needed libraries

For security reasons you need to create credentials.py file with dict like the following:  
credentials = {  
    'kindle_email': "YOUR_EMAIL@kindle.com",  
    'your_gmail': "YOUR_GMAIL@gmail.com",  
    'gmailpass': "YOUR_PASS",  
               }  

For creating gmailpass, please follow the [link](https://myaccount.google.com/apppasswords). Note that the account security settings will have to "allow unsecure apps" for permission to use the Gmail SMTP server with TLS.  

**Do not forget to add this file to .gitignore**

In [2]:
# for not to lose progress and for updating without reloading
%autosave 180
%load_ext autoreload
%autoreload 2

import requests
import lxml.html as html
import re
import urllib
import os, sys, subprocess, os.path
import wget
import glob
import getpass
import tempfile
import tarfile

from credentials import credentials

# for sending msg
import smtplib

os.getcwd()

Autosaving every 180 seconds
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'C:\\Users\\Oleg\\Documents\\SourceTree\\arxiv2kindle'

# Parameters
## Inputr parameters

In [3]:
# the needed_article_url can be an arxiv URL or any string containing 
# an arxiv ID.
needed_article_url = "http://arxiv.org/abs/1709.03856"

# paper settings (decrease width/height to increase font)
landscape = False #horizontal
width = "4.2in"
height = "6.55in"
margin = "0.1in"

show_generated_pdf = True
send = False

## Other parameters

In [4]:
kindle_email = credentials['kindle_email']
your_gmail = credentials['your_gmail']
gmailpass = credentials['gmailpass']

# settings for latex geometry package:
if landscape:
    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
else:
    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)

# Download archived arxiv archive

In [5]:
arxiv_id = re.match(r'(http://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', needed_article_url).group('id')
arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id
arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]
arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle), re.DOTALL)
arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title, re.DOTALL)

print('arxiv_id:', arxiv_id)
print('arxiv_title:', arxiv_title)

arxiv_id: 1709.03856
arxiv_title: StarSpace: Embed All The Things!


In [6]:
# create temporary directory
d = tempfile.mkdtemp(prefix='arxiv2kindle_')

archive_url = 'http://arxiv.org/e-print/' + arxiv_id

# download tar.gz file and add file extension
tar_filename = wget.download(archive_url, out=os.path.join(d, ''.join([arxiv_title, '.tar.gz'])))
tar_filename

  0% [                                                                              ]     0 / 28399 28% [......................                                                        ]  8192 / 28399 57% [............................................                                  ] 16384 / 28399 86% [...................................................................           ] 24576 / 28399100% [..............................................................................] 28399 / 28399

'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_gtvllduh\\StarSpace: Embed All The Things!.tar.gz'

In [7]:
# extract file contents
os.chdir(d)
tf = tarfile.open(tar_filename)
tf.extractall()

# find tex files
texfiles = glob.glob(os.path.join(d, '*.tex'))
display('found files with .tex extension', texfiles)

for texfile in texfiles:
    with open(texfile, 'r') as f:
        src = f.readlines()
    if 'documentclass' in src[0]:
        print('correct file: ' + texfile)
        break

'found files with .tex extension'

['C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_gtvllduh\\aaai_main.tex']

In [8]:
# filter comments/newlines for easier debugging:
src = [line for line in src if line[0] != '%' and len(line.strip()) > 0]

# strip column stuff and stuff in documentclass line:
src[0] = re.sub(r'\b\d+pt\b', '', src[0]) # strip font size, ex. "11pt"
src[0] = re.sub(r'\b\w+column\b', '', src[0]) # strip 
src[0] = re.sub(r'\b\w+paper\b', '', src[0]) # strip paper size, ex. "letterpaper" or "a4paper"
src[0] = re.sub(r'(?<=\[),', '', src[0]) # remove extraneous starting commas
src[0] = re.sub(r',(?=[\],])', '', src[0]) # remove extraneous middle/ending commas

In [9]:
# find begin{document}:
begindocs = [i for i, line in enumerate(src) if re.match(r'\s*\\begin{document}', line)]

try:
    assert(len(begindocs) == 1)
    src.insert(begindocs[0], '\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\n')
    src.insert(begindocs[0], '\\usepackage{times}\n')
    src.insert(begindocs[0], '\\pagestyle{empty}\n')
    if landscape:
        src.insert(begindocs[0], '\\usepackage{pdflscape}\n')
except:
    print('assert occured')

In [10]:
# shrink figures to be at most the size of the page:
for i in range(len(src)):
    line = src[i]
    m = re.search(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]', line)
    if m:
        mul = m.group(1)
        print(m)
        src[i] = re.sub(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]',
                   r'\\includegraphics[width={mul}\\textwidth,height={mul}\\textheight,keepaspectratio]'.format(mul=mul),
                   line)

In [11]:
# replace phrases or words fount in dictionary
def normalize_by_dictionary(word, dictionary):
    #print('normalizing', word)
    
    result = []
    for word in word.split():
        # if word is in uppercase
        if word == word.upper():
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()].upper())
            else:
                result.append(word)
        else:
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()])
            else:
                result.append(word)
    
    return " ".join(result)


latex_shit_mapping = {'пјЏ'.lower(): '/', 'In': 'in'}

# replace this fucking latex shit
for i in range(len(src)):
    line = src[i].split()
    for j, word in enumerate(line):
#         if word.lower() in list(latex_shit_mapping.keys()):
            normalized_word = normalize_by_dictionary(word, latex_shit_mapping)
            if normalized_word != word:
                print(word, normalized_word)
                print(i)
            src[i] = src[i].replace(word, normalized_word)

пјЏ /
480


In [12]:
# find non-compiling file extensions
folder_files = os.listdir()
bad_images = [file for file in folder_files if '.ps' in file or '.eps' in file]
print('bad_images', bad_images)

print('all files in article dif', os.listdir())

# replace strange formats with pdf images
for file in bad_images:
    print(file)
    filename = file.split('.')[0]
    ! ps2pdf {file} {''.join([filename,'.pdf'])}
    ! rm {file}

bad_images []
all files in article dif ['aaai.bst', 'aaai18.sty', 'aaai_main.bbl', 'aaai_main.tex', 'StarSpace']


In [19]:
# src -> tex.bak and compile
os.rename(texfile, texfile+'.bak')
with open(texfile, 'w') as f:
    f.writelines(src)

texout = !pdflatex {texfile} && pdflatex {texfile} && pdflatex {texfile}
texout[:10]

FileExistsError: [WinError 183] Невозможно создать файл, так как он уже существует: 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_gtvllduh\\aaai_main.tex' -> 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_gtvllduh\\aaai_main.tex.bak'

In [14]:
pdffilename = texfile[:-4] + '.pdf'
if sys.platform == 'darwin':
    os.system('open ' + pdffilename)
else:
    os.system('xdg-open ' + pdffilename)

In [17]:
if show_generated_pdf:
    # for displaying pdf
    from wand.image import Image as WImage
    
    print('landscape', landscape, 'width', width, 'height', height, 'margin', margin)
    #! {pdffilename}
    i = 0
    filename, file_extension = os.path.splitext(pdffilename)
    while(1):
        try:
            filename_to_open = ''.join([filename, file_extension + '[{}]'.format(i)])
            #print('filename_to_open', filename_to_open)
            img = WImage(filename=filename_to_open)
            display(img)
            i += 1
        except:
            break

landscape False width 4.2in height 6.55in margin 0.1in


# Sending message from google mail to kindle

In [15]:
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

msg = MIMEMultipart()
pdf_part = MIMEApplication(open(texfile[:-4]+'.pdf', 'rb').read(), _subtype='pdf')
pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_title_scrubbed+".pdf")
msg.attach(pdf_part)

In [18]:
if send:
    with smtplib.SMTP_SSL('smtp.gmail.com') as server:
        server.set_debuglevel(0)
        server.ehlo()
        #server.starttls()  
        server.login(your_gmail, gmailpass)
        server.sendmail(your_gmail, kindle_email, msg.as_string())
        server.close()