# Import needed libraries

For security reasons you need to create credentials.py file with dict like the following:  
credentials = {  
    'kindle_email': "YOUR_EMAIL@kindle.com",  
    'your_gmail': "YOUR_GMAIL@gmail.com",  
    'gmailpass': "YOUR_PASS",  
               }  

For creating gmailpass, please follow the [link](https://myaccount.google.com/apppasswords). Note that the account security settings will have to "allow unsecure apps" for permission to use the Gmail SMTP server with TLS.  

**Do not forget to add this file to .gitignore**

In [1]:
# for not to lose progress and for updating without reloading
%autosave 180
%load_ext autoreload
%autoreload 2

import requests
import lxml.html as html
import re
import os, sys
import wget
import glob
import tempfile
import tarfile

from credentials import credentials

Autosaving every 180 seconds


# Parameters
## Input parameters

In [2]:
# the needed_article_url can be an arxiv URL or any string containing an arxiv ID.
needed_article_url = "https://arxiv.org/pdf/2107.12708.pdf"

# paper settings (decrease width/height to increase font)
landscape = False # False=horizontal, True=vertical
width = "4.2in"
height = "6.55in"
margin = "0.1in"

show_generated_pdf = True
send = False

## Other parameters

In [3]:
kindle_email = credentials['kindle_email']
your_gmail = credentials['your_gmail']
gmailpass = credentials['gmailpass']

# settings for latex geometry package:
if not landscape:
    width, height = height, width
geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)

# Download archived arxiv archive

In [5]:
arxiv_id = re.match(r'((http|https)://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', needed_article_url).group('id')
arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id
arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]
arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle), re.DOTALL)
arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title, re.DOTALL)

print('arxiv_id:', arxiv_id)
print('arxiv_title:', arxiv_title)

arxiv_id: 2107.12708
arxiv_title: QA Dataset Explosion: A Taxonomy of NLP Resources for Question Answering and Reading Comprehension


In [6]:
# create temporary directory
d = tempfile.mkdtemp(prefix='arxiv2kindle_')

archive_url = 'http://arxiv.org/e-print/' + arxiv_id

# download tar.gz file and add file extension
tar_filename = wget.download(archive_url, out=os.path.join(d, ''.join([arxiv_title, '.tar.gz'])))
tar_filename

100% [..........................................................................] 1819262 / 1819262

'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\QA Dataset Explosion: A Taxonomy of NLP Resources for Question Answering and Reading Comprehension.tar.gz'

# Process the files

In [7]:
# extract file contents
os.chdir(d)
tf = tarfile.open(tar_filename)
tf.extractall()

# find tex files
texfiles = glob.glob(os.path.join(d, '*.tex'))
display('found files with .tex extension', texfiles)

'found files with .tex extension'

['C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\02_explosion.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\03_probing-vs-info-seeking.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\043_format_question.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\04_format.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\05_modality.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\06_amount.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\07_domain.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\08_discourse.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\09_languages.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\10_reasoning.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\11_discussion.tex',
 'C:\\Users\\Oleg\\AppData\\Local\\Temp\\arxiv2kindle_72w6ntgm\\12_conclusi

In [8]:
def reorder_list(ls):
    # works also for 'domain'. may be fixed with regexp
    main_files = [file for file in ls if 'main.tex' in file]
    for file in main_files:
        ls.remove(file)
        ls.insert(0, file)

    template_files = [file for file in ls if 'template' in file]
    for file in template_files:
        ls.remove(file)
        ls.append(file)
    
    return ls

# finding the main tex file
def find_main_tex(texfiles):
    texfiles = reorder_list(texfiles)
    
    for texfile in texfiles:
        with open(texfile, 'r') as f:
            src = f.readlines()
        for line in src:
            if line.startswith('\documentclass'):
                print('correct file: ' + texfile)
                return texfile
            else:
                continue
    print('correct file not found')
    
texfile = find_main_tex(texfiles)

correct file: C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\main.tex


In [9]:
with open(texfile, 'r') as f:
    src = f.readlines()

In [10]:
# filter comments/newlines for easier debugging:
src = [line for line in src if line[0] != '%' and len(line.strip()) > 0]

# strip column stuff and stuff in documentclass line:
src[0] = re.sub(r'\b\d+pt\b', '', src[0]) # strip font size, ex. "11pt"
src[0] = re.sub(r'\b\w+column\b', '', src[0]) # strip 
src[0] = re.sub(r'\b\w+paper\b', '', src[0]) # strip paper size, ex. "letterpaper" or "a4paper"
src[0] = re.sub(r'(?<=\[),', '', src[0]) # remove extraneous starting commas
src[0] = re.sub(r',(?=[\],])', '', src[0]) # remove extraneous middle/ending commas

In [11]:
# for the case if we use the package after it was initialized with other params
src.insert(0, "\\PassOptionsToPackage{" + ','.join(k+'='+v for k,v in geom_settings.items()) +"}{geometry}\n")

In [12]:
# find begin{document}:
begindocs = [i for i, line in enumerate(src) if re.match(r'\s*\\begin{document}', line)]

try:
    assert(len(begindocs) == 1)
    src.insert(begindocs[0], '\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\n')
    src.insert(begindocs[0], '\\usepackage{times}\n')
    src.insert(begindocs[0], '\\pagestyle{empty}\n')
    if landscape:
        src.insert(begindocs[0], '\\usepackage{pdflscape}\n')
except:
    print('Beginning not found. Adding needd packages to the beginning of the file')
    try:
        if landscape:
            src.insert(0, '\\usepackage{pdflscape}\n')
        src.insert(0, '\\pagestyle{empty}\n')
        src.insert(0, '\\usepackage{times}\n')
        src.insert(0, '\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\n')
    except:
        print('assert occured')

In [13]:
# shrink figures to be at most the size of the page:
for i, line in enumerate(src):
    m = re.search(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]', line)
    if m:
        mul = m.group(1)
        print(m)
        src[i] = re.sub(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]',
                   r'\\includegraphics[width={mul}\\textwidth,height={mul}\\textheight,keepaspectratio]'.format(mul=mul),
                   line)

In [14]:
# replace phrases or words found in dictionary
def normalize_by_dictionary(word, dictionary):
    result = []
    for word in word.split():
        # if word is in uppercase
        if word == word.upper():
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()].upper())
            else:
                result.append(word)
        else:
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()])
            else:
                result.append(word)
    
    return " ".join(result)


latex_rubbish_mapping = {'пјЏ'.lower(): '/', 'In': 'in'}

# replace this fucking latex shit
for i in range(len(src)):
    line = src[i].split()
    for j, word in enumerate(line):
            normalized_word = normalize_by_dictionary(word, latex_rubbish_mapping)
            if normalized_word != word:
                print(word, normalized_word)
                print(i)
            src[i] = src[i].replace(word, normalized_word)

In [15]:
# find non-compiling file extensions
folder_files = os.listdir()
bad_images = [file for file in folder_files if '.ps' in file or '.eps' in file]
print('bad_images', bad_images)

# replace strange formats with pdf images
for file in bad_images:
    print(file)
    filename = file.split('.')[0]
    ! ps2pdf {file} {''.join([filename,'.pdf'])}
    ! rm {file}

bad_images []


In [16]:
print('all files in article dir')
for path, subdirs, files in os.walk(os.getcwd()):
    for name in files:
        print(os.path.join(path, name))

all files in article dir
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\02_explosion.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\03_probing-vs-info-seeking.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\043_format_question.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\04_format.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\05_modality.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\06_amount.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\07_domain.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\08_discourse.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\09_languages.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\10_reasoning.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\11_discussion.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\12_conclusion.tex
C:\Users\Oleg\AppData\Local\Temp\arxiv2kindle_72w6ntgm\ACM-Reference-Format.bbx
C:\Users\Oleg\AppD

In [17]:
# src -> tex.bak and compile
os.rename(texfile, texfile+'.bak')
with open(texfile, 'w') as f:
    f.writelines(src)

# texout = !pdflatex {texfile} && pdflatex {texfile} && pdflatex {texfile}
texout = !pdflatex {texfile}

In [19]:
texout[-100:]

 'fined on input line 118.',
 '',
 '',
 '15 undefined on input line 118.',
 '',
 '',
 '',
 '',
 "QuAD_20_and_QuAC' on page 15 undefined on input line 120.",
 '',
 '',
 "aluating_Generalization_in_Reading_Comprehension' on page 15 undefined on input",
 ' line 120.',
 '',
 '',
 ' 121.',
 '',
 '',
 'n input line 124.',
 '',
 '',
 "wards_Human-like_Linguistic_Generalization' on page 15 undefined on input line ",
 '124.',
 '',
 '',
 "n_Finding_and_Removing_Artifacts_in_Language_Data' on page 15 undefined on inpu",
 't line 124.',
 '',
 '',
 'al_Language_Processing_Toward_Mitigating_System_Bias_and_Enabling_Better_Scienc',
 "e' on page 15 undefined on input line 127.",
 '',
 '',
 "sets' on page 15 undefined on input line 127.",
 '',
 '',
 "orting' on page 15 undefined on input line 127.",
 '',
 '',
 'on page 15 undefined on input line 127.',
 '',
 ') (12_conclusion.tex) (main.bbl [15.15]',
 'Underfull \\vbox (badness 10000) has occurred while \\output is active [16.16]',
 'Underfull \\vbox (

In [20]:
pdffilename = texfile[:-4] + '.pdf'

In [21]:
if show_generated_pdf:
    # for displaying pdf
    from wand.image import Image as WImage
    
    print('landscape', landscape, 'width', width, 'height', height, 'margin', margin)
    #! {pdffilename}
    # load first page just for check
    i = 0
    filename, file_extension = os.path.splitext(pdffilename)
    while(1):
        try:
            filename_to_open = ''.join([filename, file_extension + '[{}]'.format(i)])
            #print('filename_to_open', filename_to_open)
            img = WImage(filename=filename_to_open)
            display(img)
            i += 1
        except:
            break

landscape False width 6.55in height 4.2in margin 0.1in


# Sending message from google mail to kindle

In [23]:
import smtplib

from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

msg = MIMEMultipart()
pdf_part = MIMEApplication(open(texfile[:-4]+'.pdf', 'rb').read(), _subtype='pdf')
pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_title_scrubbed+".pdf")
msg.attach(pdf_part)

In [25]:
if send:
    with smtplib.SMTP_SSL('smtp.gmail.com') as server:
        server.set_debuglevel(0)
        server.ehlo()
        #server.starttls()  
        server.login(your_gmail, gmailpass)
        server.sendmail(your_gmail, kindle_email, msg.as_string())
        server.close()