In [1]:
import sys
!{sys.executable} -m pip install python-hebrew-numbers
# sys.path
# sys.path.append(r'C:\Users\Aviezer\.virtualenvs\Shatz_Project-Y4ZvSTsa\Lib\site-packages')
# sys.path



In [2]:
#imports various packages
import json, urllib.request
from urllib.parse import urlparse, parse_qs
import os, platform, subprocess, csv
import math
from hebrew_numbers import int_to_gematria

In [3]:
#this function pulls a text from Sefaria's github repo, given a string with the name of the text in the repo's format
#this will presumably be replaced with pulling a text from a downloaded copy of the sefaria repo
def pull_text(string_for_link):
    """Download and parse one text from the Sefaria-Export GitHub repository.

    Args:
        string_for_link: Path of the text inside the repository's ``json/``
            folder, without the ``.json`` extension.

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.URLError: If the file cannot be downloaded.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Percent-encode spaces and non-ASCII characters, which urlopen rejects in
    # a URL. "/" and "%" are left alone so path separators and names that are
    # already percent-encoded pass through unchanged.
    quoted_name = urllib.parse.quote(string_for_link, safe="/%")
    link = "https://raw.githubusercontent.com/Sefaria/Sefaria-Export/master/json/"+quoted_name+".json"
    with urllib.request.urlopen(link) as url:
        text_json = json.loads(url.read().decode())
    return text_json

In [4]:
#this generates a list of links between texts in sefaria
#this is used to link comments in the gemara to gemara they're on
def pull_links():
    """Collect the commentary links from the local Sefaria link CSV files.

    Reads ``links/links0.csv`` through ``links/links8.csv`` as UTF-8, skips the
    first row of each file, and keeps every row whose third column is
    ``"commentary"``.

    Returns:
        A list of two-item lists holding the first two columns of each kept
        row, in file order.
    """
    collected = []
    for file_number in range(9):
        csv_path = 'links/links'+str(file_number)+'.csv'
        with open(csv_path, encoding="utf-8") as handle:
            rows = csv.reader(handle, delimiter=',')
            next(rows)  # the first row is skipped
            collected.extend(
                [row[0], row[1]] for row in rows if row[2] == "commentary"
            )
    return collected

In [5]:
# Load the commentary links once; make_body reads this module-level list when it calls match_chapters.
link_list = pull_links()#generates the list of comment links

In [6]:
def match_comment(comment_str, links):
    """Return the other side of the first link that mentions a comment.

    Args:
        comment_str: Text to look for; a link side matches when it contains
            this string (substring test).
        links: Sequence of two-item links.

    Returns:
        The opposite side of the first matching link (the first side of each
        link is checked before its second side), or None if no link matches.
    """
    for pair in links:
        first_side = pair[0]
        if comment_str in first_side:
            return pair[1]
        second_side = pair[1]
        if comment_str in second_side:
            return first_side
    return None

In [7]:
def get_index_json(masekhet):
    """Fetch the chapter nodes of a masekhet from the Sefaria-Export schemas.

    Args:
        masekhet: Name of the schema file (without ``.json``) to download.

    Returns:
        The value stored under ``["alts"]["Chapters"]["nodes"]`` in the
        downloaded schema, which holds the perek information.
    """
    base = "https://raw.githubusercontent.com/Sefaria/Sefaria-Export/master/schemas/"
    with urllib.request.urlopen(base+masekhet+".json") as response:
        schema = json.loads(response.read().decode())
    return schema["alts"]["Chapters"]["nodes"]

In [8]:
def find_perek(name):
    """Find the perek (chapter) that contains a gemara reference.

    Args:
        name: Reference of the form ``"<masekhet> <daf>"`` or
            ``"<masekhet> <daf>:<line>"``, e.g. ``"Bava Kamma 2a:5"``. The
            masekhet title may itself contain spaces.

    Returns:
        ``(title, heTitle)`` of the first chapter node whose refs cover the
        reference. If no chapter matches, the string ``"<masekhet> <daf>"`` is
        returned instead (kept as-is for existing callers).

    Raises:
        ValueError: If ``name`` has no space separating title and location, or
            a line number cannot be parsed as an integer.
    """
    # Split on the LAST space so multi-word titles ("Bava Kamma") stay intact.
    masekhet, separator, location = name.rpartition(' ')
    if not separator:
        raise ValueError("reference must look like '<masekhet> <daf>[:<line>]': "+repr(name))
    # The line part is optional; a reference may name a whole daf.
    daf, _, line = location.partition(":")
    ref = masekhet+" "+daf
    for chapter in _chapters_for(masekhet):
        for page in chapter["refs"]:
            page_ref, _, span = page.partition(":")
            if page_ref != ref:
                continue
            if span == "" or line == "":
                # Either the chapter ref covers the whole page, or the caller
                # gave no line: the first chapter listing this page wins.
                return chapter["title"],chapter['heTitle']
            sections = span.split("-")
            # sections[-1] also handles a ref that names a single line ("2a:5").
            if int(sections[0])<=int(line)<=int(sections[-1]):
                return chapter["title"],chapter['heTitle']
    return ref


# Chapter nodes already downloaded in this session, keyed by masekhet name.
_PEREK_INDEX_CACHE = {}


def _chapters_for(masekhet):
    """Return the chapter nodes for a masekhet, downloading them only once."""
    if masekhet not in _PEREK_INDEX_CACHE:
        _PEREK_INDEX_CACHE[masekhet] = get_index_json(masekhet)
    return _PEREK_INDEX_CACHE[masekhet]

In [9]:
def match_chapters(text_json, links):
    """Insert perek (chapter) markers into a daf-based text.

    For every top-level comment, the reference
    ``"<title> <daf><amud>:<comment number>"`` is looked up among the first
    sides of ``links``. The linked reference (cut at the first "-" if it is a
    range) is passed to ``find_perek``; whenever the result differs from the
    previous one, a dict ``{"name_en": ..., "name_he": ...}`` is inserted into
    the text directly before that comment.

    Args:
        text_json: Text document with "title" and "text" keys. It is modified
            in place; the keys "text_perakim" and "chap_list" are set to empty
            lists (they are not populated here).
        links: Sequence of two-item links; the first link whose first side
            equals a comment's reference is used.

    Returns:
        The same ``text_json`` object.
    """
    text_json["text_perakim"] = []#original text, but with perakim breaks noted
    text_json["chap_list"] = []#list of chapter titles
    title = text_json["title"]

    # Index the links once instead of scanning the whole list per comment.
    # setdefault keeps the first link for a reference, like the linear search.
    first_target = {}
    for link in links:
        first_target.setdefault(link[0], link[1])

    current_perek = ""
    for amud_index, daf in enumerate(text_json["text"]):
        if daf == []:
            continue
        # Sefaria puts a blank spot for daf 1, so entry 0 is 1a, entry 1 is 1b,
        # entry 2 is 2a and so on.
        daf_num = amud_index // 2 + 1
        amud = "a" if amud_index % 2 == 0 else "b"
        daf_ref = str(daf_num)+amud  # daf reference such as "4b"
        inserted = 0  # markers already added to this daf
        # Iterate over a snapshot: inserting into the list being iterated would
        # re-visit comments and shift their numbers.
        for position, _comment in enumerate(list(daf)):
            new_ref = title+" "+daf_ref+":"+str(position + 1)
            gemara_ref = first_target.get(new_ref, "")
            if "-" in gemara_ref:  # a range: only its start is looked up
                gemara_ref = gemara_ref.split("-")[0]
            if gemara_ref == "":
                continue
            ref_perek = find_perek(gemara_ref)
            if ref_perek == current_perek:
                continue
            if isinstance(ref_perek, str):
                # find_perek returns a bare reference string when it finds no
                # chapter; there is no title to record in that case.
                continue
            current_perek = ref_perek
            perek_info = {"name_en":ref_perek[0],"name_he":ref_perek[1]}
            daf.insert(position + inserted, perek_info)
            inserted += 1
    return text_json

In [10]:
def make_body(hebrew_text, english_text, settings):
    output = []
    chap_num = 1
    mishna_num = 1# lav davka mishna, just the smaller divisions of the text
    title = hebrew_text["heTitle"]
    title_command = r"\newcommand{\texttitle}{"+title+"}"#sets title
    divisions_en = hebrew_text["sectionNames"] #gets names of the sections for the specific text
    divisions_he = []
    #the following uses the CSV of section names to get the Hebrew sections names
    with open('resources/section_names.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)
        for row in csv_reader:
            for division in divisions_en:
                if row[0] == division:
                    divisions_he.append(row[1])
    if "Daf" in divisions_en:
        #if the text is based on dappim, run the script to add perakim notations
        hebrew_text = match_chapters(hebrew_text,link_list)       
    for perek in hebrew_text["text"]:
        if any(perek):
            if type(perek[0]) == dict and "name_he" in perek[0].keys():
                #if there's a new perek dict note, add the LaTeX code for the new perek
                output.append(r"\newchap{"+parse_perek_title(perek[0])+"}")
            if "Daf" in divisions_en:
                #this adds daf numbers for each new daf, ignoring the amud break
                daf = ((chap_num+1)/2)
                if daf == round(daf):
                    daftitle = int_to_gematria(round(daf), gershayim=False)
                    output.append(r"\newsection{דף "+daftitle+"}")              
            else:
                output.append(r"\newsection{"+divisions_he[0]+int_to_gematria(chap_num, gershayim=False)+"}")
            for par in perek:
                #prints next block of text
                textblock = ""
                if type(par) == dict and "name_he" in par.keys() and par["name_en"] != "Chapter 1":
                    if textblock != "":
                        new_text = make_section(textblock,None, settings, chap_num, mishna_num)
                        output.append(new_text)
                else:
                    while type(par)==list:
                        new_par = ""
                        for item in par:
                            new_par += item
                        par = new_par
                    if type(par) != dict:
                        textblock += par
                mishna_num += 1
            new_text = make_section(textblock,None, settings, chap_num, mishna_num)
            if "twocol" in new_text and "twocol" in output[-1]:
                new_text = new_text.replace(r"\twocol{","\par ")
                output = output[0:-1]+[output[-1][0:-1]]+[new_text]
            else:
                output.append(new_text)
        chap_num += 1
        mishna_num = 1
    return title_command, output

In [11]:
def parse_perek_title(perekDict):
    """Format a perek marker as the argument of the LaTeX chapter heading.

    Args:
        perekDict: Dict with "name_en" (the text "Chapter " is stripped from
            it, leaving the number) and "name_he" (the Hebrew perek name).

    Returns:
        The heading text: the chapter number wrapped in ``\\hebrewnumeral``
        followed by the Hebrew name.
    """
    number = perekDict["name_en"].replace("Chapter ","")
    pieces = [r"פרק \hebrewnumeral{", number, r"}\quad ", perekDict["name_he"]]
    return "".join(pieces)

In [12]:
def removeformatting(text):
    """Strip ``<...>`` tags from a string.

    The first "<" and the first ">" after it delimit a tag; every occurrence
    of that exact tag is removed, and the scan repeats until no complete tag
    is left.

    Args:
        text: String that may contain tags.

    Returns:
        The text without tags. A "<" that is never closed by a later ">" is
        left in place.
    """
    while True:
        start = text.find("<")
        if start == -1:
            break
        end = text.find(">", start)
        if end == -1:
            # No closing ">" after this "<" (e.g. "a > b < c"): nothing more
            # to strip. Looping on would never terminate.
            break
        text = text.replace(text[start:end + 1], "")
    return text

In [13]:
def make_section(hebrew_text, english, settings, chap_num, mishna_num):
    """Turn a block of text into a LaTeX ``\\textblock`` or ``\\twocol`` command.

    Args:
        hebrew_text: Main text of the block.
        english: Optional translation. When non-empty, square brackets in it
            are wrapped in braces and it becomes the second argument of
            ``\\textblock``.
        settings: Settings dict; "layout" is read when there is no translation.
        chap_num: Not used here.
        mishna_num: Not used here.

    Returns:
        The LaTeX command, after applying the replacements listed in
        ``resources/html_tags_to_tex.csv`` and removing any remaining tags
        with ``removeformatting``.
    """
    if english != "" and english is not None:
        english = english.replace("[","{[")
        english = english.replace("]","]}")
        output = r"\textblock{"+hebrew_text+"}{"+english+"}"
    elif settings["layout"] == "twocol":
        output= r"\twocol{"+hebrew_text+"}"
    else:
        output= r"\textblock{"+hebrew_text+"}"
    # Read as UTF-8 like the project's other text files, instead of the
    # platform default encoding.
    with open('resources/html_tags_to_tex.csv', encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)
        for row in csv_reader:
            # Skip blank/short rows, and rows with an empty tag: replacing ""
            # would insert the replacement between every character.
            if len(row) < 2 or row[0] == "":
                continue
            if row[0] in output:
                output = output.replace(row[0],row[1])
    output = removeformatting(output)
    return output

In [14]:
def set_format(template_lines,settings):
    """Fill the formatting placeholders of the LaTeX template.

    Each template line (without its trailing newline) is treated as a key:

    * a line equal to a key of ``settings`` becomes ``key=value,``;
    * the placeholder comments ``%setfontsize``, ``%sethebfont``,
      ``%setengfont``, ``%setparskip``, ``%pagenumber``, ``%header`` and
      ``%chapfontsize`` become the matching LaTeX commands;
    * every other line is passed through unchanged.

    Args:
        template_lines: Lines of the template, normally ending in a newline.
        settings: Settings dict read for the values named above.

    Returns:
        The list of output lines.

    Raises:
        ValueError: If "pagenumloc" or "headpos" holds an unsupported value
            when its placeholder is reached.
    """
    # What each evenhead/oddhead setting puts in the header; anything else
    # leaves the header empty.
    header_content = {
        "title": r"\texttitle",
        "chapter": r"\chapname",
        "titlechapter": r"\texttitle \space\textendash\space \chapname",
    }
    output = []
    for line in template_lines:
        # Drop only a real trailing newline, so a final line that lacks one
        # does not lose its last character.
        key = line[:-1] if line.endswith("\n") else line
        if key in settings.keys():
            output.append(key + "="+settings[key]+",\n")
        elif key == "%setfontsize":
            fontsize = settings["fontsize"]
            skip = fontsize * settings["spacing"]
            output.append(r"\fontsize{"+str(fontsize)+r"pt}{"+str(round(skip,1))+r"pt} \selectfont")
        elif key == "%sethebfont":
            if settings["hebboldfont"] == "":
                font = r"\setmainfont{"+settings["hebfont"]+r"}"
            else:
                font = r"\setmainfont[BoldFont = {"+settings["hebboldfont"]+r'}]{'+settings["hebfont"]+r"}"
            output.append(font)
        elif key == "%setengfont" and settings["engfont"] != 0:
            output.append(r'\newfontfamily\englishfont{'+settings["engfont"]+r'}')
        elif key == "%setparskip" and settings["parskip"] != 0:
            output.append(r'\setlength{\parskip}{'+settings["parskip"]+'}')
        elif key == "%pagenumber":
            location = settings["pagenumloc"]
            if location == "topouter":
                placement = r"\fancyhead[LO,RE]"
            elif location == "bottommiddle":
                placement = r"\fancyfoot[C]"
            else:
                raise ValueError("unsupported pagenumloc setting: "+repr(location))
            if settings["pagenumheb"] == True:
                number = r"\hebrewnumeral{\thepage}"
            else:
                number = r"\thepage"
            output.append(placement+"{"+number+"}")
        elif key == "%header":
            position = settings["headpos"]
            if position == "center":
                odd_slot, even_slot = "CO", "CE"
            elif position == "inner":
                odd_slot, even_slot = "RO", "LE"
            else:
                raise ValueError("unsupported headpos setting: "+repr(position))
            even_text = header_content.get(settings["evenhead"], "")
            odd_text = header_content.get(settings["oddhead"], "")
            output.append(r"\fancyhead["+odd_slot+"]{"+odd_text+"}")
            output.append(r"\fancyhead["+even_slot+"]{"+even_text+"}")
        elif key == "%chapfontsize":
            if "chapfontsize" in settings.keys():
                headerfontcommand = r"\fontsize{"+settings["chapfontsize"]+"}{"+settings["chapfontsize"]+r"}\selectfont"
            else:
                headerfontcommand = r"\LARGE"
            output.append(headerfontcommand)
        else:
            output.append(line)
    return output

In [15]:
def get_bib_info(json):
    """Extract the bibliographic fields of a text document.

    Args:
        json: Text document with "versionSource", "license" and
            "versionTitle" keys.

    Returns:
        A dict with the keys "source", "license" and "version".
    """
    return {
        "source": json["versionSource"],
        "license": json["license"],
        "version": json["versionTitle"],
    }

def print_source_data(source_list):
    """Render the bibliographic info as the lines of a LaTeX itemize list.

    Args:
        source_list: Dicts with "version", "license" and "source" keys.

    Returns:
        The list of LaTeX lines. When there is more than one source, each
        source's licence and URL are nested in their own itemize. If a licence
        contains "NC" or "Copyright", ``["NC", <version>]`` for that source is
        returned instead.
    """
    lines = [r"\begin{itemize}"]
    for entry in source_list:
        license_text = entry["license"]
        if "NC" in license_text or "Copyright" in license_text:
            return ["NC",entry["version"]]
        lines.append(r"\item "+entry["version"].replace("-",r"\textendash "))
        nested = len(source_list) > 1
        if nested:
            lines.append(r"\begin{itemize}")
        lines.extend([
            r"\item License: "+entry["license"],
            r"\item Source: \url{"+entry["source"]+"}",
        ])
        if nested:
            lines.append(r"\end{itemize}")
    lines.append(r"\end{itemize}")
    return lines

In [16]:
#path of the LaTeX template file (resources/input.tex, relative to the working directory); passed to pullinput in the build cell
inputpath = os.path.join("resources","input.tex")
def pullinput(inputpath):
    """Read the template file and return its lines.

    Args:
        inputpath: Path of the UTF-8 template file.

    Returns:
        A list with one string per line, line endings included.
    """
    with open(inputpath, mode='r', encoding='utf-8') as template_file:
        return [raw_line for raw_line in template_file]

#converts input into output
def writeoutput(outputpath, template, formatting):
    """Fill the template with a text and write the resulting LaTeX file.

    Args:
        outputpath: Path of the ``.tex`` file to write (UTF-8).
        template: Template lines, as returned by ``pullinput``.
        formatting: Settings dict. "text" names the text to pull; an optional
            non-empty "translation" names a translation to pull as well. The
            whole dict is also passed to ``set_format`` and ``make_body``.

    Returns:
        None. If a source's licence does not allow reuse, a message is printed
        and no file is written.
    """
    template_with_settings = set_format(template,formatting)#reads settings
    sefaria_json = pull_text(formatting["text"])#pulls json from Sefaria
    sources = [get_bib_info(sefaria_json)]#bibliographic info for every text used
    english_json = None
    if formatting.get("translation"):
        #pulls translation, if any, and adds to bibliographic list
        english_json = pull_text(formatting["translation"])
        sources.append(get_bib_info(english_json))

    # Check the licences before building the body, so a text that may not be
    # used stops the run without the (slow) chapter matching.
    source_listing = print_source_data(sources)
    if source_listing[0] == "NC":#stops the script if the license doesn't allow the text to run
        print(source_listing[1] + " has a license which does not allow creation of this text.")
        return

    title_command, body = make_body(sefaria_json, english_json, formatting)
    with open(outputpath, 'w', encoding='utf-8') as outfile:
        for line in template_with_settings:
            # Compare without the trailing newline so a placeholder on the
            # file's last line is recognised too.
            marker = line[:-1] if line.endswith("\n") else line
            if marker == "%title_here":
                # The newline keeps the title command from running into the
                # next template line.
                outfile.write(title_command + "\n")
            elif marker == "%license info":
                for item in source_listing:
                    outfile.write(item + "\n")
            elif marker == "%body_here":
                for newline in body:
                    outfile.write(newline + "\n")
            else:
                outfile.write(line)
                if "\n" not in line:
                    outfile.write("\n")

In [17]:
#this cell defines a helper that rotates every page of a PDF by 180 degrees
from pdfrw import PdfReader, PdfWriter
#flips PDF for print on demand
def flip_PDF(inpfn):
    """Write a copy of a PDF with every page rotated by 180 degrees.

    The copy is saved next to the input as ``<name>.rotated<ext>`` (for
    example ``output.pdf`` becomes ``output.rotated.pdf``) and its path is
    printed.

    Args:
        inpfn: Path of the PDF to rotate.

    Returns:
        None.
    """
    rotate = 180

    # splitext only touches the final extension, so names that contain ".pdf"
    # elsewhere (or not at all) still give a sensible output name.
    root, ext = os.path.splitext(inpfn)
    outfn = root+".rotated"+(ext or ".pdf")
    print(outfn)

    trailer = PdfReader(inpfn)
    for page in trailer.pages:
        # Add to any rotation the page already has (directly or inherited).
        page.Rotate = (int(page.inheritable.Rotate or 0) + rotate) % 360

    outdata = PdfWriter(outfn)
    outdata.trailer = trailer
    outdata.write()
    return

ModuleNotFoundError: No module named 'pdfrw'

In [None]:

# Read the LaTeX template and the book settings, then write the filled-in .tex file.
template_lines = pullinput(inputpath)
outputname = "output.tex"
with open('book_settings.json', encoding='utf-8') as json_file:
    book_settings = json.load(json_file)
writeoutput(outputname,template_lines,book_settings)

In [None]:
# Compile the .tex file with xelatex, twice in a row.
# NOTE(review): presumably the second pass lets references/headers settle - confirm.
for _pass in range(2):
    subprocess.run(['xelatex', '-interaction=nonstopmode', outputname])
# Keep outputname pointing at the .tex file so this cell can be re-run; the PDF
# name gets its own variable.
pdf_name = os.path.splitext(outputname)[0]+".pdf"
flip_PDF(pdf_name)

FileNotFoundError: [WinError 2] The system cannot find the file specified