In [None]:
## Install bs4 and lxml before starting

import urllib, urllib.request

def run_search(search_term,max_results):
    """Query the arXiv export API and return the raw Atom feed as text.

    Args:
        search_term: Query string sent as the ``search_query`` parameter.
            Spaces become ``+``; other URL-unsafe characters are
            percent-encoded.
        max_results: Maximum number of entries to request. Interpolated into
            the URL as-is, so an int or a numeric string both work.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: if the request fails (propagated unchanged).
    """
    # Function-scope import keeps this cell self-contained (stdlib only).
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" exactly like the previous
    # ``replace(" ", "+")`` did, but also escapes characters such as "&" or "#"
    # that would otherwise terminate the parameter. ":", "+" and "%" stay
    # untouched so field prefixes ("ti:...") and already-encoded input behave
    # as before.
    query = quote_plus(str(search_term), safe=':+%')
    url = f'http://export.arxiv.org/api/query?search_query={query}&sortBy=relevance&start=0&max_results={max_results}'

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as data:
        return data.read().decode('utf-8')
    

In [None]:
# Folder passed to fetch_search_resuls(), which writes the per-entry JSON files
# and the downloaded source archives into it.
directory = "./content/"

# Query parameters for the arXiv API request. max_results is interpolated
# into the request URL as-is.
search_term = 'satellite IOT'
max_results = '3'

# Fetch the Atom feed for the query and show the raw XML.
xml_data = run_search(search_term,max_results)
print(xml_data)

In [None]:
import requests
import os

def download_latex(content, directory):
    """Download the LaTeX source archive for one arXiv entry.

    The archive is saved as ``<uid>.tar.gz`` inside ``directory``. ``uid`` is
    ``hash(content["url"])``, with negative values written as ``x<abs value>``.
    NOTE: Python randomizes ``str`` hashes per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so these names (and the duplicate check) are
    only stable within one kernel session.

    Args:
        content: Mapping with a ``"url"`` key holding the entry's abstract URL.
        directory: Folder the archive is written to; created if missing.

    Returns:
        None. Progress and failures are reported with ``print``; nothing is
        written when the download fails.
    """
    uid = hash(content["url"])
    if uid < 0:
        uid = "{p}{n}".format(p="x",n=abs(uid))

    filename = "{uid}.tar.gz".format(uid=uid)
    fullpath = os.path.join(directory,filename)

    # If hash(url) exists, then skip this
    if os.path.exists(fullpath):
        print("Skipping duplicate: {}".format(fullpath))
        return

    # Rewrite only the "/abs/" path segment; a bare "abs" -> "src" replace
    # would also change any other "abs" substring in the URL.
    latex_url = content["url"].replace("/abs/", "/src/", 1)
    latex_url = latex_url.replace("http://arxiv.org/", "http://export.arxiv.org/")

    # Fetch the archive. A timeout stops one stalled request from hanging the
    # notebook, and HTTP errors are detected so an error page is never saved
    # under a .tar.gz name.
    try:
        response = requests.get(latex_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Failed to download {}: {}".format(latex_url, exc))
        return

    # Make sure the destination exists before writing.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write the archive bytes; the context manager closes the file on error too.
    with open(fullpath, 'wb') as latex:
        latex.write(response.content)
    print("wrote: {}".format(filename))

In [None]:
from bs4 import BeautifulSoup
import json
import os

def fetch_search_resuls(xml_data, directory):
    """Parse an arXiv Atom feed, save each entry as JSON and fetch its source.

    For every ``<entry>`` a dictionary with title, date, primary category,
    authors, feed title, URL, abstract and metadata (DOI / id) is written to
    ``<directory>/<uid>.json`` as ``{"Content": [entry]}``. ``download_latex``
    is then called for the entry. ``uid`` is ``hash(url)``, with negative
    values written as ``x<abs value>``.
    NOTE: Python randomizes ``str`` hashes per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so file names are only stable within one
    kernel session.

    Args:
        xml_data: Atom feed text, as returned by ``run_search``.
        directory: Output folder; created if missing.

    Returns:
        None. Progress is reported with ``print``.
    """

    def _text_of(parent, tag_name):
        """Return the stripped text of the first ``tag_name`` child, or None."""
        node = parent.find(tag_name) if parent is not None else None
        return node.text.strip() if node is not None else None

    def _find_by_suffix(parent, suffix):
        """Find the first tag whose name ends with ``suffix``.

        The HTML parser keeps namespace prefixes in tag names
        (e.g. ``arxiv:doi``), hence the suffix match.
        """
        return parent.find(lambda tag: tag.name.endswith(suffix))

    def _rel_values(link):
        """Return the ``rel`` attribute of ``link`` as a list of tokens.

        With an HTML parser BeautifulSoup treats ``rel`` as multi-valued and
        returns a list; with an XML parser it is a plain string. Handle both.
        """
        rel = link.get('rel') or []
        return rel.split() if isinstance(rel, str) else list(rel)

    soup = BeautifulSoup(xml_data, "lxml")

    # Feed-level title, used as the "source" of every entry (None if absent,
    # e.g. when the response is not a valid feed).
    source = _text_of(soup.find('feed'), 'title')

    # Make sure the output folder exists before any file is written.
    if directory:
        os.makedirs(directory, exist_ok=True)

    for entry in soup.find_all('entry'):
        # Title of the entry
        title = _text_of(entry, 'title')

        # Publication date, truncated to 'YYYY-MM-DD'
        published = _text_of(entry, 'published')
        date = published[:10] if published else None

        # Primary category term
        primary_category = _find_by_suffix(entry, 'primary_category')
        type_term = primary_category.get('term') if primary_category is not None else None

        # Author names joined into a single string
        author_names = [
            name
            for name in (_text_of(author, 'name') for author in entry.find_all('author'))
            if name is not None
        ]
        author_str = ', '.join(author_names)

        # Entry ID (also the fallback URL)
        entry_id = _text_of(entry, 'id')

        # URL of the entry's HTML page: prefer the rel="alternate" text/html
        # link. 'rel' is compared as a token list; a plain string comparison
        # never matches under the "lxml" HTML parser.
        url = None
        for link in entry.find_all('link'):
            if 'alternate' in _rel_values(link) and link.get('type') == 'text/html':
                url = link.get('href')
                break
        if not url:
            url = entry_id
        if not url:
            # Without a URL there is nothing to hash or download.
            print("Skipping entry without a URL: {}".format(title))
            continue

        # Summary text of the entry
        text = _text_of(entry, 'summary')

        # DOI from the metadata, if present
        arxiv_doi = _find_by_suffix(entry, 'doi')
        doi = arxiv_doi.text.strip() if arxiv_doi is not None else None

        # Metadata dictionary including DOI and entry ID
        metadata = {}
        if doi:
            metadata['doi'] = doi
        if entry_id:
            metadata['id'] = entry_id

        # Compile all extracted data into a single dictionary (almost the same as Rockfish)
        # Note that Rockfish DOI used does not match the arxiv schema to form a valid doi.org link
        content = {
            'title': title,
            'date': date,
            'type': type_term,
            'author': author_str,
            'source': source,
            'url': url,
            'abstract': text,
            'metadata': metadata
        }

        # Generate a filename based on the url hash (matching Rockfish method)
        uid = hash(content['url'])
        if uid < 0:
            uid = "{p}{n}".format(p="x",n=abs(uid))

        filename = "{uid}.json".format(uid=uid)
        fullpath = os.path.join(directory,filename)

        # Write the data to a JSON file with indentation for readability
        if os.path.exists(fullpath):
            print("Skipping duplicate: {}".format(fullpath))
        else:
            Export = {'Content' : [content]}  ## Match Rockfish json format.
            with open(fullpath, 'w', encoding='utf-8') as json_file:
                json.dump(Export, json_file, ensure_ascii=False, indent=4)
            print("wrote: {}".format(filename))

        download_latex(content, directory)

In [None]:
# Write one JSON file per search result into `directory` and download each
# entry's source archive next to it.
fetch_search_resuls(xml_data, directory)

In [None]:
## Install python-magic before starting
## Also requires latexml: sudo apt-get install latexml

import tarfile
import magic
import os
import subprocess
from bs4 import BeautifulSoup
import shutil


def extract_arxiv_source(src_zip, dest_dir='./temp/'):
    """Unpack a downloaded arXiv source archive into its own folder.

    The output folder is ``<dest_dir>/<archive file name>``, with any ".tmp"
    removed from the name. ``convert_cleanup`` derives the same path from the
    archive name, so the naming must stay in sync.

    Args:
        src_zip: Path to the archive (any format ``tarfile.open`` recognizes).
        dest_dir: Parent folder for the extracted tree; created if missing.

    Returns:
        Path of the folder holding the extracted files, or None if the
        archive could not be extracted.
    """
    archive_path = src_zip.replace(".tmp", "")
    fn = archive_path.split('/')[-1]
    out_dir = os.path.join(dest_dir, fn)

    try:
        # makedirs also creates dest_dir itself, which a plain mkdir cannot do.
        os.makedirs(out_dir, exist_ok=True)

        with tarfile.open(src_zip) as tar:
            # The archive comes from the network: refuse members that would
            # land outside out_dir (absolute paths, "..", escaping links).
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=out_dir, filter="data")
            else:
                # Older Python without extraction filters: check paths by hand.
                root = os.path.realpath(out_dir)
                for member in tar.getmembers():
                    target = os.path.realpath(os.path.join(root, member.name))
                    if os.path.commonpath([root, target]) != root:
                        raise ValueError("unsafe path in archive: {}".format(member.name))
                tar.extractall(path=out_dir)
    except Exception as e:
        print("Error extracting file")
        print(str(e))
        return None

    return out_dir

def convert_arxiv_latex(latex_dir_in, html_dir_out):
    """Convert the main ``.tex`` file of an extracted source tree to HTML.

    Main-file selection among the ``.tex`` files directly in ``latex_dir_in``:
      1. if there is exactly one, use it;
      2. otherwise the first whose file name contains "main";
      3. otherwise the first whose text contains ``\\title``.

    The file is converted with the external ``latexmlc`` command into
    ``<html_dir_out>/<folder name>-html/main.html``.

    Args:
        latex_dir_in: Folder with the extracted LaTeX sources (None tolerated).
        html_dir_out: Parent folder for the ``-html`` output folder.

    Returns:
        Path of the generated HTML file, or None when no main file is found
        or ``latexmlc`` is missing or fails.
    """
    if not latex_dir_in or not os.path.isdir(latex_dir_in):
        print("Failed to find main tex file")
        return None

    new_dir = latex_dir_in.split('/')[-1]
    tex_files = [
        os.path.join(latex_dir_in, name)
        for name in os.listdir(latex_dir_in)
        if name.endswith(".tex")
    ]

    main_tex_file = None
    if len(tex_files) == 1:
        main_tex_file = tex_files[0]
    elif tex_files:
        # Test each file NAME for "main" (not the list of paths).
        named_main = [f for f in tex_files if 'main' in os.path.basename(f)]
        if named_main:
            main_tex_file = named_main[0]
        else:
            for f in tex_files:
                # Sources may use any encoding; undecodable bytes are
                # irrelevant for a plain substring probe.
                with open(f, encoding='utf-8', errors='ignore') as tex:
                    if '\\title' in tex.read():
                        main_tex_file = f
                        print(f)
                        break

    if main_tex_file is None:
        print("Failed to find main tex file")
        return None

    output_file = os.path.join(html_dir_out, new_dir + "-html/main.html")
    try:
        out = subprocess.run(["latexmlc", main_tex_file, "--destination="+output_file], capture_output=True, text=True)
    except OSError as exc:
        # Typically: latexmlc is not installed or not on PATH.
        print("Failed to run latexmlc: {}".format(exc))
        return None

    if out.returncode != 0:
        # If latexml fails on figures, ImageMagick's policy file may need
        # rights='read|write' for pattern='PS' and pattern='EPS'
        # (check with: cat /etc/ImageMagick-6/policy.xml | grep PS).
        # out.stderr holds the details.
        return None

    print("\n\n")
    return output_file

def convert_html_text(html_file_in):
    """Return the plain text of an HTML file.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``
    backend and reduced to its text via ``get_text()``.
    """
    with open(html_file_in, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    return BeautifulSoup(markup, 'html.parser').get_text()

def convert_cleanup(src_zip, dest_dir):
    """Delete the temporary folders created for one archive.

    Two folders under ``dest_dir`` are targeted: one named after the archive
    file and one with the same name plus ``-html``. A missing folder is
    reported, and the deletion attempt that follows reports its own failure.
    """
    archive_name = src_zip.split('/')[-1]
    directory_path = os.path.join(dest_dir, archive_name)
    directory_path_html = directory_path + "-html"
    targets = (directory_path, directory_path_html)

    # Report folders that are already absent.
    for target in targets:
        if not os.path.exists(target):
            print(f"The directory {target} does not exist.")

    # Confirmation is hard-wired to "yes"; an interactive prompt used to sit here.
    confirm = 'y'
    if confirm not in ['y', 'yes']:
        print("Deletion canceled.")
        return

    # Remove each folder tree, reporting success or the failure reason.
    for target in targets:
        try:
            shutil.rmtree(target)
            print(f"Directory '{target}' and all its contents have been deleted.")
        except Exception as e:
            print(f"Failed to delete directory '{target}'. Reason: {e}")




In [None]:
import os
import glob


# Run all of the functions needed to create the plain text
def text_from_latex_tar_gz(src_zip,temp_dir):
    """Turn a downloaded source archive into plain text.

    Runs extract -> LaTeX-to-HTML -> HTML-to-text and always removes the
    temporary folders afterwards. A later stage only runs when the previous
    one produced a result.

    Args:
        src_zip: Path to the source archive.
        temp_dir: Scratch folder for the extracted sources and generated HTML.

    Returns:
        The extracted plain text, or None if any stage failed.
    """
    print(src_zip)
    plain_text = None

    try:
        html_output_file = None
        try:
            extracted_latex_dir = extract_arxiv_source(src_zip, temp_dir)
            # Skip the conversion when extraction produced nothing.
            if extracted_latex_dir is not None:
                html_output_file = convert_arxiv_latex(extracted_latex_dir, temp_dir)
        except Exception as exc:
            # Best effort: report and continue with the next archive.
            print(f"LaTeX to HTML step raised: {exc}")

        if html_output_file is None:
            print("Failed to convert LaTeX to HTML")
        else:
            print("Converstion from LaTex to HTML Complete")
            try:
                plain_text = convert_html_text(html_output_file)
                print("Converstion from HTML to text Complete")
            except Exception as exc:
                print("Failed to convert HTML to text")
                print(str(exc))
                plain_text = None
    finally:
        # Temporary folders are removed even if a stage raised.
        convert_cleanup(src_zip, temp_dir)

    return plain_text

# Find files in a directory by extension
def find_files(directory, ext):
    """List the paths in ``directory`` matching ``*.<ext>``.

    Prints an error and returns an empty list when ``directory`` is not an
    existing folder.
    """
    if os.path.isdir(directory):
        pattern = os.path.join(directory, '*.' + ext)
        return glob.glob(pattern)

    print(f"Error: The directory '{directory}' does not exist.")
    return []

In [None]:
import json

# Append data into an existing JSON file
def append_text_to_arxiv_json(file_name, data_to_append):
    """Merge extra fields into the first ``Content`` record of a JSON file.

    The file is read and rewritten as UTF-8, matching how the JSON files are
    written with ``ensure_ascii=False``. If ``Content`` is missing or empty,
    it is created with ``data_to_append`` as its only record.

    Args:
        file_name: Path of the JSON file to update in place.
        data_to_append: Dict of fields to merge into the record.

    Returns:
        None. Success and failures are reported with ``print``; no exception
        is propagated.
    """
    try:
        # Explicit UTF-8: the locale default would mis-decode non-ASCII text
        # on platforms whose default encoding is not UTF-8.
        with open(file_name, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        records = json_data.get('Content')
        if records:
            records[0].update(data_to_append)
        else:
            # 'Content' missing or an empty list: start it with the new data
            json_data['Content'] = [data_to_append]

        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(json_data, file, ensure_ascii=False, indent=4)

        print(f"Successfully appended to the JSON file: {file_name}")

    except FileNotFoundError:
        print(f"Error: The file '{file_name}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: The file '{file_name}' is not a valid JSON file.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [None]:
# Scratch folder for extracted sources and generated HTML.
temp_dir = "./temp/"

# Every JSON file in `directory` is expected to have a sibling archive with
# the same stem and a .tar.gz extension.
json_files = find_files(directory,'json')

# Paths of the JSON files for which text was extracted and an append was attempted.
updated_files = []
for json_file in json_files:
    # Derive the archive path from the JSON path.
    tar_gz_file = json_file.replace(".json", ".tar.gz")

    # Extract -> LaTeX to HTML -> plain text; falsy when any stage failed.
    plain_text = text_from_latex_tar_gz(tar_gz_file, temp_dir)

    if plain_text:
        # Store the extracted body under the 'text' key of the entry.
        data_to_append = {'text': plain_text}
        append_text_to_arxiv_json(json_file, data_to_append)
        updated_files.append(json_file)
    else:
        print(f"Error: No plain text returned from conversion. JSON file not updated.")

In [None]:
# Show the JSON files for which plain text was extracted and an append was attempted.
updated_files

In [None]:
# Load one generated JSON file back and display it.
# NOTE(review): the file name is a hash(url) value. Python randomizes str
# hashes per interpreter process unless PYTHONHASHSEED is fixed, so this exact
# file presumably exists only for the session that created it. Confirm, or
# pick a path from `updated_files` instead.
file_name = './content/6847810983221028824.json'
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Last expression of the cell: display the parsed content.
json_data