# In [5]:
import re
from datetime import datetime
from urllib.parse import urlencode

import feedparser
from docx import Document

# Basic LaTeX to plain text converter
def clean_latex(text):
    # Replace common LaTeX math symbols with Unicode
    latex_to_unicode = {
        r'\alpha': 'α',
        r'\beta': 'β',
        r'\gamma': 'γ',
        r'\delta': 'δ',
        r'\epsilon': 'ε',
        r'\pi': 'π',
        r'\mu': 'μ',
        r'\sigma': 'σ',
        r'\lambda': 'λ',
        r'\rightarrow': '→',
        r'\infty': '∞',
        r'\approx': '≈',
        r'\leq': '≤',
        r'\geq': '≥'
    }
    for latex, uni in latex_to_unicode.items():
        text = text.replace(latex, uni)

    # Remove math mode symbols like $, \[ \], \( \)
    text = re.sub(r'\$+', '', text)
    text = re.sub(r'\\\[|\\\]|\\\(|\\\)', '', text)

    # Remove LaTeX commands like \cite{}, \ref{}
    text = re.sub(r'\\(cite|ref|eqref|emph|textbf|textit)\{.*?\}', '', text)

    # Remove remaining LaTeX commands
    text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^\}]*\})?', '', text)

    # Remove double spaces
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

def search_arxiv_to_word_sorted_cleaned(query, max_results):
    heading = query
    filename = query+".docx"
    query="+".join(query.split())
    base_url = 'http://export.arxiv.org/api/query?'
    query_url = f'search_query=all:{query}&start=0&max_results={max_results}'
    url = base_url + query_url
    
    feed = feedparser.parse(url)

    entries = sorted(
        feed.entries,
        key=lambda entry: datetime.strptime(entry.published, '%Y-%m-%dT%H:%M:%SZ'),
        reverse=True
    )

    doc = Document()
    doc.add_heading(f'arXiv Results for: {heading}', 0)

    for i, entry in enumerate(entries, 1):
        doc.add_heading(f'{i}. {entry.title}', level=1)
        authors = ', '.join(author.name for author in entry.authors)
        cleaned_summary = clean_latex(entry.summary)

        doc.add_paragraph(f"Authors: {authors}")
        doc.add_paragraph(f"Published: {entry.published}")
        doc.add_heading(f'Abstract', level=2)
        doc.add_paragraph(cleaned_summary)
        doc.add_paragraph(f"PDF Link: {entry.link}")
        doc.add_paragraph("-" * 50)

    doc.save(filename)
    print(f"✅ Saved {len(entries)} results to '{filename}', with LaTeX cleaned.")

# Example usage
search_arxiv_to_word_sorted_cleaned("delay doppler domain", 100)
