In [2]:
# Mount Google Drive at /content/drive; the cells below create folders and
# write their output files under this mount point.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
import os

# The job id doubles as the name of the root folder on Drive.
job_id = "936719006"
base_path = f"/content/drive/MyDrive/{job_id}"

years = ["2024", "2025"]
journal_names = ["Journal_Example_1", "Journal_Example_2"]  # placeholder journal names

# Create the <base_path>/<year>/<journal> folder tree on Drive.
# exist_ok=True makes the cell safe to re-run: existing folders are left as they are.
for year_path in (os.path.join(base_path, y) for y in years):
    os.makedirs(year_path, exist_ok=True)
    for journal_path in (os.path.join(year_path, j) for j in journal_names):
        os.makedirs(journal_path, exist_ok=True)


In [5]:
import requests

# Download one article PDF into the 2025 / Journal_Example_1 folder.
pdf_url = "https://revistas.udca.edu.co/index.php/ruadc/article/view/2951/3531"  # paste the PDF link here
pdf_path = f"{base_path}/2025/Journal_Example_1/sample.pdf"

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
r = requests.get(pdf_url, timeout=60)
# Fail loudly on 4xx/5xx instead of silently saving an error page as "sample.pdf".
r.raise_for_status()

# The link may resolve to an HTML viewer/landing page rather than the file itself,
# so check for the PDF header marker near the start of the payload and warn if it
# is missing. The file is still written so the response can be inspected.
if b"%PDF" not in r.content[:1024]:
    print(
        f"Warning: response from {pdf_url} does not look like a PDF "
        f"(Content-Type: {r.headers.get('Content-Type')})"
    )

with open(pdf_path, "wb") as f:
    f.write(r.content)

print(f"Downloaded: {pdf_path}")


Downloaded: /content/drive/MyDrive/936719006/2025/Journal_Example_1/sample.pdf


In [6]:
# Write the article metadata to an XML file stored next to the downloaded PDF.
import xml.etree.ElementTree as ET

# Flat mapping of tag name -> text value. "Authors" is the one nested entry:
# a list of dicts, each of which becomes an <Author> element.
metadata = {
    "ArticleTitle": "Sample Title",
    "Authors": [{"AuthorName": "John Doe", "Affiliation": ""}],
    "Keywords": "",
    "PDFName": "sample.pdf",
    "FileSize": str(os.path.getsize(pdf_path)),  # size of the downloaded file in bytes
    "PublicationYear": "2025",
    "Volume": "",
    "Issue": "",
    "SourceID": job_id,
    "ContentProvider": "",
    "DOI": "10.1234/example.doi",
    "PublisherItemType": "",
    "StartPage": "",
    "EndPage": "",
    "PageRange": "",
    "Abstract": "",
    "References": ""
}

root = ET.Element("ArticleMetadata")

for key, value in metadata.items():
    if key == "Authors":
        # <Authors><Author><AuthorName>…</AuthorName><Affiliation>…</Affiliation></Author>…</Authors>
        authors_elem = ET.SubElement(root, "Authors")
        for author in value:
            author_elem = ET.SubElement(authors_elem, "Author")
            for subkey, subval in author.items():
                ET.SubElement(author_elem, subkey).text = subval
    else:
        ET.SubElement(root, key).text = value

tree = ET.ElementTree(root)
xml_path = f"{base_path}/2025/Journal_Example_1/metadata.xml"
# Write UTF-8 with an explicit XML declaration. The default ("us-ascii", no
# declaration) would turn accented characters in titles or author names into
# numeric character references and leave the file without an encoding header.
tree.write(xml_path, encoding="utf-8", xml_declaration=True)

print(f"XML saved to {xml_path}")


XML saved to /content/drive/MyDrive/936719006/2025/Journal_Example_1/metadata.xml


In [7]:
# Render a minimal HTML page that shows the key metadata fields.
from html import escape

# Escape every interpolated value so that characters such as <, > or & in a
# title or author name cannot break (or inject) markup.
page_title = escape(metadata['ArticleTitle'])
author_names = escape(', '.join(a['AuthorName'] for a in metadata['Authors']))
doi = escape(metadata['DOI'])
publication_year = escape(metadata['PublicationYear'])

html_content = f"""
<html>
<head>
<meta charset="utf-8">
<title>{page_title}</title>
</head>
<body>
<h1>{page_title}</h1>
<p><b>Authors:</b> {author_names}</p>
<p><b>DOI:</b> {doi}</p>
<p><b>Publication Year:</b> {publication_year}</p>
</body>
</html>
"""

html_path = f"{base_path}/2025/Journal_Example_1/metadata.html"
# Write explicitly as UTF-8 so the bytes on disk match the declared charset
# regardless of the platform's default encoding.
with open(html_path, "w", encoding="utf-8") as f:
    f.write(html_content)

print(f"HTML saved to {html_path}")


HTML saved to /content/drive/MyDrive/936719006/2025/Journal_Example_1/metadata.html
