<a href="https://colab.research.google.com/github/SrijitaThakur/downloadpdf/blob/main/downloadpdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import requests
from weasyprint import HTML
from urllib.parse import urlparse
import concurrent.futures

# Destination folder for the generated PDFs; created up front, and an
# already-existing folder is not an error.
pdf_dir = '/content/sample_data/pdfs'
os.makedirs(pdf_dir, exist_ok=True)

# Load the spreadsheet of articles into the DataFrame the rest of the
# script reads from and writes back to.
csv_path = '/content/sample_data/Articles To Share With Pet Parents - Main Topic - Nutrition.csv'
df = pd.read_csv(csv_path)

def extract_last_word_from_url(url):
    """Return the last non-empty path segment of *url*.

    The result is used as a file name stem, so it should never be empty.
    Empty segments (from leading, trailing, or doubled slashes) are ignored.
    When the path contains no segments at all (e.g. ``https://example.com``
    or ``https://example.com/``), the host name is returned instead, and
    ``'index'`` if the URL has no host either.

    Args:
        url: The URL string to inspect.

    Returns:
        A non-empty string taken from the URL's path or host.
    """
    parsed_url = urlparse(url)
    # Dropping empty pieces avoids both the IndexError on a path-less URL
    # and an empty result for paths such as "/" or "/a//".
    segments = [part for part in parsed_url.path.split('/') if part]
    if segments:
        return segments[-1]
    return parsed_url.netloc or 'index'


def download_and_save_pdf(url, pdf_dir, timeout=30):
    """Fetch the page at *url* and render its HTML to a PDF in *pdf_dir*.

    Args:
        url: Address of the page to download.
        pdf_dir: Directory in which the PDF is written.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the calling thread indefinitely.

    Returns:
        The path of the written PDF, or ``None`` when the response status
        code is not 200.

    Exceptions raised by ``requests.get`` (including timeouts) or by the PDF
    rendering are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # Supplying a base URL lets relative links in the page (stylesheets,
    # images) be resolved against the address the content came from;
    # response.url is the final address after any redirects.
    html = HTML(string=response.text, base_url=response.url)

    filename = extract_last_word_from_url(url)
    pdf_filename = os.path.join(pdf_dir, f'{filename}.pdf')
    html.write_pdf(pdf_filename)

    return pdf_filename

# Worker submitted to the thread pool, one call per DataFrame row
def process_article(index, row):
    """Download one article as a PDF and record where it was saved.

    Reads the address from the row's ``'Article URL'`` field and prints it.
    When a PDF path comes back from ``download_and_save_pdf``, that path is
    stored in the ``'local_file'`` column of the module-level ``df`` at
    *index*. Nothing is returned.
    """
    article_url = row['Article URL']
    print(article_url)

    saved_path = download_and_save_pdf(article_url, pdf_dir)
    if not saved_path:
        # No PDF path came back; leave the row untouched.
        return

    # NOTE(review): this writes to the shared DataFrame from a worker
    # thread -- confirm that concurrent writes are acceptable here.
    df.at[index, 'local_file'] = saved_path

# Fan the rows out over a small pool of worker threads
max_workers = 5
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each submitted future back to the row it was created for so that
    # failures can be reported with the row's index.
    futures = {}
    for index, row in df.iterrows():
        futures[executor.submit(process_article, index, row)] = (index, row)

    # Handle futures in completion order; an exception raised inside a
    # worker resurfaces from result() and is printed rather than aborting
    # the remaining downloads.
    for future in concurrent.futures.as_completed(futures):
        index, row = futures[future]
        try:
            future.result()
        except Exception as exc:
            print(f"Article processing failed for index {index}: {exc}")

# Write the DataFrame, including any recorded PDF paths, to a new CSV file
df.to_csv('/content/sample_data/articles_pet_updated.csv', index=False)


https://thewildest.com/pet-nutrition/automatic-pet-feederhttps://thewildest.com/pet-nutrition/arya-sit-freeze-dried-treats
https://thewildest.com/pet-nutrition/what-chefs-feed-their-pets

https://thewildest.com/pet-nutrition/free-pet-food
https://thewildest.com/pet-nutrition/grain-free-dog-food-heart-disease


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


https://thewildest.com/pet-nutrition/microbiomes-dog-gut-health
https://thewildest.com/pet-nutrition/pet-food-label
https://thewildest.com/pet-nutrition/warming-cooling-diets




https://thewildest.com/pet-nutrition/sustainable-pet-food-fish




https://thewildest.com/pet-nutrition/dr-lindsey-wendt-holistic-pet-products




https://thewildest.com/pet-nutrition/antoni-jonathan-van-ness-yummers


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


https://thewildest.com/pet-nutrition/petaluma-dog-food
https://thewildest.com/pet-nutrition/sustainable-pet-food




https://thewildest.com/dog-nutrition/dog-not-eating
https://thewildest.com/dog-nutrition/weight-management-made-simple




https://thewildest.com/dog-nutrition/diy-probiotics




In [1]:
!pip install weasyprint

Collecting weasyprint
  Downloading weasyprint-60.1-py3-none-any.whl (268 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/268.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m174.1/268.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.7/268.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydyf>=0.8.0 (from weasyprint)
  Downloading pydyf-0.8.0-py3-none-any.whl (7.5 kB)
Collecting cssselect2>=0.1 (from weasyprint)
  Downloading cssselect2-0.7.0-py3-none-any.whl (15 kB)
Collecting Pyphen>=0.9.1 (from weasyprint)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting zopfli>=0.1.4 (from fonttools[woff]>=4.0.0->weasyprint)
  Downloading zopfli-0.2.3-cp310-cp310-manylinux_2_12_x86_64.ma