# This notebook will scrape pages from https://paws.org.ph/adopt/

In [23]:
# access the URL via web scraping
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Download the PAWS adoption listing page.
url = "https://paws.org.ph/adopt/"
# A timeout keeps the cell from hanging forever if the site stops responding;
# raise_for_status() surfaces HTTP errors instead of parsing an error page
# and silently reporting zero pets.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# get all elements with class "shelter-animals"
pets = soup.find_all(class_="shelter-animals")
# A set is used so that a pet linked more than once is only visited once.
pets_links = set()
# get all the links inside pets
for pet in pets:
    for link in pet.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None (and some have an empty href);
        # neither is a page that can be requested later, so skip them.
        if href:
            pets_links.add(href)

print(pets_links)
print(len(pets_links), 'pets found.')

def _parse_pet_text(body_text):
    """Split the text of a pet page into its labelled fields.

    The text is expected to contain, in this order:
    "After" <name> "Age:" <age> "Sex:" <sex> <description> "Apply Now".

    Returns a dict with the keys 'name', 'age', 'sex' and 'description'
    (values are not stripped), or None when the "After", "Age:" or "Sex:"
    marker is missing or out of order.
    """
    start = body_text.find('After')
    # str.find returns -1 for a missing marker; slicing with -1 would quietly
    # produce garbage fields, so report the page as unparseable instead.
    if start == -1:
        return None
    # Look for the terminator only after the start marker, and fall back to
    # the end of the text when the page has no "Apply Now".
    end = body_text.find('Apply Now', start)
    if end == -1:
        end = len(body_text)
    # clean up whitespaces, newlines, and tabs
    text = ' '.join(body_text[start:end].split())

    age_at = text.find('Age:')
    sex_at = text.find('Sex:')
    if age_at == -1 or sex_at == -1 or sex_at < age_at:
        return None

    return {
        # text always starts with "After" here, so the name begins right after it
        'name': text[len('After'):age_at],
        'age': text[age_at + len('Age:'):sex_at],
        # Fixed-width slices: the two characters after "Sex:" are the sex and
        # the description starts one character later.
        # NOTE(review): this assumes a single-letter sex such as "Sex: F" -
        # confirm against the live pages.
        'sex': text[sex_at + 4:sex_at + 6],
        'description': text[sex_at + 7:],
    }


def _scrape_pet(pet_link):
    """Download one pet page and return its details dict, or None on failure.

    A failed request, a page without the expected content container, or a
    page whose text lacks the expected markers is reported with a printed
    message and skipped, so a single bad page does not abort the whole run.
    """
    try:
        response = requests.get(pet_link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('Skipping', pet_link, '- request failed:', exc)
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # the pet content lives under <div data-elementor-type="wp-post">
    body = soup.find('div', {'data-elementor-type': 'wp-post'})
    if body is None:
        print('Skipping', pet_link, '- no wp-post container found')
        return None

    # get all the images under body; <img> tags without a src are ignored
    pet_images = []
    for image in body.find_all('img'):
        src = image.get('src')
        if src:
            pet_images.append(src)

    fields = _parse_pet_text(body.get_text())
    if fields is None:
        print('Skipping', pet_link, '- expected text markers not found')
        return None

    print('Name:', fields['name'])
    print('Age:', fields['age'])
    print('Sex', fields['sex'])
    print('Desc:', fields['description'])
    return {
        'name': fields['name'].strip(),
        'age': fields['age'].strip(),
        'sex': fields['sex'].strip(),
        'description': fields['description'].strip(),
        'images': pet_images,
    }


pet_details = []
# get the details of each pet
for pet_link in pets_links:
    details = _scrape_pet(pet_link)
    if details is not None:
        pet_details.append(details)
    print('--------------------------------')
    # add delay of 1 sec between requests to be polite to the server
    time.sleep(1)

print(pet_details)
# save pet_details to a csv using pandas; naming the columns explicitly keeps
# the header row even when no pet could be scraped
df = pd.DataFrame(
    pet_details,
    columns=['name', 'age', 'sex', 'description', 'images'],
)
df.to_csv('pets.csv', index=False)
print('Data saved to pets.csv')


{'https://paws.org.ph/sasha/', 'https://paws.org.ph/whitey/', 'https://paws.org.ph/meldy/', 'https://paws.org.ph/haley/', 'https://paws.org.ph/lucy/', 'https://paws.org.ph/cow/', 'https://paws.org.ph/novy/', 'https://paws.org.ph/banoi/', 'https://paws.org.ph/tiki/', 'https://paws.org.ph/heath/', 'https://paws.org.ph/cornelia/', 'https://paws.org.ph/friendly/', 'https://paws.org.ph/skyler/', 'https://paws.org.ph/bridget-boy/', 'https://paws.org.ph/cleo/', 'https://paws.org.ph/foggy/', 'https://paws.org.ph/pumi/', 'https://paws.org.ph/sawyer/', 'https://paws.org.ph/post/', 'https://paws.org.ph/jl/', 'https://paws.org.ph/arya/', 'https://paws.org.ph/shadow/', 'https://paws.org.ph/laura/', 'https://paws.org.ph/tender/', 'https://paws.org.ph/princess-anna/', 'https://paws.org.ph/ig/', 'https://paws.org.ph/walter-white/', 'https://paws.org.ph/magdalena/', 'https://paws.org.ph/momo/', 'https://paws.org.ph/alice/', 'https://paws.org.ph/lot/', 'https://paws.org.ph/tala/', 'https://paws.org.ph/m