In [28]:
import requests
from bs4 import BeautifulSoup
import re
import time
import os
from pathlib import Path

In [53]:
# Fetch and parse the first page of the use-of-force review catalog.
uof_response = requests.get(
    "https://sbcountyda.org/categories/news-releases/use-of-force-reviews/page/1",
    timeout=30,  # never hang the kernel on a stalled connection
)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
uof_response.raise_for_status()
uof_page = uof_response.content
# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed (and to avoid GuessedAtParserWarning).
soup = BeautifulSoup(uof_page, "html.parser")

In [15]:
def _get_uof_catalog_pages():
    """
    Build the list of paginated catalog URLs for the "Use of Force Reviews"
    category on the San Bernardino site (sbcountyda.org).

    Fetches the category landing page, reads the total result count from the
    "post-results" element at the top of the page ("<numeric> results"), and
    derives how many catalog pages exist given that 10 links are shown per page.

    Returns:
        list[str]: One URL per catalog page, numbered from 1.

    Raises:
        SystemExit: If the landing page cannot be fetched (network error or
            HTTP error status).
        IndexError: If the page has no "post-results" element.
        ValueError: If the "post-results" text contains no number.
    """
    results_per_page = 10  # the site lists 10 review links per catalog page

    try:
        response = requests.get(
            "https://sbcountyda.org/categories/news-releases/use-of-force-reviews/",
            timeout=30,
        )
        # Treat 4xx/5xx as a failed fetch rather than parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    soup = BeautifulSoup(response.content, "html.parser")

    result_divs = soup.find_all("div", {"class": "post-results"})
    if not result_divs:
        raise IndexError('no "post-results" element found on the catalog page')

    # r"\d+" (not "\d*"): the match must contain at least one digit, otherwise
    # leading whitespace would yield an empty match and int("") would fail.
    count_match = re.search(r"\d+", result_divs[0].text)
    if count_match is None:
        raise ValueError(
            'no result count found in "post-results" text: {!r}'.format(
                result_divs[0].text
            )
        )
    total_results = int(count_match.group())

    # Ceiling division: e.g. 57 results -> 6 pages, 60 results -> 6 pages.
    # (The remainder operator would give 7 and 0 respectively.)
    number_of_pages = (total_results + results_per_page - 1) // results_per_page

    # Generate the list of catalog pages we need to visit for review links.
    base_pages = [
        "https://sbcountyda.org/categories/news-releases/use-of-force-reviews/page/{}/".format(
            page_number
        )
        for page_number in range(1, number_of_pages + 1)
    ]
    return base_pages


def _get_uof_review_links():
    """
    Collect the URL of every individual use-of-force (UOF) review page.

    Each UOF instance documented on the SB website has its own page, linked
    from the paginated catalog. This walks every catalog page returned by
    _get_uof_catalog_pages() and gathers the link found in each entry title.
    Those pages in turn link to the PDF of the court doc and police report.

    Each catalog page is requested up to 3 times before giving up.

    Returns:
        list[str]: Review page URLs, in catalog order.

    Raises:
        requests.exceptions.RequestException: If a catalog page still cannot
            be fetched on the final attempt.
    """
    catalog_pages = _get_uof_catalog_pages()
    uof_links = []
    tries = 3

    for cat in catalog_pages:
        for try_value in range(tries):
            try:
                response = requests.get(cat, timeout=30)
                # Count HTTP error statuses as failed attempts too.
                response.raise_for_status()
            except requests.exceptions.RequestException:
                # Back off briefly, then retry unless this was the last attempt.
                time.sleep(1)
                if try_value < tries - 1:  # try_value is zero indexed
                    continue
                raise

            # Parsing happens only after a successful fetch, so a retry can
            # never append the same page's links twice.
            soup = BeautifulSoup(response.content, "html.parser")
            for heading in soup.find_all(
                "h2", {"class": "entry-title bolt-highlight-font"}
            ):
                # Read the href from the parsed tag instead of regex-matching
                # the serialized HTML; headings without a link are skipped.
                anchor = heading.find("a", href=True)
                if anchor is not None:
                    uof_links.append(anchor["href"])

            # Courtesy sleep timer
            time.sleep(1)
            break

    return uof_links


# TODO: Write a function that grabs the PDFs from each individual use-of-force case page. Not all pages format their PDF links the same way, so we need to search/grep for them.


In [16]:
# Gather the URL of every individual use-of-force review page.
# Later cells index into `rat` (e.g. rat[0]) to inspect a single review page.
rat = _get_uof_review_links()

In [39]:
# Collect the distinct PDF links found on the first use-of-force review page.
html_response = requests.get(rat[0], timeout=30)
# Fail loudly on 4xx/5xx instead of scraping an error page.
html_response.raise_for_status()
html = html_response.content
soup = BeautifulSoup(html, "html.parser")
# NOTE(review): `current_link` is not read by any visible cell; kept in case a
# cell outside this view depends on it.
current_link = ""
# href=True skips anchors that have no href attribute (these would otherwise
# yield None and crash on .endswith). The suffix check is case-insensitive so
# links ending in "PDF" are picked up as well. A set keeps each link once.
pdf_links = {
    anchor["href"]
    for anchor in soup.find_all("a", href=True)
    if anchor["href"].lower().endswith("pdf")
}

In [40]:
# Show the collected set of PDF URLs as the cell output.
pdf_links

{'https://sbcountyda.org/wp-content/uploads/sites/42/2023/01/Public-Release-Memo-FINAL-Brandon-Rocky.pdf'}

In [32]:
# Print the href of the first anchor on the first review page.
html_response = requests.get(rat[0], timeout=30)
# Fail loudly on 4xx/5xx instead of inspecting an error page.
html_response.raise_for_status()
html = html_response.content
# Explicit parser: keeps the parse independent of which optional parsers are
# installed and avoids GuessedAtParserWarning.
soup = BeautifulSoup(html, "html.parser")
print(soup.find_all("a")[0].get("href"))

#primary


In [41]:
# Download every PDF in `pdf_links` into ./data/, retrying failed requests.

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs("./data/", exist_ok=True)

# TODO: `pdf_links` currently holds only the PDFs found on rat[0]; extend this
# to cover the PDFs from every link in the UOF link list.
for link in pdf_links:
    # We try to request the uof document up to 3 times
    tries = 3
    for try_value in range(tries):
        try:
            response = requests.get(link, timeout=30)
            # Count HTTP error statuses as failures so an error page is never
            # saved to disk under a .pdf name.
            response.raise_for_status()
        except requests.exceptions.RequestException:
            time.sleep(1)
            if try_value < tries - 1:  # try_value is zero indexed
                continue
            raise

        # Open the file only after the download succeeded, so a failed request
        # cannot leave an empty or truncated file behind.
        with open("./data/{}".format(Path(link).name), "wb") as f:
            f.write(response.content)

        # Courtesy sleep timer
        time.sleep(1)
        break
