<a href="https://colab.research.google.com/github/Pradham1/Pradham1/blob/main/ProjectWebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from .ad import Ad
from .ad import AdParser
from .ad import fetch_ad

from .search import Search
from .search import SearchParser
from .search import fetch_search

from .utils import get_us_cities

# Version string exposed at package level.
__version__ = "1.1.2"

In [None]:
from bs4 import BeautifulSoup
import requests
import re

from typing import Optional
from typing import Union
from typing import List
from typing import Dict

from .utils import format_price


class Ad:
    def __init__(
        self,
        url: str,
        price: Optional[float] = None,
        title: Optional[str] = None,
        d_pid: Optional[int] = None,
        description: Optional[str] = None,
        attributes: Optional[Dict] = None,
        image_urls: Optional[List[str]] = None
    ) -> None:
        """An abstraction for a Craigslist 'Ad'. At the bare minimum you need a
        `url` to define an ad. Although, at search-time, information such as the
        price, title, and d_pid can additionally be supplied. Anything that is
        not provided stays `None` until the user fetches the ad information
        with `ad.fetch()`.

        """
        self.url = url
        self.price = price
        self.title = title
        self.d_pid = d_pid
        self.description = description
        self.attributes = attributes
        self.image_urls = image_urls

        # Both of these are only filled in by `fetch()`. They are defined here
        # so that reading them on an ad that has not been fetched yet gives
        # `None` instead of raising an AttributeError.
        self.metadata = None
        self.request = None

    def __repr__(self) -> str:
        """Short summary; falls back to the bare url if title or price is unknown."""
        if (self.title is None) or (self.price is None):
            return f"< {self.url} >"

        return f"< {self.title} (${self.price}): {self.url} >"

    def fetch(self, **kwargs) -> int:
        """Fetch additional data from the url of the ad.

        All keyword arguments are forwarded to `requests.get`. The response is
        kept on `self.request`. Only when the status code is 200 is the page
        parsed and every field of the ad (including `metadata`) overwritten
        with the parsed values.

        Returns the HTTP status code of the response.
        """
        self.request = requests.get(self.url, **kwargs)
        if self.request.status_code == 200:
            parser = AdParser(self.request.content)
            self.price = parser.price
            self.title = parser.title
            self.d_pid = parser.d_pid
            self.description = parser.description
            self.attributes = parser.attributes
            self.image_urls = parser.image_urls
            self.metadata = parser.metadata

        return self.request.status_code

    def to_dict(self) -> Dict:
        """Return the ad's fields as a plain dict.

        `request` and `metadata` are intentionally left out of the result.
        """
        return {
            "url": self.url,
            "price": self.price,
            "title": self.title,
            "d_pid": self.d_pid,
            "description": self.description,
            "image_urls": self.image_urls,
            "attributes": self.attributes,
        }


def fetch_ad(url: str, **kwargs) -> Ad:
    """Create an `Ad` for `url`, fetch it once, and return it.

    Keyword arguments are passed straight through to `Ad.fetch()`. The status
    code returned by `fetch()` is not inspected here.
    """
    fetched = Ad(url)
    fetched.fetch(**kwargs)
    return fetched


class AdParser:
    def __init__(self, content: Union[str, bytes], **kwargs) -> None:
        """Parse the html of a single ad page.

        `content` is handed to BeautifulSoup with the "html.parser" backend;
        extra keyword arguments are forwarded to BeautifulSoup as well.
        """
        self.soup = BeautifulSoup(content, "html.parser", **kwargs)

        # Remove QR text. This is important when parsing the description.
        for qr in self.soup.find_all("p", class_ = "print-qrcode-label"):
            qr.decompose()

    @property
    def url(self) -> Optional[str]:
        """Content of the `og:url` meta tag, or None if the tag is absent."""
        element = self.soup.find("meta", property = "og:url")
        if element is None:
            return None
        return element.get("content")

    @property
    def price(self) -> Optional[float]:
        """Price from the first `span.price`, or None if there is no such span."""
        element = self.soup.find("span", class_ = "price")
        if element is None:
            return None
        return format_price(element.text)

    @property
    def title(self) -> Optional[str]:
        """Text of `span#titletextonly`, or None if the span is absent."""
        element = self.soup.find("span", id = "titletextonly")
        if element is None:
            return None
        return element.text

    @property
    def d_pid(self) -> Optional[int]:
        """Numeric id taken from the `/<digits>.html` part of the url.

        None when the page has no url or the url does not contain that pattern.
        """
        url = self.url
        if url is None:
            return None

        match = re.search(r"/(\d+)\.html", url)
        if match is None:
            return None
        return int(match.group(1))

    @property
    def description(self) -> Optional[str]:
        """Text of `section#postingbody`, or None if the section is absent."""
        element = self.soup.find("section", id = "postingbody")
        if element is None:
            return None
        return element.text

    @property
    def attributes(self) -> Dict:
        """Key/value pairs collected from the spans inside every `p.attrgroup`."""
        attrs = {}
        for attr_group in self.soup.find_all("p", class_ = "attrgroup"):
            for attr in attr_group.find_all("span"):
                kv = attr.text.split(": ")

                # Add the attribute if and only if it's a key value attribute.
                if len(kv) == 2:
                    attrs[kv[0]] = kv[1]

        return attrs

    @property
    def image_urls(self) -> List[str]:
        """The `href` of every `a.thumb` element on the page."""
        return [a.get("href") for a in self.soup.find_all("a", class_ = "thumb")]

    @property
    def metadata(self) -> List[BeautifulSoup]:
        """Every `meta` element found on the page."""
        return self.soup.find_all("meta")



In [None]:
from bs4 import BeautifulSoup
import requests
import re

from typing import Union
from typing import List
from typing import Dict

from .ad import Ad
from .utils import format_price
from .utils import build_url


class Search:
    def __init__(self, query: str, city: str, category: str = "sss") -> None:
        """An abstraction for a Craigslist 'Search'. Similar to the 'Ad' this is
        also lazy and follows the same layout with the `fetch()` and `to_dict()`
        methods.

        The search url is built immediately from `query`, `city` and
        `category`; no request is made until `fetch()` is called.

        """
        self.query = query
        self.city = city
        self.category = category

        self.url = build_url(self.query, self.city, self.category)
        self.ads: List[Ad] = []

        # Only filled in by `fetch()`. Defined here so that reading it before
        # a fetch gives `None` instead of raising an AttributeError.
        self.request = None

    def fetch(self, **kwargs) -> int:
        """Request the search url and parse the ads out of the response.

        All keyword arguments are forwarded to `requests.get`. The response is
        kept on `self.request`. `self.ads` is replaced only when the status
        code is 200.

        Returns the HTTP status code of the response.
        """
        self.request = requests.get(self.url, **kwargs)
        if self.request.status_code == 200:
            parser = SearchParser(self.request.content)
            self.ads = parser.ads

        return self.request.status_code

    def to_dict(self) -> Dict:
        """Return the search parameters, url and every ad (as a dict)."""
        return {
            "query": self.query,
            "city": self.city,
            "category": self.category,
            "url": self.url,
            "ads": [ad.to_dict() for ad in self.ads]
        }


def fetch_search(query: str, city: str, category: str = "sss", **kwargs) -> Search:
    """Create a `Search`, fetch it once, and return it.

    Extra keyword arguments are passed straight through to `Search.fetch()`.
    The status code returned by `fetch()` is not inspected here.
    """
    result = Search(query, city, category)
    result.fetch(**kwargs)
    return result


class SearchParser:
    def __init__(self, content: Union[str, bytes], **kwargs) -> None:
        """Parse the html of a search results page.

        `content` is handed to BeautifulSoup with the "html.parser" backend;
        extra keyword arguments are forwarded to BeautifulSoup as well.
        """
        self.soup = BeautifulSoup(content, "html.parser", **kwargs)

    @property
    def ads(self) -> List[Ad]:
        """Build one `Ad` per `li.cl-static-search-result` element.

        A result without a link is skipped because an `Ad` cannot be defined
        without a url. A missing title, a missing price, or a url without the
        `/<digits>.html` pattern leaves the corresponding field as `None`
        rather than aborting the whole parse.
        """
        ads = []
        for ad_html in self.soup.find_all("li", class_ = "cl-static-search-result"):
            link = ad_html.find("a")
            url = None if link is None else link.get("href")
            if url is None:
                continue

            title_element = ad_html.find(class_ = "title")
            title = None if title_element is None else title_element.text

            # Not every listing carries a price element.
            price_element = ad_html.find(class_ = "price")
            price = None if price_element is None else format_price(price_element.text)

            pid_match = re.search(r"/(\d+)\.html", url)
            d_pid = None if pid_match is None else int(pid_match.group(1))

            ads.append(
                Ad(
                    url = url,
                    title = title,
                    price = price,
                    d_pid = d_pid
                )
            )

        return ads



In [None]:
from urllib.parse import quote
import json
import csv
import os

from typing import List
from typing import Dict

# Get the directory of the current file, agnostic of library location.
cs_dir = os.path.dirname(os.path.abspath(__file__))


def get_us_cities() -> List[str]:
    """Return the first column of every row in `data/us_cities.csv`.

    The file is looked up relative to the directory of this module (`cs_dir`).
    Blank lines in the file are ignored.
    """
    path = os.path.join(cs_dir, "data/us_cities.csv")
    # The csv module asks for files to be opened with newline="" so that it
    # can handle line endings (including ones inside quoted fields) itself.
    with open(path, "r", newline = "") as file:
        reader = csv.reader(file)
        # A blank line is yielded as an empty row, which has no first column.
        cities = [row[0] for row in reader if row]
    return cities


def format_price(price: str) -> float:
    """Convert a price string such as "$1,200" to a float.

    Every `$` and `,` character is dropped before the conversion.
    """
    cleaned = "".join(ch for ch in price if ch not in "$,")
    return float(cleaned)


def build_url(query: str, city: str, category: str = "sss") -> str:
    """Assemble the Craigslist search url for a query, city and category.

    `city` becomes the subdomain, `category` the last path segment, and
    `query` is percent-encoded with `urllib.parse.quote` before being placed
    in the `query` parameter.
    """
    host = f"https://{city}.craigslist.org"
    encoded_query = quote(query)
    return f"{host}/search/{category}?query={encoded_query}"


def get_areas() -> List[Dict]:
    """Load `data/areas.json` (relative to `cs_dir`) and return its parsed contents."""
    areas_path = os.path.join(cs_dir, "data/areas.json")
    with open(areas_path, "r") as handle:
        return json.load(handle)


def get_categories() -> List[Dict]:
    """Load `data/categories.json` (relative to `cs_dir`) and return its parsed contents."""
    categories_path = os.path.join(cs_dir, "data/categories.json")
    with open(categories_path, "r") as handle:
        return json.load(handle)



In [9]:
!pip install craigslistscraper



In [10]:
import craigslistscraper as cs
import json

# Describe the search up front. Nothing is downloaded at this point because
# the library does its work lazily.
search = cs.Search(query="bmw e46", city="minneapolis", category="cto")

# Download the search page and stop unless the server answered with 200.
status = search.fetch()
if status != 200:
    raise Exception(f"Unable to fetch search with status <{status}>.")

print(f"{len(search.ads)} ads found!")
for ad in search.ads:
    # Pull the full details for this ad and check the status again.
    status = ad.fetch()
    if status == 200:
        # to_dict() gives a plain dict; json.dumps is only for pretty printing.
        data = ad.to_dict()
        print(json.dumps(data, indent=4))
    else:
        print(f"Unable to fetch ad '{ad.title}' with status <{status}>.")



0 ads found!
