In [56]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import html
from tqdm import tqdm
from quantulum3 import parser
import pandas as pd

In [74]:
def link2soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        built-in ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were the requested content.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of scraping an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def norm_spaces(s: str) -> str:
    """Decode HTML entities and collapse all whitespace runs to single spaces.

    Args:
        s: Raw text, possibly containing HTML entities, non-breaking spaces,
            tabs and newlines.

    Returns:
        The text with entities decoded, non-breaking spaces turned into
        regular spaces, space/tab runs and line breaks collapsed to one space,
        and leading/trailing whitespace removed.
    """
    # Unescape first: "&nbsp;" decodes to "\xa0", which must then be replaced.
    # Doing the replacement before decoding would let it slip through.
    s = html.unescape(s)
    s = s.replace("\xa0", " ")
    s = re.sub(r"[ \t]+", " ", s)
    # Join lines, swallowing any whitespace that surrounds the line break.
    s = re.sub(r"\s*\n\s*", " ", s)
    return s.strip()

def absolutize(BASE: str, u: str) -> str:
    """Resolve the link ``u`` against ``BASE`` via ``urljoin`` and return it."""
    resolved = urljoin(BASE, u)
    return resolved

def clean_name(raw: str) -> str:
    """Turn the raw text of a card title link into a tidy card name.

    Args:
        raw: Text extracted from the title element.

    Returns:
        The name with the "Links to product page" helper text and the
        ®/™/℠ marks removed and whitespace collapsed to single spaces.
    """
    s = norm_spaces(raw)
    # Operate on the normalised text; using ``raw`` here would throw away
    # the entity decoding done by norm_spaces.
    s = re.sub(r"\bLinks to product page\b", "", s, flags=re.I)
    s = re.sub(r"[®™℠]", "", s)
    # Removing the fragments above can leave double spaces behind.
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"\bCredit\s+Card\b", "Credit Card", s)
    return s

def clean_welcome_text(txt: str) -> str:
    """Normalise welcome-offer text and drop repeated sentences.

    The text is whitespace-normalised, the words "strikethrough" /
    "strike through" are removed (case-insensitively), and the result is split
    into sentences so that duplicates can be discarded while keeping the
    first occurrence of each in its original position.
    """
    cleaned = norm_spaces(txt)
    for marker in (r"\bstrikethrough\b", r"\bstrike\s*through\b"):
        cleaned = re.sub(marker, "", cleaned, flags=re.I)

    # Sentence boundary: terminal punctuation, whitespace, then a capital
    # letter or digit starting the next sentence.
    parts = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", cleaned)
    # dict.fromkeys keeps insertion order, so this de-duplicates stably.
    unique_parts = list(dict.fromkeys(parts))
    rejoined = " ".join(unique_parts)
    return re.sub(r"\s+", " ", rejoined).strip()

def clean_apr(txt: str) -> dict:
    """Extract APR quantities from the purchase-APR text of a card.

    The text is normalised, boilerplate fragments are removed, range dashes
    are replaced by sentence breaks, and the remainder is handed to
    ``parser.parse``. The parsed quantities are then assigned by position.

    Args:
        txt: Raw text of the purchase-APR paragraph.

    Returns:
        A dict with the keys ``apr_lower_bound``, ``apr_upper_bound``,
        ``welcome_apr`` and ``welcome_time``. Values are the quantity objects
        returned by ``parser.parse`` or ``None`` when not present. When only a
        single regular rate is found, it is used for both bounds. All values
        are ``None`` when nothing could be parsed.
    """
    t = norm_spaces(txt)
    t = re.sub(r"Min\.?\s*of\s*\([^)]*\)\s*and\s*\d+(?:\.\d+)?", "", t)
    t = t.replace("Opens pricing and terms in new window", "")
    t = t.replace("†", "")
    # Replace the dash of a "low - high" range with a sentence break so the
    # two bounds are parsed as separate quantities.
    p = re.sub(r"\s*[–—−-]\s*", ". ", t)
    parsed = parser.parse(p)
    results = {
        "apr_lower_bound" : None,
        "apr_upper_bound" : None,
        "welcome_apr" : None,
        "welcome_time" : None
    }
    if not parsed:
        # Nothing recognisable in the text; avoid IndexError on parsed[0].
        return results

    rates = parsed
    # A leading 0 quantity is treated as an introductory rate followed by its
    # duration, but only when at least one regular rate follows them.
    if parsed[0].value == 0 and len(parsed) >= 3:
        results["welcome_apr"] = parsed[0]
        results["welcome_time"] = parsed[1]
        rates = parsed[2:]

    results["apr_lower_bound"] = rates[0]
    results["apr_upper_bound"] = rates[1] if len(rates) > 1 else rates[0]
    return results

def clean_annual_fee(txt: str) -> dict:
    """Extract annual-fee quantities from the annual-fee text of a card.

    The text is normalised, boilerplate is removed, the word "first" is
    rewritten as "1" so a duration such as "first year" parses as a quantity,
    and the remainder is handed to ``parser.parse``.

    Args:
        txt: Raw text of the annual-fee paragraph.

    Returns:
        A dict with the keys ``base_annual_fee``, ``authorized_user``,
        ``welcome_annual_fee`` and ``welcome_time``. Values are the quantity
        objects returned by ``parser.parse`` or ``None`` when not present.
        With two currency amounts, a larger first amount is read as
        "base fee, then authorized-user fee"; otherwise as
        "introductory fee, then base fee".
    """
    t = norm_spaces(txt)
    t = t.replace("Opens pricing and terms in new window", "")
    t = re.sub(r"applied to first billing statement\.?", "", t)
    t = re.sub(r"\bfirst\b", "1", t, flags=re.IGNORECASE)
    t = t.replace("†", "")
    # Drop sentence punctuation but keep decimal points between digits, so an
    # amount like "$95.50" is not turned into "$9550".
    t = re.sub(r"(?<!\d)\.|\.(?!\d)", "", t).strip()
    parsed = parser.parse(t)
    dollar_amounts = [q for q in parsed if q.unit.entity.name == "currency"]
    other_quantities = [q for q in parsed if q.unit.entity.name != "currency"]
    results = {
        "base_annual_fee": None,
        "authorized_user": None,
        "welcome_annual_fee": None,
        "welcome_time": None
    }
    if len(dollar_amounts) > 1:
        first, second = dollar_amounts[0], dollar_amounts[1]
        if first.value > second.value:
            results["base_annual_fee"] = first
            results["authorized_user"] = second
        else:
            results["base_annual_fee"] = second
            results["welcome_annual_fee"] = first
            # The duration of the introductory fee is the first quantity that
            # is not a currency amount, if there is one.
            results["welcome_time"] = other_quantities[0] if other_quantities else None
    elif dollar_amounts:
        results["base_annual_fee"] = dollar_amounts[0]
    elif parsed:
        # No currency amount recognised; fall back to the first quantity.
        results["base_annual_fee"] = parsed[0]
    return results

In [75]:
# Scrape the Chase "all credit cards" listing into a list of per-card dicts.
BASE = "https://creditcards.chase.com"
LINK = "https://creditcards.chase.com/all-credit-cards?CELL=6TKX"

soup = link2soup(LINK)

card_containers = soup.select("div.cmp-cardsummary__inner-container")

card_data = []

for card in tqdm(card_containers):
    # Title link: its text is the card name, its href the product page.
    title_container = card.select("div.cmp-cardsummary__inner-container__title h2 a")[0]
    name = clean_name(title_container.get_text().strip())
    url = absolutize(BASE, title_container.get("href"))

    summary_container = card.select("div.cmp-cardsummary__inner-container--summary")[0]

    # select_one returns None when a card has no offer paragraph; record that
    # as a missing offer instead of failing on None.select(...).
    offer_block = summary_container.select_one("div.cmp-cardsummary__inner-container--card-member-offer p")
    if offer_block is None:
        welcome_offer = None
    else:
        # Remove struck-through elements before reading the offer text.
        for el in offer_block.select("s, strike, .strikeThrough, [style*='line-through']"):
            el.decompose()
        welcome_offer = clean_welcome_text(offer_block.get_text(" ", strip=True))

    apr = clean_apr(summary_container.select("div.cmp-cardsummary__inner-container--purchase-apr p")[0].get_text())
    annual_fee = clean_annual_fee(summary_container.select("div.cmp-cardsummary__inner-container--annual-fee p")[0].get_text())

    card_scheme = {
        "name" : name,
        "url" : url,
        "welcome_offer" : welcome_offer,
        "apr" : apr,
        "annual_fee" : annual_fee
    }
    card_data.append(card_scheme)

100%|█████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 142.01it/s]


In [76]:
# card_data

In [86]:
def make_df(col, data):
    """Build a DataFrame from one nested dict per card, labelled by card name.

    Args:
        col: Key of the nested dict to extract from every card
            (for example ``"apr"`` or ``"annual_fee"``).
        data: Iterable of card dicts, each holding a ``"name"`` entry and a
            dict under ``col``.

    Returns:
        A ``pandas.DataFrame`` with one row per card: the columns of the
        nested dict followed by a ``name`` column.
    """
    # Build fresh row dicts so the nested dicts inside ``data`` are not
    # modified by adding a "name" key to them.
    rows = [{**card[col], "name": card["name"]} for card in data]
    return pd.DataFrame(rows)

In [87]:
# Display the parsed annual-fee fields of every scraped card as a table.
make_df("annual_fee", card_data)

Unnamed: 0,base_annual_fee,authorized_user,welcome_annual_fee,welcome_time,name
0,"seven hundred and ninety-five dollars, zero cents","one hundred and ninety-five dollars, zero cents",,,The New Chase Sapphire Reserve Credit Card
1,"ninety-five dollars, zero cents",,,,Chase Sapphire Preferred Credit Card
2,"zero dollars, zero cents",,,,Chase Freedom Unlimited Credit Card
3,"zero dollars, zero cents",,,,Chase Freedom Flex Credit Card
4,"zero dollars, zero cents",,,,Chase Freedom Rise Credit Card
5,"zero dollars, zero cents",,,,Slate Edge Credit Card
6,"one hundred and fifty dollars, zero cents",,"zero dollars, zero cents",one year,The New UnitedSM Explorer Card
7,"three hundred and fifty dollars, zero cents",,,,The New United QuestSM Card
8,"zero dollars, zero cents",,,,The New United GatewaySM Card
9,"six hundred and ninety-five dollars, zero cents",,,,The New United ClubSM Card


In [88]:
# Display the parsed APR fields of every scraped card as a table.
make_df("apr", card_data)

Unnamed: 0,apr_lower_bound,apr_upper_bound,welcome_apr,welcome_time,name
0,nineteen point nine nine percentages,twenty-eight point four nine percentages,,,The New Chase Sapphire Reserve Credit Card
1,nineteen point seven four percentages,twenty-seven point nine nine percentages,,,Chase Sapphire Preferred Credit Card
2,eighteen point seven four percentages,twenty-eight point two four percentages,zero percentages,fifteen months,Chase Freedom Unlimited Credit Card
3,eighteen point seven four percentages,twenty-eight point two four percentages,zero percentages,fifteen months,Chase Freedom Flex Credit Card
4,twenty-five point seven four percentages,twenty-five point seven four percentages,,,Chase Freedom Rise Credit Card
5,eighteen point seven four percentages,twenty-eight point seven four percentages,zero percentages,eighteen months,Slate Edge Credit Card
6,twenty point two four percentages,twenty-eight point seven four percentages,,,The New UnitedSM Explorer Card
7,twenty point two four percentages,twenty-eight point seven four percentages,,,The New United QuestSM Card
8,twenty point two four percentages,twenty-eight point seven four percentages,zero percentages,twelve months,The New United GatewaySM Card
9,twenty point two four percentages,twenty-eight point seven four percentages,,,The New United ClubSM Card


In [92]:
# Inspect the raw quantities parsed from the first APR string.
# NOTE(review): `aprs` is not assigned in any cell visible in this notebook;
# presumably it came from a deleted cell or leftover kernel state. Confirm,
# or define it explicitly, so this cell survives Restart & Run All.
parser.parse(aprs[0])

[Quantity(19.99, "Unit(name="percentage", entity=Entity("dimensionless"), uri=Percentage)"),
 Quantity(28.49, "Unit(name="percentage", entity=Entity("dimensionless"), uri=Percentage)")]

In [6]:
# Fetch and parse the NerdWallet "best credit cards" page.
# NOTE: this rebinds the notebook-level names `url` and `soup`, which the
# Chase scraping cell also assigns.
url = "https://www.nerdwallet.com/credit-cards/best"
soup = link2soup(url)

In [15]:
# List every <div> with the MuiBox-root class in the current `soup`
# (presumably the NerdWallet page, given the execution order — confirm).
soup.select("div.MuiBox-root")

[<div class="MuiBox-root css-7t7yfb"><svg fill="none" height="40" viewbox="0 0 41 40" width="41" xmlns="http://www.w3.org/2000/svg"><rect fill="#008254" height="40" rx="20" width="40" x="0.5"></rect><path clip-rule="evenodd" d="M20.5015 24.3758L25.0898 27.1509L23.8741 21.9344L27.932 18.4183L22.5915 17.9649L20.4996 13.0285L18.4077 17.9539L13.068 18.4072L17.1259 21.9233L15.9097 27.142L20.5015 24.3758ZM25.8836 27.631C25.8834 27.6309 25.8839 27.6312 25.8836 27.631V27.631ZM20.5 26.1279L25.1074 28.9145C25.9511 29.4252 26.9836 28.6703 26.7616 27.7155L25.5403 22.4753L29.6148 18.9449C30.3586 18.3009 29.959 17.0797 28.982 17.002L23.6197 16.5468L21.5214 11.5953C21.1439 10.696 19.8561 10.696 19.4786 11.5953L17.3803 16.5357L12.018 16.9909C11.041 17.0686 10.6414 18.2898 11.3852 18.9338L15.4597 22.4642L14.2384 27.7044C14.0164 28.6592 15.0489 29.4141 15.8926 28.9034L20.5 26.1279Z" fill="#EFFCF8" fill-rule="evenodd"></path></svg><p class="MuiTypography-root MuiTypography-body1 css-qqg6rj">Best Cards</p

None


In [24]:
# Look for the element tagged data-testid="bcc-product-card-section".
# find() yields None when no such element exists in the fetched HTML, which is
# what the saved output shows.
# NOTE(review): presumably the section is rendered client-side and is absent
# from the static response — confirm.
print(soup.find("div", {"data-testid": "bcc-product-card-section"}))

None
