In [1]:
from bs4 import BeautifulSoup
import json
import requests

In [3]:
# Fetch the news landing page.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() turns an HTTP error status into an exception here
# rather than letting an error page flow into the parsing cells below.
response = requests.get("https://cointopper.com/news", timeout=30)
response.raise_for_status()
response.content



In [4]:
# Turn the raw response bytes into text, interpreting them as UTF-8.
decoded_content = str(response.content, encoding="utf-8")
decoded_content



In [6]:
# Build a parse tree from the decoded page using the "html.parser" backend.
soup = BeautifulSoup(markup=decoded_content, features="html.parser")

In [14]:
# Find the <div> elements carrying the grid-column class string; the article
# links are pulled out of these containers in the next step.
target_content = soup.find_all("div", class_="col-xs-12 col-sm-6 col-md-4")

In [15]:
# Gather the href of every <a> found inside the selected containers,
# in document order.
url_list = [
    anchor["href"]
    for container in target_content
    for anchor in container.find_all("a")
]

url_list

['https://cointopper.com/news/federal-court-dismisses-lawsuit-against-defi-platform-pooltogether',
 'https://cointopper.com/news/openais-sam-altman-set-to-meet-indian-prime-minister-today',
 'https://cointopper.com/news/unveiling-the-dark-side-of-ai-misuse-and-its-profound-implications',
 'https://cointopper.com/news/the-us-financial-services-committee-sets-date-to-discuss-the-future-of-crypto',
 'https://cointopper.com/news/robinhood-faces-potential-delisting-of-prominent-cryptocurrencies',
 'https://cointopper.com/news/unveiling-the-crypto-revolution-discover-the-latest-trends-and-game-changing-insights-from-risk-off-capital-rotation-week-23-2023',
 'https://cointopper.com/news/what-you-need-to-know-about-the-uk-labour-partys-call-for-licensing-and-regulation-of-ai',
 'https://cointopper.com/news/cryptocurrency-giants-rally-in-support-as-sec-targets-binance-uniting-to-overcome-legal-challenges',
 'https://cointopper.com/news/is-mufgs-deployment-of-stablecoins-on-public-blockchains-th

On the first load, the website shows only 9 articles; pressing the "load more" button makes more articles visible. After inspecting the network traffic triggered by clicking the "load more" button, we found that it sends the following GET request to the server: https://cointopper.com/ajax/news?offset=9

In this request, note the parameter "offset": on the next load its value becomes 18, then 27, and so on, so we can use the same request to get the remaining content as well.

Also note that the first load returns HTML content, whereas these follow-up requests return JSON (delivered as bytes), so after the first load we need to change our extraction method.

Now we will work on the algorithm for getting the data after the initial load.

In [55]:
# Request the batch of articles that the "load more" button would fetch first.
offset_value = 9
request_url = f"https://cointopper.com/ajax/news?offset={offset_value}"
# The timeout keeps the cell from blocking forever on a stalled connection;
# raise_for_status() reports an HTTP error here instead of letting a
# non-JSON error body fail later during parsing.
response = requests.get(request_url, timeout=30)
response.raise_for_status()

In [56]:
# The response body is JSON delivered as bytes: decode it as UTF-8 text,
# then parse that text into Python objects.
decoded_content = str(response.content, encoding="utf-8")
json_data = json.loads(decoded_content)
json_data

{'data': '<div class="col-xs-12 col-sm-6 col-md-4">\n                    <a href="https://cointopper.com/news/revolutionizing-interbank-settlements-can-jpmorgan-and-6-indian-banks-transform-the-finance-industry-with-blockchain">\n                        <div class="blog-list-wrap">\n                            <div class="stories-img">\n                                                                    <img src="https://cointopper.com/uploads/topics/thumb/revolutionizing-interbank-settlements-can-jpmorgan-and-6-indian-banks-transform-the-finance-industry-with-blockchain-16859621695007.jpg"\n                                         alt="Revolutionizing Interbank Settlements: Can JPMorgan and 6 Indian Banks Transform the Finance Industry with Blockchain?"/>\n                                                            </div>\n                            <div class="stories-con clearfix">\n                                <h4> Revolutionizing Interbank Settlements: Can JPMorgan and 6 India

If you observe the JSON data here, there are two elements inside the object: data and counts. What we need is data; note that data is actually HTML, so we will have to parse that too.

In [39]:
# Pull the HTML fragment out of the parsed JSON; None when the key is absent.
data = json_data.get("data", None)

In [40]:
# Parse the HTML fragment taken from the JSON payload, then collect the href
# of each <a> inside the grid-column <div> elements.
soup = BeautifulSoup(data, "html.parser")
elements = soup.find_all("div", {"class": "col-xs-12 col-sm-6 col-md-4"})
article_url_list = [
    anchor["href"]
    for column in elements
    for anchor in column.find_all("a")
]
article_url_list


['https://cointopper.com/news/mt-gox-offloads-230-million-towards-rehabilitation-of-bankrupt-investors',
 'https://cointopper.com/news/three-pro-crypto-bills-to-debut-in-the-us-parliament',
 'https://cointopper.com/news/rbi-strengthens-stand-on-crypto-trading-ban',
 'https://cointopper.com/news/a-bullish-wave-in-the-crypto-market-ripple-price-surges-by-50',
 'https://cointopper.com/news/crypto-exchange-poloniex-to-delist-eight-crypto-tokens-from-its-platform',
 'https://cointopper.com/news/sagar-sarbhai-says-xrp-piloted-with-12-banks-worldwide-before-xrapid-launch',
 'https://cointopper.com/news/chinese-1-billion-blockchain-fund-to-raise-13-million-for-japanese-yen-stablecoin',
 'https://cointopper.com/news/worlds-second-largest-stock-exchange-to-acquire-swedish-fintech-cinnober',
 'https://cointopper.com/news/morgan-stanley-plans-to-offer-bitcoin-swaps-to-its-clients']

Now that we have established the basic algorithm, we will use it in a loop to iterate through all the requests and collect all URLs on the website.

In [2]:
# Walk the paginated AJAX endpoint until it stops returning article HTML,
# collecting every article link along the way.
#
# NOTE(review): the walk starts at offset 9, so the nine articles served with
# the initial page load (collected earlier in `url_list`) are not part of
# `article_url_list` -- confirm whether they should be merged in.
PAGE_SIZE = 9         # the offset grows by 9 with each "load more" request
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled request hangs the loop

offset_value = PAGE_SIZE
is_data_available = True
article_url_list = list()

while is_data_available:
    try:
        request_url = f"https://cointopper.com/ajax/news?offset={offset_value}"
        print(f"Accessing URL: {request_url}")
        response = requests.get(request_url, timeout=REQUEST_TIMEOUT)
        # Report HTTP errors as such instead of as a confusing JSON parse failure.
        response.raise_for_status()
        decoded_content = response.content.decode("utf-8")
        json_data = json.loads(decoded_content)
        data = json_data.get("data")
        if data:
            soup = BeautifulSoup(data, "html.parser")
            elements = soup.find_all("div", class_="col-xs-12 col-sm-6 col-md-4")
            for element in elements:
                # href=True skips anchors that have no href attribute; indexing
                # such an anchor would raise KeyError and, through the handler
                # below, end the whole crawl early.
                for link in element.find_all("a", href=True):
                    article_url_list.append(link["href"])
            offset_value += PAGE_SIZE
        else:
            # An empty or missing "data" field is taken as the end of the listing.
            is_data_available = False
    except Exception as e:
        # Best effort: report the problem and stop, keeping the URLs gathered so far.
        print(f"Couldn't get data, Error: {e}")
        is_data_available = False
print(article_url_list)

Accessing URL: https://cointopper.com/ajax/news?offset=9
Accessing URL: https://cointopper.com/ajax/news?offset=18
Accessing URL: https://cointopper.com/ajax/news?offset=27
Accessing URL: https://cointopper.com/ajax/news?offset=36
Accessing URL: https://cointopper.com/ajax/news?offset=45
Accessing URL: https://cointopper.com/ajax/news?offset=54
Accessing URL: https://cointopper.com/ajax/news?offset=63
Accessing URL: https://cointopper.com/ajax/news?offset=72
Accessing URL: https://cointopper.com/ajax/news?offset=81
Accessing URL: https://cointopper.com/ajax/news?offset=90
Accessing URL: https://cointopper.com/ajax/news?offset=99
Accessing URL: https://cointopper.com/ajax/news?offset=108
Accessing URL: https://cointopper.com/ajax/news?offset=117
Accessing URL: https://cointopper.com/ajax/news?offset=126
Accessing URL: https://cointopper.com/ajax/news?offset=135
Accessing URL: https://cointopper.com/ajax/news?offset=144
Accessing URL: https://cointopper.com/ajax/news?offset=153
Accessing

In [3]:
# Display the URLs gathered by the pagination loop.
article_url_list

['https://cointopper.com/news/revolutionizing-interbank-settlements-can-jpmorgan-and-6-indian-banks-transform-the-finance-industry-with-blockchain',
 'https://cointopper.com/news/protect-yourself-cardano-founder-warns-you-about-new-ada-scam-tactics',
 'https://cointopper.com/news/atomic-wallet-hit-by-massive-hack-35-million-worth-of-cryptocurrency-stolen',
 'https://cointopper.com/news/mark-cuban-takes-publishing-to-the-next-level-releases-book-as-nft-on-polygon-network',
 'https://cointopper.com/news/dont-fall-for-phishing-scams-learn-from-peter-schiffs-twitter-hack-and-protect-yourself-online',
 'https://cointopper.com/news/revolutionizing-finance-first-digital-introduces-fdusd-stablecoin-on-bnb-chain',
 'https://cointopper.com/news/safeguard-your-investments-discover-the-latest-flare-network-discord-hack-update',
 'https://cointopper.com/news/the-million-dollar-surprise-crypto-influencers-empty-promise-flooded-with-11m-in-cryptocurrencies',
 'https://cointopper.com/news/africas-curr

Now we have the list of URLs of all the articles on cointopper, so we will now take a look inside the articles and see how we can scrape the required data. The following data is required:
- Title
- Image Link
- Post Content
- Date

In [47]:
# Take the first collected URL as a sample article and display it.
post_link = article_url_list[0]
post_link

'https://cointopper.com/news/revolutionizing-interbank-settlements-can-jpmorgan-and-6-indian-banks-transform-the-finance-industry-with-blockchain'

In [5]:

# Scrape a single article page: title and image URL (from the <img> tags inside
# the "post-image" block), the date (from the first <a> in that block) and the
# body text (from <p> tags inside <div dir="ltr"> containers).
response = requests.get(
    "https://cointopper.com/news/you-can-overcome-information-overload-vitalik-buterins-guide",
    timeout=30,  # never wait indefinitely on a stalled connection
)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Start from a known state so the completeness check below can neither raise
# NameError on a page that lacks a field nor pick up values left in the kernel
# by an earlier run.
title = None
image_link = None
date = None

# getting title and image url
for post_image in soup.find_all("div", class_="post-image"):
    for image in post_image.find_all("img"):
        # .get() yields None for a missing attribute instead of raising KeyError.
        title = image.get("title")
        image_link = image.get("src")
        print(f"title: {title}")
        print(f"image url: {image_link}")
    # getting date
    date_anchor = post_image.find("a")
    if date_anchor is not None:  # find() returns None when the block has no <a>
        # Use the text of the first child of the link that is non-empty.
        for child in date_anchor:
            date = child.get_text()
            print(f"date: {date}")
            if date:
                break
    if date is not None and title is not None and image_link is not None:
        break

# getting the content
# Remove embedded tweets (<blockquote class="twitter-tweet">) before the text
# is extracted.
for container in soup.find_all("div", dir="ltr"):
    for tweet in container.find_all("blockquote", class_="twitter-tweet"):
        tweet.decompose()

# Query again after the removal so the list holds only elements still in the tree.
elements = soup.find_all("div", dir="ltr")
print(elements)
# Paragraph texts are joined with no separator.
# NOTE(review): adjacent paragraphs therefore run together -- confirm whether a
# newline or space separator is wanted.
content = "".join(
    paragraph.get_text()
    for element in elements
    for paragraph in element.find_all("p")
)

title: You Can Overcome Information Overload: Vitalik Buterin's Guide
image url: https://cointopper.com/uploads/topics/you-can-overcome-information-overload-vitalik-buterins-guide-16849115802312.png
date: May 24, 2023
[<div dir="ltr">- The article on the website vitalik.ca titled "<a href="https://vitalik.ca/general/2023/05/21/dont_overload.html">Don't Overload</a>!" discusses the concept of information overload and its negative impact on decision-making.<br/>- The author, Vitalik Buterin, emphasizes the need to avoid overwhelming oneself with excessive information in today's fast-paced digital age.<br/>- Information overload is described as a consequence of the abundance of information available online, leading to decreased productivity and increased stress levels.<br/>- The article mentions the importance of managing one's attention effectively and filtering out irrelevant or low-value information.<br/>- Vitalik Buterin suggests setting boundaries and consciously limiting the amount 

There are some articles whose webpage structure differs from the usual one. They are the following:
1. https://cointopper.com/news/you-can-overcome-information-overload-vitalik-buterins-guide
    - In this webpage, the content is essentially a bullet list with "-" as the bullet symbol. In this structure there are no <p> tags, but most of the webpages are regular, so we don't need to modify the code to handle this case. Instead, we will manually add the content to the db.
    - Also, we need to discuss in what form this kind of data is required, because once it passes through our code it will be stripped of all HTML markup.