In [2]:
from newspaper import build
from newspaper import Article
from textblob import TextBlob
import nltk
from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
import schedule
import time

# Fetch the NLTK "punkt" tokenizer data; the call reports "already up-to-date"
# when the package is present locally.
# Presumably required by Article.nlp() and/or TextBlob used below -- TODO confirm.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /home/fox/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Outlets to crawl. Every entry carries the same four keys:
#   outlet   - display name, printed and stored with each collected row
#   url      - front page handed to newspaper.build()
#   slugs    - substrings; an article URL containing any of them is rejected
#   bad_urls - exact article URLs that must be rejected
# NOTE(review): "blabla" looks like a placeholder -- replace with real values.
fave_news = [
    dict(
        outlet="The Wall Street Journal",
        url="https://www.wsj.com/",
        slugs=["blabla"],
        bad_urls=["blabla"],
    ),
    dict(
        outlet="The Financial Times",
        url="https://www.ft.com/",
        slugs=["blabla"],
        bad_urls=["blabla"],
    ),
    dict(
        outlet="Le Monde",
        url="https://www.lemonde.fr/",
        slugs=["blabla"],
        bad_urls=["blabla"],
    ),
    dict(
        outlet="The China Daily",
        url="https://www.chinadaily.com.cn/",
        slugs=["blabla"],
        bad_urls=["blabla"],
    ),
]


# Shared crawl state, created empty each time this cell is executed.
date = f"{datetime.today():%Y-%m-%d}"  # day this cell ran, as YYYY-MM-DD
data, bad_urls = [], []  # collected article rows / rejected article URLs
urls_set = set()  # article URLs that have already been accepted


def _select_article_urls(item, candidates, max_articles):
    """Pick at most ``max_articles`` new, acceptable article URLs for one outlet.

    A candidate is skipped when its URL was already accepted (``urls_set``).
    It is rejected -- and remembered once in the global ``bad_urls`` -- when
    its URL is listed in ``item["bad_urls"]`` or contains any of
    ``item["slugs"]``. Accepted URLs are added to ``urls_set``.

    Args:
        item: one ``fave_news`` entry (reads ``"bad_urls"`` and ``"slugs"``).
        candidates: iterable of objects exposing a ``.url`` attribute.
        max_articles: maximum number of URLs to return.

    Returns:
        List of accepted URLs, in the order they were encountered.
    """
    selected = []
    for candidate in candidates:
        if len(selected) >= max_articles:
            break

        url = candidate.url
        if url in urls_set:
            # Already collected earlier (this run or a previous one).
            continue

        rejected = url in item["bad_urls"] or any(
            slug in url for slug in item["slugs"]
        )
        if rejected:
            # Record each rejected URL only once, so repeated (scheduled)
            # crawls do not keep growing the list with duplicates.
            if url not in bad_urls:
                bad_urls.append(url)
            continue

        urls_set.add(url)
        selected.append(url)
    return selected


def crawl(max_articles=3):
    """Crawl every outlet in ``fave_news`` and append analysed articles to ``data``.

    For each outlet the front page is scanned with ``newspaper.build``, up to
    ``max_articles`` not-yet-seen article URLs are selected, and each one is
    downloaded, parsed and run through ``Article.nlp()``. One row
    ``[date, news_outlet, url, title, text, keywords, polarity, subjectivity]``
    is appended to the global ``data`` per successfully processed article.
    A progress report is printed after each outlet.

    An outlet entry may carry an optional ``"language"`` key (default ``"en"``)
    that is forwarded to ``Article``.

    Args:
        max_articles: maximum number of new articles taken per outlet.

    Returns:
        None. Results accumulate in the globals ``data``, ``urls_set`` and
        ``bad_urls``.
    """
    print("\nI'm working...")

    # Stamp rows with the day this crawl actually runs; a value computed once
    # at cell-execution time goes stale when crawl() is scheduled daily.
    crawl_date = datetime.today().strftime("%Y-%m-%d")

    for item in fave_news:
        news_outlet = item["outlet"]
        print("\nnewspaper: ", news_outlet)

        paper = build(
            item["url"], memoize_articles=False, fetch_images=False, MIN_WORD_COUNT=400
        )
        article_urls = _select_article_urls(item, paper.articles, max_articles)
        language = item.get("language", "en")

        ### Retrieve metadata and text for each article ###

        for url in article_urls:
            my_article = Article(url, language=language)

            try:
                my_article.download()
                my_article.parse()
                my_article.nlp()
            except Exception as err:
                # Best-effort: skip the article but report why it failed.
                # `Exception` (not a bare except) lets Ctrl-C / kernel
                # interrupt still stop the crawl.
                print("\narticle failed to download")
                print(url)
                print("reason: ", err)
                continue

            title = my_article.title
            text = TextBlob(my_article.text)
            keywords = my_article.keywords

            # Subjectivity lies within [0, 1] and refers to personal opinions
            # and judgments. Polarity lies within [-1, 1], where -1 is negative
            # and +1 is positive sentiment.
            # NOTE(review): TextBlob's default sentiment analysis presumably
            # targets English text -- confirm before trusting scores for
            # non-English outlets.
            polarity = text.sentiment.polarity
            subjectivity = text.sentiment.subjectivity

            print("\ntitle: ", title)
            print("url: ", url)

            data.append(
                [
                    crawl_date,
                    news_outlet,
                    url,
                    title,
                    text,
                    keywords,
                    polarity,
                    subjectivity,
                ]
            )

        # Per-outlet progress report.
        print("\ntime: ", datetime.now())
        print("\nnumber of articles in the database: ", len(data))
        print("\nbad urls:")
        for rejected_url in bad_urls:
            print(rejected_url)


# Optional scheduling (currently disabled): uncomment ONE of the two
# `schedule` lines together with the loop below to run `crawl` repeatedly
# instead of once. The `while True` loop never returns on its own, so the
# cell would keep running until the kernel is interrupted.
# schedule.every().day.at("21:00").do(crawl)
# schedule.every(1).minutes.do(crawl)

# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

# Single manual run.
crawl()


I'm working...

newspaper:  The Wall Street Journal

article failed to download
https://www.wsj.com/news/latest-headlines?mod=wsjheader

article failed to download
https://www.wsj.com/news/world?mod=nav_top_section

article failed to download
https://www.wsj.com/news/types/africa-news?mod=nav_top_subsection

time:  2022-09-27 17:01:23.821510

number of articles in the database:  0

bad urls:

newspaper:  The Financial Times

title:  Financial Times
url:  https://www.ft.com/companies/media

title:  Bitcoin mining: Watt is money? | FT Standpoint
url:  https://www.ft.com/video/4195bbed-8749-481e-a3a4-94815057df5f

title:  Will Tesla's Optimus robot become a reality? | FT Tech
url:  https://www.ft.com/video/51e4bfd4-9d50-43a7-afa1-53030dcf65fc

time:  2022-09-27 17:01:29.985632

number of articles in the database:  3

bad urls:

newspaper:  Le Monde

title:  La livre sterling tombe à un plus-bas historique face au dollar
url:  https://www.lemonde.fr/economie/article/2022/09/27/la-livre-st

In [4]:
# Inspect one stored row. Row layout:
# [date, news_outlet, url, title, text, keywords, polarity, subjectivity]
i = 1
record = data[i]
for column in (1, 3, 4):  # news_outlet, title, text
    print(record[column])

The Financial Times
Financial Times
Cookies on FT Sites

We use cookies and other data for a number of reasons, such as keeping FT Sites reliable and secure, personalising content and ads, providing social media features and to analyse how our Sites are used.
The Financial Times
Bitcoin mining: Watt is money? | FT Standpoint
Explainer

Bitcoin mining: Watt is money? | FT Standpoint

Using exclusive data from the Cambridge Centre for Alternative Finance, Rhizomatiks gives fresh insights into the impact of the vast energy used to mine cryptocurrency - a business that consumes the same electricity as a medium-sized country. This film, with original music, was made to be viewed as a live video installation and as an online film
The Financial Times
Will Tesla's Optimus robot become a reality? | FT Tech
You can enable subtitles (captions) in the video player

[MUSIC PLAYING]

RICHARD WATERS: And when he first talked about this, he had a human dressed up as a robot dancing on stage. And I thi