In [1]:
# 1. Imports and topic setup
import os

from newsapi import NewsApiClient
from newsplease import NewsPlease
from sqlalchemy import create_engine, ForeignKey, Column, Integer, String, select
from sqlalchemy.orm import declarative_base, Session

# The NewsAPI key is a secret: read it from the environment instead of
# committing it with the notebook. Set NEWSAPI_KEY before starting Jupyter.
# NOTE(review): the key that used to be hardcoded here should be rotated.
NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY')
if not NEWSAPI_KEY:
    raise RuntimeError('Set the NEWSAPI_KEY environment variable before running this notebook.')

newsapi = NewsApiClient(api_key=NEWSAPI_KEY)

# Search terms, one NewsAPI query per entry. Blank lines separate loose themes.
topic_list = [
    'crypto',
    'bitcoin',

    'covid',
    'vaccine',

    'startups',
    'technology',
    'e-commerce',

    'Nokia',
    'Apple',
    'Microsoft',
]

In [2]:
# 2. Fetch the first page (up to 100 English results) for every chosen topic.
# Free NewsAPI accounts cannot read past result number 100,
# nor can they access articles published more than about a month ago.
all_articles = {}
for topic in topic_list:
    all_articles[topic] = newsapi.get_everything(
        q=topic,
        page_size=100,
        page=1,
        language='en',
    )
print(all_articles.keys())

dict_keys(['crypto', 'bitcoin', 'covid', 'vaccine', 'startups', 'technology', 'e-commerce', 'Nokia', 'Apple', 'Microsoft'])


In [3]:
# 3. Sanity check: per topic, the total number of matches reported by the API
#    (first line) and the number of articles actually downloaded (second line).
print(' '.join(str(all_articles[topic]['totalResults']) for topic in topic_list))
print(' '.join(str(len(all_articles[topic]['articles'])) for topic in topic_list))

11084 12532 220847 63229 5609 103643 12587 977 37099 19923
100 100 100 100 100 100 100 100 100 100


In [4]:
# 4. Map each topic to the list of URLs of its downloaded articles.
topic_urls = {
    topic: [article['url'] for article in all_articles[topic]['articles']]
    for topic in topic_list
}

print(topic_urls.keys())

dict_keys(['crypto', 'bitcoin', 'covid', 'vaccine', 'startups', 'technology', 'e-commerce', 'Nokia', 'Apple', 'Microsoft'])


In [5]:
# Spot check: scrape the first 'bitcoin' URL with news-please, then show which
# fields were extracted and the parsed publication date.
test_url = topic_urls['bitcoin'][0]
article = NewsPlease.from_url(test_url)
print(article.get_dict().keys(), article.date_publish, sep='\n')

dict_keys(['authors', 'date_download', 'date_modify', 'date_publish', 'description', 'filename', 'image_url', 'language', 'localpath', 'maintext', 'source_domain', 'text', 'title', 'title_page', 'title_rss', 'url'])
2021-05-12 18:17:08




In [6]:
# Second spot check against a different topic: scrape one 'Apple' URL and list
# the fields news-please extracted for it.
test_url2 = topic_urls['Apple'][15]
article2 = NewsPlease.from_url(test_url2)
print(article2.get_dict().keys())
# print(article2.maintext)  # uncomment to inspect the extracted body text

dict_keys(['authors', 'date_download', 'date_modify', 'date_publish', 'description', 'filename', 'image_url', 'language', 'localpath', 'maintext', 'source_domain', 'text', 'title', 'title_page', 'title_rss', 'url'])


In [14]:
# 5. Create Article classes.
# Shared declarative base: the ORM models in this cell subclass it, and its
# metadata is what create_all() later uses to build the tables.
Base = declarative_base()

class Article(Base):
    """ORM model for one scraped article's metadata (table ``articles``).

    Every column other than the integer primary key is stored as a string,
    including ``publication_date``.
    """
    __tablename__ = "articles"

    id = Column(Integer, primary_key=True)
    parent_topic = Column(String)      # search topic the article was found under
    source = Column(String)
    title = Column(String)
    description = Column(String)
    publication_date = Column(String)  # kept as text, not as a DateTime column
    url = Column(String)

    def __init__(self, parent_topic, source, title, description, publication_date, url):
        """Populate all metadata columns; ``id`` is left for the database to assign."""
        self.parent_topic = parent_topic
        self.source = source
        self.title = title
        self.description = description
        self.publication_date = publication_date
        self.url = url

    def __str__(self):
        """Return a one-line summary with topic, title, description and url."""
        # Fields are comma-separated throughout; the original omitted the comma
        # between description and url.
        return (
            f'Article(topic={self.parent_topic}, title={self.title}, '
            f'description={self.description}, url={self.url})'
        )


class ArticleContent(Base):
    """ORM model holding the body text of an article (table ``articles_content``).

    ``article_id`` is a required foreign key to ``articles.id`` declared with
    ``ON DELETE CASCADE``.
    """
    __tablename__ = "articles_content"

    id = Column(Integer, primary_key=True)
    article_id = Column(Integer,
                        ForeignKey("articles.id", ondelete='CASCADE'),
                        nullable=False
                        )
    content = Column(String)  # nullable: an article may have no stored body text

    def __init__(self, article_id, content):
        """Store the owning article's integer primary key and the body text."""
        self.article_id = article_id
        self.content = content

    def __str__(self):
        """Return the stored body text, or an empty string when there is none."""
        # __str__ must return a str; returning None would make str()/print() raise TypeError.
        return self.content or ''
        
        
class ArticleFeatures(Base):
    """ORM model for keyword features of an article (table ``articles_features``).

    ``article_id`` is a required foreign key to ``articles.id`` declared with
    ``ON DELETE CASCADE``.
    """
    __tablename__ = "articles_features"

    id = Column(Integer, primary_key=True)
    article_id = Column(Integer,
                        ForeignKey("articles.id", ondelete='CASCADE'),
                        nullable=False
                        )
    keywords_pca = Column(String)
    keywords_nmf = Column(String)

    def __init__(self, keywords_pca, keywords_nmf, article_id=None):
        """Store both keyword strings and, optionally, the owning article's id.

        ``article_id`` is optional only to stay compatible with existing
        two-argument calls. The column is NOT NULL, so it must be set (here or
        on the instance afterwards) before the row is inserted.
        """
        self.keywords_pca = keywords_pca
        self.keywords_nmf = keywords_nmf
        self.article_id = article_id

    def __str__(self):
        """Return both keyword strings on two labelled lines."""
        return f'pca: {self.keywords_pca}\n nmf: {self.keywords_nmf}'

In [15]:
# 6. Connect to the local SQLite file and create any mapped tables that do not exist yet.
engine = create_engine("sqlite:///NewsAPI_articles.db")
Base.metadata.create_all(bind=engine)

In [16]:
# 7. Scrape every collected URL and store its metadata and body text.
# NOTE: nothing here checks for existing rows, so re-running this cell inserts
# the same articles again.
with Session(engine) as session:
    for topic in topic_list:
        for url in topic_urls[topic]:
            try:
                article = NewsPlease.from_url(url)
            except Exception as exc:
                # Best-effort crawl: one failed download must not abort the run,
                # but say what was skipped. Catching Exception rather than using
                # a bare except keeps KeyboardInterrupt working.
                print(f'skipping {url}: {exc!r}')
                continue
            if not article:
                # presumably from_url can return an empty result instead of
                # raising when a download fails -- TODO confirm
                print(f'skipping {url}: no article extracted')
                continue

            published = article.date_publish
            mdl_article = Article(
                parent_topic=topic,
                source=article.source_domain,
                title=article.title,
                description=article.description,
                # The column is a String, so store text explicitly rather than
                # relying on the DB driver to adapt a datetime object.
                publication_date=str(published) if published is not None else None,
                url=url,
            )
            session.add(mdl_article)
            # Flush so the database assigns mdl_article.id; the content row
            # needs that integer as its foreign key, not the Article object.
            session.flush()
            session.add(ArticleContent(
                article_id=mdl_article.id,
                content=article.maintext,
            ))
    session.commit()

not a 200 response: 404
not a 200 response: 503
not a 200 response: 404
not a 200 response: 503


In [17]:
# Read back every stored Article and print its one-line summary.
row = []  # pre-bound so the name exists even if the query below raises
with Session(engine) as session:
    row = session.execute(select(Article)).scalars().all()
    for article in row:
        print(article)

Article(topic=crypto, title=Multicoin Capital debuts new $100M fund to bet on crypto startups and tokens, url=http://techcrunch.com/2021/05/04/multicoin-capital-debuts-100-million-venture-fund-to-bet-on-crypto-startups-and-tokens/)
Article(topic=crypto, title=Crypto exchange Binance faces US money laundering probe, url=https://www.engadget.com/crypto-exchange-binance-money-laundering-140037194.html)
Article(topic=crypto, title=Most new NVIDIA RTX gaming GPUs will be crypto-limited, url=https://www.engadget.com/nvidia-rtx-gpu-crypto-limit-122007238.html)
Article(topic=crypto, title=Bitcoin crashes as investors fear crypto bull market could be nearing its end, url=http://techcrunch.com/2021/05/19/bitcoin-crashes-as-investors-fear-crypto-bull-market-could-be-nearing-its-end/)
Article(topic=crypto, title=blockchain fund backed by Bpifrance, Ubisoft – TechCrunch, url=http://techcrunch.com/2021/05/20/white-star-capital-launches-new-50m-crypto-blockchain-fund-backed-by-bpifrance-ubisoft/)
Art

In [18]:
# Number of Article rows held in `row`.
print(len(row))

996
