### **Scraping**

In [None]:
# %pip installs into the running kernel's environment; the version is pinned
# to the one this notebook was originally run with so re-runs are reproducible.
%pip install feedparser==6.0.2

Collecting feedparser
[?25l  Downloading https://files.pythonhosted.org/packages/1c/21/faf1bac028662cc8adb2b5ef7a6f3999a765baa2835331df365289b0ca56/feedparser-6.0.2-py3-none-any.whl (80kB)
[K     |████                            | 10kB 14.0MB/s eta 0:00:01[K     |████████                        | 20kB 19.2MB/s eta 0:00:01[K     |████████████▏                   | 30kB 15.9MB/s eta 0:00:01[K     |████████████████▏               | 40kB 14.0MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 9.3MB/s eta 0:00:01[K     |████████████████████████▎       | 61kB 10.0MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 8.7MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.3MB/s 
[?25hCollecting sgmllib3k
  Downloading https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?2

In [None]:
# %pip installs into the running kernel's environment; the version is pinned
# to the one this notebook was originally run with so re-runs are reproducible.
%pip install newspaper3k==0.2.8

Collecting newspaper3k
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl (211kB)
[K     |█▌                              | 10kB 14.3MB/s eta 0:00:01[K     |███                             | 20kB 20.3MB/s eta 0:00:01[K     |████▋                           | 30kB 12.0MB/s eta 0:00:01[K     |██████▏                         | 40kB 10.6MB/s eta 0:00:01[K     |███████▊                        | 51kB 8.0MB/s eta 0:00:01[K     |█████████▎                      | 61kB 7.6MB/s eta 0:00:01[K     |██████████▉                     | 71kB 8.6MB/s eta 0:00:01[K     |████████████▍                   | 81kB 9.3MB/s eta 0:00:01[K     |██████████████                  | 92kB 8.3MB/s eta 0:00:01[K     |███████████████▌                | 102kB 8.5MB/s eta 0:00:01[K     |█████████████████               | 112kB 8.5MB/s eta 0:00:01[K     |██████████████████▋             | 122kB 8.5

In [None]:
import os
from google.colab import drive

# Attach Google Drive to the Colab filesystem at a fixed mount point.
DRIVE_MOUNT = '/content/gdrive'
drive.mount(DRIVE_MOUNT)

# Project output folder on Drive; it is created on first use and left
# untouched if it already exists.
B11 = os.path.join(DRIVE_MOUNT, 'My Drive', 'B11_2021')
HOMEWORK_FOLDER = os.path.join(B11, 'Project')
os.makedirs(HOMEWORK_FOLDER, exist_ok=True)

Mounted at /content/gdrive


In [None]:
import json

In [None]:
# News sources to scrape, keyed by a short site name. Every entry has a "link"
# (the site's front page); entries that additionally have an "rss" key carry
# the URL of a feed for that site.
dictionary = {
    "cnn": {"link": "http://edition.cnn.com/"},
    "bbc": {
        "rss": "http://feeds.bbci.co.uk/news/rss.xml",
        "link": "http://www.bbc.com/",
    },
    "theguardian": {
        "rss": "https://www.theguardian.com/uk/rss",
        "link": "https://www.theguardian.com/international",
    },
    "breitbart": {"link": "http://www.breitbart.com/"},
    "infowars": {"link": "https://www.infowars.com/"},
    "foxnews": {"link": "http://www.foxnews.com/"},
    "nbcnews": {"link": "http://www.nbcnews.com/"},
    "washingtonpost": {
        "rss": "http://feeds.washingtonpost.com/rss/world",
        "link": "https://www.washingtonpost.com/",
    },
    "theonion": {"link": "http://www.theonion.com/"},
}

In [None]:
# Serialize the source registry as pretty-printed JSON text.
json_object = json.dumps(dictionary, indent=4)

In [None]:
# Save the JSON text to NewsPapers.json in the current working directory.
with open("NewsPapers.json", mode="w") as sources_file:
    sources_file.write(json_object)

In [None]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime

# Cap on the per-site article counter (the counter restarts at 1 for every site).
LIMIT = 14500

# Give up on a site built with newspaper once more than this many articles in
# a row come back without a publish date.
MAX_UNDATED_STREAK = 100


def _scrape_rss(company, value, limit):
    """Collect the articles listed in a site's RSS feed.

    Args:
        company: Site name, used only in progress messages.
        value: The site's entry from NewsPapers.json; must contain the keys
            "rss" and "link".
        limit: Stop as soon as the article counter exceeds this value.

    Returns:
        A dict holding the site's "rss" and "link" URLs plus an "articles"
        list of dicts with the keys "link", "published", "title" and "text".
    """
    feed = fp.parse(value['rss'])
    print("Downloading articles from ", company)
    newsPaper = {
        "rss": value['rss'],
        "link": value['link'],
        "articles": []
    }
    count = 1
    for entry in feed.entries:
        # Entries without a usable publish date are skipped to keep the data
        # consistent. The parsed date itself is checked (not just the raw
        # 'published' string) so that a missing or None value cannot crash
        # the run.
        date = getattr(entry, 'published_parsed', None)
        if date is None:
            continue
        if count > limit:
            break
        article = {}
        article['link'] = entry.link
        try:
            # Build the timestamp straight from the parsed date fields; going
            # through mktime() would reinterpret them in the machine's local
            # timezone and can shift the result under DST.
            article['published'] = datetime(*date[:6]).isoformat()
            content = Article(entry.link)
            content.download()
            content.parse()
        except Exception as e:
            # If the download for some reason fails (ex. 404) the script
            # continues with the next article.
            print(e)
            print("continuing...")
            continue
        article['title'] = content.title
        article['text'] = content.text
        newsPaper['articles'].append(article)
        print(count, "articles downloaded from", company, ", url: ", entry.link)
        count = count + 1
    return newsPaper


def _scrape_site(company, value, limit, max_undated_streak):
    """Collect articles by crawling a site with the newspaper library.

    Args:
        company: Site name, used only in progress messages.
        value: The site's entry from NewsPapers.json; must contain "link".
        limit: Stop as soon as the article counter exceeds this value. The
            counter also advances for articles skipped for lacking a date.
        max_undated_streak: Abort the site once more than this many
            consecutive articles have no publish date.

    Returns:
        A dict holding the site's "link" URL plus an "articles" list of dicts
        with the keys "title", "text", "link" and "published".
    """
    print("Building site for ", company)
    paper = newspaper.build(value['link'], memoize_articles=False)
    newsPaper = {
        "link": value['link'],
        "articles": []
    }
    count = 1
    noneTypeCount = 0
    for content in paper.articles:
        if count > limit:
            break
        try:
            content.download()
            content.parse()
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        # Again, for consistency, articles without a publish date are skipped.
        if content.publish_date is None:
            print(count, " Article has date of type None...")
            noneTypeCount = noneTypeCount + 1
            if noneTypeCount > max_undated_streak:
                print("Too many noneType dates, aborting...")
                break
            count = count + 1
            continue
        article = {}
        article['title'] = content.title
        article['text'] = content.text
        article['link'] = content.url
        article['published'] = content.publish_date.isoformat()
        newsPaper['articles'].append(article)
        print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
        count = count + 1
        # A dated article ends the current streak of undated ones.
        noneTypeCount = 0
    return newsPaper


data = {}
data['newspapers'] = {}

# Loads the JSON file with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

# Iterate through each news company
for company, value in companies.items():
    # A site is scraped through its RSS feed whenever its entry has an "rss"
    # key, since RSS feeds often give more consistent and correct data. To
    # force the newspaper-based fallback, remove the "rss" key from the JSON
    # file (an empty value still selects the RSS path).
    if 'rss' in value:
        data['newspapers'][company] = _scrape_rss(company, value, LIMIT)
    else:
        data['newspapers'][company] = _scrape_site(
            company, value, LIMIT, MAX_UNDATED_STREAK
        )

# Finally it saves the articles as a JSON-file. A failure here is reported
# but does not raise.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e:
    print(e)

Building site for  cnn
1  Article has date of type None...
2  Article has date of type None...
3  Article has date of type None...
4  Article has date of type None...
5  Article has date of type None...
6  Article has date of type None...
7  Article has date of type None...
8  Article has date of type None...
9  Article has date of type None...
10  Article has date of type None...
11  Article has date of type None...
12  Article has date of type None...
13  Article has date of type None...
14  Article has date of type None...
15  Article has date of type None...
16  Article has date of type None...
17  Article has date of type None...
18  Article has date of type None...
19  Article has date of type None...
20  Article has date of type None...
21  Article has date of type None...
22  Article has date of type None...
23  Article has date of type None...
24  Article has date of type None...
25  Article has date of type None...
26  Article has date of type None...
27  Article has date of 

In [None]:
# Read the scraped articles back from scraped_articles.json.
with open('scraped_articles.json') as scraped_file:
    d = json.load(scraped_file)

In [None]:
# Print every scraped site name next to its position in the file.
for i, site in enumerate(d['newspapers']):
    print(i, site)

0 cnn
1 bbc
2 theguardian
3 breitbart
4 infowars
5 foxnews
6 nbcnews
7 washingtonpost
8 theonion


In [None]:
import pandas as pd
# Build one DataFrame per site, tagged with a "site" column, and concatenate
# them in a single call rather than growing `df` inside the loop.
site_frames = [
    pd.DataFrame(site_data['articles']).assign(site=site)
    for site, site_data in d['newspapers'].items()
]
# pd.concat raises on an empty list, so fall back to an empty frame when the
# file contains no sites at all (previously `df` was simply never assigned).
df = (
    pd.concat(site_frames, ignore_index=True)
    if site_frames
    else pd.DataFrame(columns=["site"])
)

In [None]:
df.shape

(1438, 5)

In [None]:
df

Unnamed: 0,title,text,link,published,site
0,"Eyewitnesses recount bloody crackdown in Bago,...",At least 82 anti-coup protestors were killed b...,http://edition.cnn.com/videos/world/2021/04/16...,2021-04-16T00:00:00,cnn
1,This Welsh river turned white due to a milk spill,Photos and videos of the River Dulais in Wales...,http://edition.cnn.com/videos/world/2021/04/16...,2021-04-16T00:00:00,cnn
2,Hong Kong police showcase 'Chinese-style goose...,Hong Kong marked the first National Security E...,http://edition.cnn.com/videos/world/2021/04/16...,2021-04-16T00:00:00,cnn
3,"In Brazil, coronavirus killed 3 people every m...","Experts warn Brazil could soon suffer an ""unim...",http://edition.cnn.com/videos/world/2021/04/16...,2021-04-16T00:00:00,cnn
4,New sanctions imposed on Russia in response to...,The Biden administration targeted Russia with ...,http://edition.cnn.com/videos/politics/2021/04...,2021-04-15T00:00:00,cnn
...,...,...,...,...,...
1433,12 Steps to Starting a Small Business,Getty Images\n\nBeing your own boss can be imm...,https://www.nbcnews.com/veteran-services/next-...,2019-05-02T18:39:00+00:00,nbcnews
1434,Military families say this is their top concern,Members of the military face hurdles every day...,https://www.nbcnews.com/veteran-services/next-...,2019-05-28T17:17:00+00:00,nbcnews
1435,CNBC AND ACORNS ANNOUNCE STRATEGIC PARTNERSHIP,CNBC TO PROVIDE EDITORIAL AND PRODUCTION EXPER...,https://www.cnbc.com/2019/01/28/cnbc-and-acorn...,2019-01-28T00:00:00,nbcnews
1436,'Captain Tom': Funeral held for U.K. war veter...,"LONDON — ""I told you I was old,"" will be the e...",https://www.nbcnews.com/news/world/captain-tom...,2021-02-27T16:19:00+00:00,nbcnews


In [None]:
# Copy the results into the Drive project folder; the destination comes from
# HOMEWORK_FOLDER instead of repeating the path by hand.
!cp scraped_articles.json "{HOMEWORK_FOLDER}/"