In [1]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd

In [2]:
# Set the limit for number of articles to download
LIMIT = 4
data = {}
data['newspapers'] = {}


In [3]:
# Loads the JSON files with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

count = 1

In [5]:
# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that, RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS-feed, just leave the RSS attr empty in the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.
            # After 10 downloaded articles from the same newspaper without publish date, the company will be skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 10:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper

# Finally it saves the articles as a JSON-file.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)

except Exception as e: print(e)






Building site for  cnn
1  Article has date of type None...
2  Article has date of type None...
Downloading articles from  bbc
1 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-53091856
2 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-53093127
3 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-scotland-53083995
4 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-politics-53093244
Downloading articles from  theguardian
1 articles downloaded from theguardian , url:  https://www.theguardian.com/politics/2020/jun/18/dominic-raab-taking-the-knee-feels-like-symbol-of-subjugation
2 articles downloaded from theguardian , url:  https://www.theguardian.com/education/2020/jun/18/campaign-doesnt-end-with-rhodes-statue-says-oxford-group
3 articles downloaded from theguardian , url:  https://www.theguardian.com/uk-news/2020/jun/18/blm-protests-prompt-edinburgh-to-reassess-fate-of-golliwog-mural
4 articles downloaded from thegua

In [9]:
# Finally it saves the articles as a JSON-file.
try:
    with open('scraped_articles.csv', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e: print(e)




In [None]:
'''ab humare pass do dataset hai 
ek to wahi purana waala 
aur dusra 
jo abhi upar hai
ab hum log nltk aur dl aur ml ek aadh algo lagayenge
we may also use yoda parser in order to shift noun ahead

'''


In [11]:
import pandas as pd
df_one=pd.read_pickle("/home/rd/Desktop/prj/cleaned_df.pkl")

In [12]:
df_one

Unnamed: 0,link,published,title,text,author,label,clean_title,clean_text
0,https://www.nytimes.com/2019/11/20/business/me...,2019-11-21T04:22:14,Media Workers Call Out Pay Gaps in Crowdsource...,With income inequality a focus of the current ...,"[Marc Tracy, Tiffany Hsu]",0,media workers call pay gaps crowdsourced sprea...,income inequality focus current presidential c...
1,https://www.nytimes.com/2019/11/20/business/gm...,2019-11-21T00:07:24,G.M. Sues Rival Over Bribery Scheme as Union S...,Fiat Chrysler “was able to obtain unique advan...,[Neal E. Boudette],0,gm sues rival bribery scheme union scandal exp...,"fiat chrysler ""was able obtain unique advantag..."
2,https://www.nytimes.com/2019/11/20/business/ho...,2019-11-21T01:58:42,U.S. Bill Supporting Hong Kong Rights Heads to...,A bill compelling the United States to support...,"[David Yaffe-Bellany, Alan Rappeport]",0,us bill supporting hong kong rights heads trum...,bill compelling united states support pro demo...
3,https://www.nytimes.com/2019/11/20/business/tr...,2019-11-21T02:08:37,Trump Floats Tariff Exclusions for Apple at Te...,"Jonathan Gold, a spokesman for Americans for F...",[Ana Swanson],0,trump floats tariff exclusions apple texas man...,jonathan gold spokesman americans free trade l...
4,https://www.nytimes.com/2019/11/21/business/de...,2019-11-21T08:00:10,Henry Paulson Sounds Alarm: U.S.-China Relatio...,The United States and China will eventually se...,[Andrew Ross Sorkin],0,henry paulson sounds alarm us china relations ...,united states china eventually settle differen...
5,https://www.nytimes.com/2019/11/20/business/te...,2019-11-20T22:28:32,Tesla’s Winding Road to Berlin,"GRÜNHEIDE, Germany — The visitors from Palo Al...","[Christopher F. Schuetze, Jack Ewing]",0,teslas winding road berlin,grünheide germany visitors palo ao calif shown...
6,https://www.nytimes.com/2019/11/20/technology/...,2019-11-20T22:22:38,Google Hires Firm Known for Anti-Union Efforts,"In August, the company handed down new “commun...","[Noam Scheiber, Daisuke Wakabayashi]",0,google hires firm known anti union efforts,"august company handed new ""community guideline..."
7,https://www.nytimes.com/2019/11/20/business/bu...,2019-11-20T16:19:21,"One Whopper Jr., Hold the Toy","In the climactic scene of “Toy Story 3,” Woody...",[David Yaffe-Bellany],0,one whopper jr hold toy,"climactic scene ""toy story 3"" woody buzz light..."
8,https://www.nytimes.com/2019/11/21/business/jo...,2019-11-21T10:00:20,"Leading AARP, With No Plans to Retire",How has Washington changed during that time?\n...,[David Gelles],0,leading aarp plans retire,washinon changed time? big difference communic...
9,https://www.nytimes.com/2019/11/21/business/de...,2019-11-21T12:07:55,Trump’s Bizarre Apple Factory Visit,Charles Schwab is reportedly in talks to buy T...,[],0,trumps bizarre apple factory visit,charles schwab reportedly talks buy td ameritr...


In [13]:
df_two=pd.read_json("/home/rd/Desktop/prj/scraped_articles.json")

In [14]:
df_two


Unnamed: 0,newspapers
bbc,{'rss': 'http://feeds.bbci.co.uk/news/rss.xml'...
breitbart,"{'link': 'http://www.breitbart.com/', 'article..."
cnn,"{'link': 'http://edition.cnn.com/', 'articles'..."
foxnews,"{'link': 'http://www.foxnews.com/', 'articles'..."
infowars,"{'link': 'https://www.infowars.com/', 'article..."
nbcnews,"{'link': 'http://www.nbcnews.com/', 'articles'..."
theguardian,"{'rss': 'https://www.theguardian.com/uk/rss', ..."
washingtonpost,{'rss': 'http://feeds.washingtonpost.com/rss/w...


In [None]:
df_