In [1]:
import re
import pandas as pd
import numpy as np
import requests
from lxml import html, etree
import time
import json
import datetime

PROXIES = {
 ## 'http': 'http://VPN4726:@ar.finevpn.org',
 ## 'https': 'https://VPN4726:@ar.finevpn.org',
}


In [14]:
## http://docs.python-guide.org/en/latest/scenarios/scrape/
##

class news_forbes_parser():
    def __init__(self):
        self.link = 'https://www.forbes.com/search/?q=bitcoin'
        self.save_dir = "news_forbes/"
    
    def read_article_text(self,article_href, verbose=False):
        
        text = ""
        has_next = True  ## if need to read next page
        cur_page = 0
        
        while has_next:
            ## read current page text
            cur_page = cur_page + 1
            has_next = False
            try:
                article_page = requests.get(article_href + str(cur_page), proxies=PROXIES)
                article_html = html.fromstring(article_page.content)
            except:
                print ("can't read page ",article_href)
                
            try:
                article_text = article_html.xpath('//article-body-container[@class="article-body fs-article fs-responsive-text"]')[0]
                article_text = etree.tostring(article_text).decode("utf-8") 
            except:
                print ("error reading text from page: ",article_href)
                article_text = ""           
            text = text +" " + article_text
            
            ## has next page
            try:
                pages_nav = article_html.xpath('//footer[@class="article-footer"]')[0]
                pages_nav = article_html.xpath('.//pagination-nav')[0]
                pages_nav = article_html.xpath('.//a[@class="next"]')
                if len(pages_nav)>0: has_next = True
            except:
                ##print ("error reading page_num ",article_href)
                pass

        text = re.sub("(\n|\t)"," ",text)
        ## text = re.sub("&lt;","<",text)
        ## text = re.sub("&gt;",">",text)
        ## stext = re.sub("&quot;",'"',text)
        text = re.sub("<tweet-quotes.*?</tweet-quotes>","",text)    ## examples with twitter
        text = re.sub("<fbs-accordion.*?</fbs-accordion>","",text)  ## text under images
        text = re.sub("<sig-file .*?</sig-file>","",text)  ## end of article
        text = re.sub('<small class="article-photo-credit.*?</small>',"",text)
        
        
        ##if verbose: print (text)
        return text
    
    def clean_text (self,article_text):
        article_text = re.sub("<script>.*?</script>","",article_text)
        article_text = re.sub("<.*?>","",article_text)
        article_text = re.sub("&#8217;","'",article_text)
        article_text = re.sub("&amp;","&",article_text)
        article_text = re.sub("&#[0-9]+;|\n"," ",article_text)
        article_text = re.sub(" {1,}"," ",article_text)

        return article_text

    def parse_page(self,page_num, verbose = True, docs_per_page = 10):
        
        page_link = "https://www.forbes.com/forbesapi/search/all.json"
        data_val = {
        'limit':docs_per_page,
        'query':'bitcoin',
        'retrievedfields':'author,date,description,title,type,uri',
        'sort':'date',
        'start':(page_num-1)*docs_per_page}

        r = requests.get(page_link, params = data_val, proxies = PROXIES)    

        data_dump = json.loads(r.text)
        if 'contentList' not in data_dump.keys():
            print ("error on requesting page ", page_num)

        for i_article in data_dump['contentList']:

            ## parsing date
            ##
            art_href =  i_article['uri']
            
            art_time = i_article['date']
            art_time = str(datetime.datetime.utcfromtimestamp(art_time/1000).strftime('%Y-%m-%dT%H:%M:%S'))
            art_date = art_time[0:10]    

            art_title = i_article['title']
            art_title = bc_parser.clean_text(art_title)

            ### description
            try:
                art_cont = i_article['description']
                art_cont = bc_parser.clean_text(art_cont)
            except:
                print("Description not available",art_href)
                art_cont = ""
            
            ### author
            try:
                art_authour = i_article['author']
            except:
                print ("author not available ", art_href)
                art_authour = 'None'
                
            art_type = i_article['type']


            if verbose: print (art_date,art_time,art_type,art_authour, art_title, art_cont, art_href)
                
            ## read article text
            article_text = self.read_article_text(art_href,verbose=verbose)
            article_text = self.clean_text(article_text)
            
            ## saving article
            fn = re.sub("[^a-zA-Z0-9 ]","",art_title[0:20])
            file_name = self.save_dir + art_date + "_" + 'p_' + '{num:03d}'.format(num=page_num) + "_" + fn + ".txt"
            if verbose: print ('FILE NAME: ',file_name)
            f = open(file_name,'w', encoding = 'utf-8')
            f.write(art_title + "\n")
            f.write(art_time + "\n")
            f.write("<uri>" + art_href + "</uri>\n")
            f.write("<brief>" + art_cont + "</brief>\n")
            f.write("<author>" + art_authour + "</author>\n")
            f.write("<type>" + art_type + "</type>\n")
            f.write(article_text)
            f.close()
            
        print ("page " + str(page_num) + " parsed")



            
bc_parser = news_forbes_parser()

for i in range(350,400):
    bc_parser.parse_page(i,verbose=True, docs_per_page = 10)
    pass

2011-06-28 2011-06-28T20:08:00 blog Peter Cohan Can Bitcoin Survive? Is It Legal?  Bitcoin -- the online currency used to buy Alpaca wool socks and illegal drugs whose value dropped from $17.50 to "pennies" after a June 19 hack into its currency exchanger, Mt.Gox --has gotten plenty of media attention. But unless consumers and merchants can be persuaded that adopting it will make them [...] https://www.forbes.com/sites/petercohan/2011/06/28/can-bitcoin-survive-is-it-legal/
FILE NAME:  news_forbes/2011-06-28_p_350_Can Bitcoin Survive.txt
2011-06-20 2011-06-20T14:01:58 blog Peter Cohan Are Bitcoins Worth Their Weight in Gold?  Currencies are a marvelous invention. Without them, every economic transaction would be a time-consuming barter. But the emergence of bitcoins -- a new so-called virtual currency -- and gold as legal tender make me wonder why people choose to believe in some currencies. Especially since the bitcoins could go away. Until last week, I was [...] https://www.forbes.com

KeyError: 'contentList'

In [5]:

a_href = "https://www.forbes.com/sites/rachelwolfson/2018/03/29/an-explanation-for-the-rise-of-stable-coins-as-a-low-volatility-cryptocurrency/"
a_text = bc_parser.read_article_text(a_href)    
##bc_parser.clean_text(a_text)