In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command-line tool
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file
    FILE    : Filename
    
    Returns:
    --------
    None. Prints the tool's output followed by a success or failure message.
    '''
    import os
    import subprocess

    # os.path.join works whether or not PATH ends with a separator
    # (plain PATH+FILE produces a wrong path when the separator is missing)
    file_path = os.path.join(PATH, FILE)

    # An argument list (no shell) keeps names/paths that contain spaces or
    # shell metacharacters from being split or interpreted.
    cmd = ['mongoimport',
           '--db', str(DB_NAME),
           '--collection', str(COL_NAME),
           '--file', file_path,
           '--batchSize', '1']

    try:
        result = subprocess.run(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError as e:
        # e.g. the mongoimport executable could not be started
        print(f'addToDB: could not run mongoimport. Exception: {e}')
        return

    # Output is captured and re-printed so it shows up in the notebook cell
    if result.stdout:
        print(result.stdout, end='')

    # Only report success when the tool actually exited cleanly
    if result.returncode == 0:
        print(f'Collection {COL_NAME} in {DB_NAME} database created')
    else:
        print(f'addToDB: mongoimport exited with code {result.returncode}, '
              f'collection {COL_NAME} was not imported')

In [None]:
def feature_extractor(html):
    '''
    Parse html using newspaper
    
    Parameters:
    --------
    html   : 'string'
    
    Returns:
    --------
    result : dictionary with keys 'date', 'title', 'text', 'authors' and
             'keywords'. 'date' is '' when no publish date could be found.
             When newspaper fails to build the article only {'text': ''}
             is returned.
    
    '''
    import newspaper
    from bs4 import BeautifulSoup as bs
    from datetime import datetime
    
    article = newspaper.Article('')
    article.set_html(html)
    
    # NOTE(review): build() is called on an article with an empty url whose
    # html was injected by hand - confirm it does not try to re-download.
    try:
        article.build()
    except Exception as e:
        print(f'feature_extractor: None features found. Exception: {e}')
        return {'text':''}

    publish_date = article.publish_date

    # parse date manually if it wasn't found by newspaper
    if not publish_date:
        sp = bs(html, 'lxml')
        # 'dek___3AQpw' class appears on 30% of msnbc websites
        try:
            # msnbc.com date
            publish_date = datetime.strptime(
                sp.find('p', class_='dek___3AQpw').span.text,
                '%b.%d.%Y'
            )
        except (AttributeError, TypeError, ValueError):
            # AttributeError: the <p>/<span> is missing (find returned None)
            # TypeError/ValueError: the text is not a '%b.%d.%Y' date
            publish_date = ''

    return {
            'date'    :publish_date,
            'title'   :article.title,
            'text'    :article.text,
            'authors' :article.authors,
            'keywords':article.keywords
    }

In [None]:
def docs_parser(htmlCol, skip=True):
    '''
    Parse mongo docs, extract features and update the doc with the features
    
    Parameters:
    --------
    htmlCol : mongodb collection, has to have documents with 'html' key
    skip    : skip html processing if meta key exists in a record, default "True"
    
    Returns:
    --------
    None. Updates the 'meta' field of the documents in the collection and
    prints the outcome for every document.
    '''
    
    try: 
        for doc in htmlCol.find():
            print(f"{htmlCol.name}:{doc['_id']}:")

            if 'html' not in doc:
                print("does not have html")
            elif 'meta' in doc and skip:
                print('Meta exists, skipping')
            else:
                # extract metadata from html
                meta = feature_extractor(doc['html'])

                if 'meta' in doc and meta == doc['meta']:
                    print("has same meta")
                else:
                    # meta key is missing or the extracted meta differs from
                    # the stored one. doc['_id'] is used as read from the
                    # database, so the filter matches whatever type the id has.
                    htmlCol.update_one(
                        {'_id': doc['_id']},
                        {'$set': {'meta': meta}}
                    )
                    print("saved meta")
            print('----------')
    except Exception as e:
        # best effort: report the failure (with its cause) instead of raising
        print(f"docs_parser: couldn't find docs in collection {htmlCol.name}. Exception: {e}")

In [None]:
def show_doc(db, collection, id):
    '''
    Finds a document by 'id' and prints contents to the console
    
    Parameters:
    --------
    db         : mongodb database object (indexed by collection name)
    collection : name of the mongodb collection (a collection object is
                 accepted as well and used as is)
    id         : mongodb document id
    
    Returns:
    --------
    Prints first 100 symbols of each document's key to console
    '''
    from bson.objectid import ObjectId

    # look the collection up by the name that was passed in
    # (not by the literal string 'collection')
    col = db[collection] if isinstance(collection, str) else collection

    doc = col.find_one({'_id':ObjectId(id)})
    if doc is None:
        # find_one gave back nothing to iterate over
        print(f"show_doc: no document with id {id} found")
        return

    for k in doc:
        print(f"{k} : {str(doc[k])[:100]}")

Some commands to keep the databases clean

In [None]:
# Removes the 'meta' field from every document in the collection.
# mongo shell equivalent: htmlCol.update({}, {$unset: {meta:1}}, false, true);
# update_many is used instead of the deprecated Collection.update(..., multi=True).
# NOTE: htmlCol has to be bound to a collection before running this cell.
htmlCol.update_many({}, {'$unset': {'meta': 1}})

In [None]:
# Build a unique index on the 'url' field of the collection.
# NOTE(review): a unique index presumably rejects duplicate urls rather than
# removing ones that are already stored - confirm before using it as clean-up.

htmlCol.create_index("url", unique=True)

In [None]:
# `find` hands back a cursor over the matching documents; [0] picks the first
# one as a dictionary and ['html'] reads the page source stored in it.
# The projection limits the returned fields to 'html' only.
html = htmlCol.find(
    {'url': 'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},
    projection={'html': True, '_id': False}
)[0]['html']

In [None]:
# print the urls of the first 20 documents whose 'html' does NOT match
# the compiled regular expression
import re
tag = re.compile('dek___3AQpw.')
docs = htmlCol.find({"html": {'$not': tag}})
for d in docs[:20]:
    print(d['url'])

In [None]:
from multiprocessing import Process

# use multiprocessing to extract features
def func():
    '''
    Runs docs_parser over the 'left' and 'right' collections of the 'scrape'
    database. Used as the target of a separate process.
    '''
    # Imported here so the function does not rely on another cell having
    # bound `pm` before this one is run.
    import pymongo as pm

    DB_NAME = 'scrape'
    # The client is created inside the function, i.e. in the process that
    # executes it, and is closed again once the work is done.
    client = pm.MongoClient(host='localhost', port=27017, maxPoolSize=500)
    try:
        db = client[DB_NAME]
        for collection in ['left','right']:
            docs_parser(db[collection])
    finally:
        client.close()

# run the extraction in the background so the notebook stays responsive
proc = Process(target=func)
proc.start()

# **Production code**

In [1]:
# Console logging at DEBUG level (useful while debugging)
import logging
logging.basicConfig(level=logging.DEBUG)

# dependencies
import pymongo as pm

from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
# dir(Source)

# Name of the database to work with
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'

# PyMongo client for the local MongoDB server and a handle to the database
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)
db = client[DB_NAME]

# News sites to scrape, keyed by group name.
# The numbered names in the comments are not part of the lists.
SOURCES = dict(
    left=[
        'https://newrepublic.com',
        'https://www.motherjones.com',
        # 3. Slate
        # 4. The Intercept
        # 5. Daily Beast
        # 6. The Atlantic
        # 7. Washington Post
        # 8. Politico
        # 9. The Guardian
        # 10. BBC
    ],
    right=[
        'https://www.breitbart.com',
        # 2. Fox News
        # 3. New York Post
        # 4. The American Conservative
        # 5. Washington Times
        # 6. Daily Wire
        # 7. The Fiscal Times
        # 8. The Hill
        # 9. The Daily Caller
        # 10. Reason
    ],
)

def saveToDB(db, collection, url, html, meta=None):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value
    
    The document with the given 'url' is updated, or inserted when no such
    document exists yet (upsert).
    
    Parameters:
    --------
    db, collection  : mongo db connection and the name of the collection in it
    url, html, meta : values to store; meta defaults to an empty dictionary
    
    Returns:
    --------
    None
    """
    # A fresh dict per call instead of a mutable default shared by all calls
    if meta is None:
        meta = {}

    db[collection].update_one(
        {'url': url},
        {'$set': {'url': url, 'html': html, 'meta': meta}},
        upsert=True
    )

In [2]:
def scrape(url, db, collection, latest_date=None):
    '''
    Downloads and parses the page at 'url', stores its html in mongoDB and
    then sets, downloads and parses the categories of the source
    
    Parameters:
    --------
    url         : main news website url
    db          : mongo db connection the page is saved to
    collection  : name of the mongodb collection
    latest_date : YYYY-MM-DD; accepted but not used by the current code
    
    Returns:
    --------
    The Source object created for 'url'
    
    '''

    site = Source(url)
    site.download()
    site.parse()

    # only store the page when some html was actually obtained
    if site.html:
        saveToDB(db=db, collection=collection, url=site.url, html=site.html)

    logging.debug(f'{site.url} parsed')

    # category pipeline: discover -> download -> parse
    site.set_categories()
    site.download_categories()
    site.parse_categories()

    return site
    
#     new_rep.download_categories()
#     new_rep.parse_categories()
#     new_rep.set_feeds()
#     new_rep.download_feeds()
#     new_rep.generate_articles()
#     new_rep.download_articles()
#     new_rep.parse_articles()
#     new_rep.print_summary()

In [3]:
# scrape the first 'left' source and store its page in the 'LEFT' collection
collection = 'LEFT'
url = SOURCES['left'][0]
paper = scrape(url=url, db=db, collection=collection)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET / HTTP/1.1" 200 41262
DEBUG:root:https://newrepublic.com parsed
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepubli

In [4]:
[c.url for c in paper.categories]

['https://newrepublic.com/tags/photography',
 'https://newrepublic.com/authors/lauren-oyler',
 'https://newrepublic.com',
 'https://newrepublic.com/tags/books',
 'https://newrepublic.com/pages/people',
 'https://newrepublic.com/authors/josephine-huetlin',
 'https://newrepublic.com/authors/win-mccormack',
 'https://newrepublic.com/authors/alex-shephard',
 'https://newrepublic.com/tags/press-release',
 'https://newrepublic.com/authors/vegas-tenold',
 'https://newrepublic.com/authors/rachel-wetzler',
 'https://newrepublic.com/authors/matt-ford',
 'https://newrepublic.com/authors/emily-atkin',
 'https://newrepublic.com/authors/jennifer-wilson-2',
 'https://newrepublic.com/tags/politics',
 'https://newrepublic.com/pages/rss',
 'https://newrepublic.com/authors/marin-cogan',
 'https://newrepublic.com/authors/anthony-elghossain',
 'https://newrepublic.com/authors/sarah-jones',
 'https://newrepublic.com/authors/bryce-covert',
 'https://newrepublic.com/authors/josephine-livingstone',
 'https://n

In [40]:
paper.parse_categories()

DEBUG:resources.newspaper.newspaper.source:We are extracting from 4 categories


In [None]:
paper.set_categories()

In [None]:
paper.categories


In [None]:
import tempfile
tempfile.gettempdir()

In [24]:
import tldextract

In [33]:
path = '/tags/politics'
path

'/tags/politics'

In [34]:
# split the path on '/' and keep only the non-empty pieces
path_chunks = [chunk for chunk in path.split('/') if chunk]
path_chunks

['tags', 'politics']

In [32]:
# NOTE(review): this looks like a fragment pasted from a method (it reads
# `self.config`) to experiment with its condition - confirm its origin.
# `valid_categories`, `domain`, `p_url` and `self` are not defined in any of
# the cells shown here, so the cell only runs if they were bound elsewhere.
# `domain + path` is kept as a category only when the path consists of exactly
# one chunk that is shorter than 14 characters.
if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
    valid_categories.append(domain + path)
else:
    # rejected path: report it, but only in verbose mode
    if self.config.verbose:
        print(('elim category url %s for >1 path chunks '
               'or size path chunks' % p_url))

ExtractResult(subdomain='', domain='', suffix='')

In [36]:
# The items of paper.categories are read as objects with a `.url` attribute
# (`[c.url for c in paper.categories]`), so the new url is wrapped in a
# Category instead of being appended as a bare string.
from resources.newspaper.newspaper.source import Category
paper.categories.append(Category(url=url+path))

In [43]:
urls = [c.url for c in paper.categories]
urls

['https://newrepublic.com',
 'https://newrepublic.com/minutes',
 'https://newrepublic.com/latest',
 'https://newrepublic.com/magazine']

In [44]:
urls.append(url+path)
urls

['https://newrepublic.com',
 'https://newrepublic.com/minutes',
 'https://newrepublic.com/latest',
 'https://newrepublic.com/magazine',
 'https://newrepublic.com/tags/politics']

In [47]:
from resources.newspaper.newspaper.source import Category

# replace the category list with Category objects built from the url strings
paper.categories = [Category(url=category_url) for category_url in urls]
paper.categories

[<resources.newspaper.newspaper.source.Category at 0x1053c2630>,
 <resources.newspaper.newspaper.source.Category at 0x1053c2390>,
 <resources.newspaper.newspaper.source.Category at 0x1053c22e8>,
 <resources.newspaper.newspaper.source.Category at 0x1053c2c50>,
 <resources.newspaper.newspaper.source.Category at 0x1053c20b8>]

In [48]:
urls = [c.url for c in paper.categories]
urls

['https://newrepublic.com',
 'https://newrepublic.com/minutes',
 'https://newrepublic.com/latest',
 'https://newrepublic.com/magazine',
 'https://newrepublic.com/tags/politics']

In [5]:
# Words kept in a flat list; order and repeated entries are kept as they are
stops = [
    'about', 'help', 'privacy', 'legal',
    'feedback', 'sitemap', 'profile', 'account',
    'mobile', 'sitemap', 'facebook', 'myspace',
    'twitter', 'linkedin', 'bebo', 'friendster',
    'stumbleupon', 'youtube', 'vimeo', 'store',
    'mail', 'preferences', 'maps', 'password',
    'imgur', 'flickr', 'search', 'subscription',
    'itunes', 'siteindex', 'events', 'stop',
    'jobs', 'careers', 'newsletter', 'subscribe',
    'academy', 'shopping', 'purchase', 'site-map',
    'shop', 'donate', 'newsletter', 'product',
    'advert', 'info', 'tickets', 'coupons',
    'forum', 'board', 'archive', 'browse',
    'howto', 'how to', 'faq', 'terms',
    'charts', 'services', 'contact', 'plus',
    'admin', 'login', 'signup', 'register',
    'developer', 'proxy',
]

In [7]:
print(sorted(stops))

['about', 'academy', 'account', 'admin', 'advert', 'archive', 'bebo', 'board', 'browse', 'careers', 'charts', 'contact', 'coupons', 'developer', 'donate', 'events', 'facebook', 'faq', 'feedback', 'flickr', 'forum', 'friendster', 'help', 'how to', 'howto', 'imgur', 'info', 'itunes', 'jobs', 'legal', 'linkedin', 'login', 'mail', 'maps', 'mobile', 'myspace', 'newsletter', 'newsletter', 'password', 'plus', 'preferences', 'privacy', 'product', 'profile', 'proxy', 'purchase', 'register', 'search', 'services', 'shop', 'shopping', 'signup', 'site-map', 'siteindex', 'sitemap', 'sitemap', 'stop', 'store', 'stumbleupon', 'subscribe', 'subscription', 'terms', 'tickets', 'twitter', 'vimeo', 'youtube']
