In [1]:
# Dependencies
import re
import pymongo
from bs4 import BeautifulSoup as bs
import pandas as pd
import time

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Get database from Mongo
db = client['FINALP']

In [3]:
def GetLinks(collection_name, site_name):
    
    # Get starting time
    start = time.time()
    
    # Print database name
    print()
    print(collection_name)
    print('---------------------------')
    
    # Get all documents from HTML_pages collection
    documents = db[collection_name].find()
    
    # Count the number of documents
    count = documents.count()

    # Create dictionary to count the number of external links
    # This must match the chord diagram order
    links = {'theatlantic.com':0, 'bbc.com':0, 'motherjones.com':0, 'newrepublic.com':0, 'politico.com':0,
             'slate.com':0, 'thedailybeast.com':0, 'theguardian.com':0, 'theintercept.com':0,
             'theamericanconservative.com':0, 'breitbart.com':0, 'dailywire.com':0, 'economist.com':0,
             'thefiscaltimes.com':0, 'foxnews.com':0, 'nypost.com':0, 'reason.com':0, 'thehill.com':0,
             'washingtontimes.com':0}

    # Counter used to track progress
    i = 1
    
    # Counter used to track documents that aren't HTML
    non_html = 0

    # Iterate through each document
    for document in documents:

        # Get the HTML item in the document
        html = document['html']
        
        # Test if document contains HTML
        if ("<!DOCTYPE HTML" in html[0:50].upper()):
        
            # Print progress
            #print(str(round((i / count)*100, 2))+"%")
            #print(str(i) + "/" + str(count))

            # Convert string to BeautifulSoup object 
            soup = bs(html, 'lxml')

            # Find all anchor tags and loop through each one
            for url in soup.findAll('a'): 

                # Convert url to string
                url = str(url)

                try:
                    # Find where 'www.' and '.com' are in the url string
                    start = url.index("www.") + 4
                    end = url.index(".com") + 4

                    # Splice external link from url
                    ext_link = url[start:end]

                    # Increment dictionary key/value pair
                    links[ext_link] += 1

                # Error handling
                except (KeyError, ValueError):
                    pass
        
        # What to do if document isn't HTML        
        else:
            
            # Increment counter and print out what the non_HTML looks like
            non_html += 1
            #print(str(non_html) + ": " + html[0:25])

        # Increment progress counter
        i += 1
        
    # Print the total number of documents that weren't HTML    
    #print("Non HTML: " + str(non_html))
    
    # Create empty dictionary for scaled external links
    scaled_links = {}
    
    # Create scale variable to standardize number of pages per site
    # For example, if a collection has 5,000 documents, then reduce external links by 20%
    scale = (1000 / count)
    
    # Loop through each key/value pair in the links dictionary
    for key, value in links.items():
        
        # Create a new key/value pair that is "scaled"
        scaled_links[key] = round(value * scale)
    
    # Set own site's count to zero (e.g., {BBC.com : 0})
    scaled_links[site_name] = 0
    
    # Print out the count, scale and links before/after scaling
    print(links)
    print("Count: ", count)
    print("Scale: ", str(round(scale,2)))
    print(scaled_links)
    
    # Intialize blank list
    link_list = []
    
    # Loop through each key/value in the scaled links dictionary
    for key, value in scaled_links.items():
        
        # Append value to list
        link_list.append(value)
        
    print(link_list)
    
    end = time.time()
    print("Minutes: " + str((end - start)/60))

    # Return results
    return(link_list)

In [4]:
start = time.time()

atlantic_links = GetLinks("atlantic", "theatlantic.com")
americanconservative_links = GetLinks("americanconservative", "theamericanconservative.com")
bbc_links = GetLinks("BBC", "bbc.com")
breitbart_links = GetLinks("breitbart", "breitbart.com")
thedailybeast_links = GetLinks("thedailybeast", "thedailybeast.com")
dailywire_links = GetLinks("dailywire", "dailywire.com")
economist_links = GetLinks("economist", "economist.com")
foxnews_links = GetLinks("foxnews", "foxnews.com")
theguardian_links = GetLinks("theguardian", "theguardian.com")
thehill_links = GetLinks("thehill", "thehill.com")
theintercept_links = GetLinks("theintercept", "theintercept.com")
motherjones_links = GetLinks("mjones", "motherjones.com")
newrepublic_links = GetLinks("newrepublic", "newrepublic.com")
nypost_links = GetLinks("nypost", "nypost.com")
politico_links = GetLinks("politico", "politico.com")
reason_links = GetLinks("reason", "reason.com")
slate_links = GetLinks("slate", "slate.com")
fiscaltimes_links = GetLinks("fiscaltimes", "thefiscaltimes.com")
washpost_links = GetLinks("washpost", "washingtonpost.com")       
washtimes_links = GetLinks("washtimes", "washingtontimes.com")

end = time.time()
print("Minutes: " + str((end - start)/60))


atlantic
---------------------------
{'theatlantic.com': 316527, 'bbc.com': 536, 'motherjones.com': 153, 'newrepublic.com': 0, 'politico.com': 1729, 'slate.com': 248, 'thedailybeast.com': 237, 'theguardian.com': 1160, 'theintercept.com': 0, 'theamericanconservative.com': 44, 'breitbart.com': 120, 'dailywire.com': 8, 'economist.com': 133, 'thefiscaltimes.com': 2, 'foxnews.com': 186, 'nypost.com': 0, 'reason.com': 0, 'thehill.com': 0, 'washingtontimes.com': 50}
Count:  9173
Scale:  0.11
{'theatlantic.com': 0, 'bbc.com': 58, 'motherjones.com': 17, 'newrepublic.com': 0, 'politico.com': 188, 'slate.com': 27, 'thedailybeast.com': 26, 'theguardian.com': 126, 'theintercept.com': 0, 'theamericanconservative.com': 5, 'breitbart.com': 13, 'dailywire.com': 1, 'economist.com': 14, 'thefiscaltimes.com': 0, 'foxnews.com': 20, 'nypost.com': 0, 'reason.com': 0, 'thehill.com': 0, 'washingtontimes.com': 5}
[0, 58, 17, 0, 188, 27, 26, 126, 0, 5, 13, 1, 14, 0, 20, 0, 0, 0, 5]

americanconservative
-------

KeyboardInterrupt: 

In [17]:
# Make a list of dictionaries to output as JSON
data_json = [{
    "Atlantic":atlantic_links, "BBC":bbc_links, "MotherJones":mj_links, "NewRepublic":newrepublic_links, 
    "Politico":politico_links, "Slate":slate_links, "DailyBeast":db_links, "Guardian":guardian_links,
    "Intercept":intercept_links, "WashPost":washpost_links, "AmerCons":amercons_links, "Breitbart":breitbart_links, 
    "DailyWire": dw_links, "Economist":economist_links, "FiscalTimes":tft_links, "FoxNews":foxnews_links, 
    "NYPost":nypost_links, "Reason":reason_links, "Hill":hill_links, "WashTimes":washtimes_links
    }]
    
data_json

[{'Atlantic': [0,
   58,
   17,
   0,
   188,
   27,
   26,
   126,
   0,
   5,
   13,
   1,
   14,
   0,
   20,
   0,
   0,
   0,
   5]}]

In [19]:
import json

# Write list of dictionaries to JS file
with open ('chord-data.json', 'w') as outfile:
    json.dump(data_json, outfile)