**02805 Social graphs and interactions**

# Network Construction

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
from __future__ import division # all numbers are float
import gc # garbage collector
import geoplotlib # plot points on tiled maps
from geoplotlib.utils import BoundingBox
import geopy # get geo location according to addresses
from geopy.exc import GeocoderServiceError
from infomap import infomap # python infomap algorithm, needs to be in same directory
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import sys # system operations
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory
# all crawled data and derived network files live in ./companies below the
# current working directory; the folder is created on first use
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files from data crawling
# (only paths are defined here; presumably the pickles are produced by the
# data-crawling notebook -- TODO confirm)
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
merged = directory + '/merged_data.pkl'

# network files
# network_f:     cache of the full network and its centralities
# network_red_f: cache of the reduced network (without the excluded sections)
# gephi_f:       GEXF export for Gephi
# NOTE(review): the file name is spelled 'gehpi', probably meant 'gephi' --
# confirm nothing else reads this path before renaming
network_f = directory + '/network.pkl'
network_red_f = directory + '/reduced_network.pkl'
gephi_f = directory + '/gehpi.gexf'

# specify nltk data dir, otherwise LookupError
# the search path is extended before the corpus import on the next line
nltk.data.path.append(os.getcwd() + '/../nltk_data')
from nltk.corpus import names

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


## 1. Construct the Company Network



In [3]:
# cache of previously computed results for the full network; stays empty when
# no pickle has been written yet, so the cells below recompute everything
net_dat = dict()
# load network data if available
if os.path.isfile(network_f):
    with open(network_f, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code -- only load files
        # that were written by this notebook
        net_dat = pickle.load(f)

In [55]:
def create_graph(com_dat):

    # init directed graph object
    c_graph = nx.DiGraph()
    
    # iterate through company data
    for c_name, comp in com_dat.iteritems():
        
        # don't add non companies
        if 'is_company' not in comp or not comp['is_company']:
            continue
            
        # check the coordinates
        lat = 0.0
        lng = 0.0
        if 'location_gps' in comp:
            if comp['location_gps'][0]:
                lat = comp['location_gps'][0]
            if comp['location_gps'][1]:
                lng = comp['location_gps'][1]
                
        # convert the date to a string
        founded = 'NaN'
        defunct = 'NaN'
        if 'founded' in comp and comp['founded']:
            founded = comp['founded'].strftime('%Y-%m-%d')
        if 'defunct' in comp and comp['defunct']: 
            defunct = comp['defunct'].strftime('%Y-%m-%d')
            
        # number of employees
        num_employees = 0
        if 'num_employees' in comp and comp['num_employees']:
            num_employees = comp['num_employees']
            
        # just first element for lists
        _type = 'NaN'
        if 'num_employees' in comp and comp['num_employees']:
            _type = comp['type'][0]
        industry = 'NaN'
        if 'industry' in comp and comp['industry']:
            industry = comp['industry'][0]

        # create one node per company name (keys of data)
        c_graph.add_node(
            # only attriutes where data interferes
            c_name,
            name=comp['name'], 
            type=_type, 
            industry=comp['industry'][0],
            founded=founded,
            defunct=defunct,
            location_city=comp['location_city'],
            location_country=comp['location_country'],
            latitude=lat,
            longitude=lng,
            num_employees=num_employees,
            parent=comp['parent']
        )

        # show example of related company
        if c_name == 'Apple Inc.':
            print "Link list of", c_name, "company:\n", comp['links']

        # add an edge for the company, but from the original dictionary
        for e in comp['links']:
            if e in com_dat and 'is_company' in com_dat[e] and com_dat[e]['is_company']:
                c_graph.add_edge(c_name, e)
                
    return c_graph

In [56]:
# NOTE(review): this cell carries a later execution count (In [56]) than the
# cells that follow and repeats the construction of `c_graph_reduced` done
# further below; it looks like a leftover from debugging create_graph --
# confirm before removing it
c_graph_reduced = create_graph(merged_companies)

Nordisk Mobiltelefon (Sweden)


AttributeError: 'str' object has no attribute 'strftime'

In [7]:
# order of preference: cached graph, graph rebuilt from cached raw data,
# graph rebuilt from the company data already held in the kernel
if net_dat.get('graph'):
    c_graph = net_dat['graph']
elif net_dat.get('raw_dat'):
    c_graph = create_graph(net_dat['raw_dat'])
else:
    # NOTE(review): `merged_companies` is not assigned in any visible cell, so
    # this branch raises NameError on a fresh kernel -- confirm where it is loaded
    c_graph = create_graph(merged_companies)

NameError: name 'merged_companies' is not defined

In [5]:
# report the size of the full company network
print "There are {0} nodes in the network.".format(len(c_graph.nodes()))
print "There are {0} edges in the network.".format(len(c_graph.edges()))

There are 57590 nodes in the network.
There are 1032557 edges in the network.


## 2. Analysis

#### Weakly Connected Component

A weakly connected component (WCC) is a maximal subgraph of a directed graph such that for every pair of vertices $u$, $v$ in the subgraph there is a path between $u$ and $v$ when the direction of the edges is ignored. In the following, "the WCC" refers to the largest such component of the network.

In [27]:
def get_wcc(G):
    """
    Return the largest weakly connected component of G as a subgraph.

    "Largest" is measured with len() on each component subgraph. Raises
    IndexError when G has no component at all.
    """
    components = list(nx.weakly_connected_component_subgraphs(G))
    components.sort(key=len, reverse=True)
    return components[0]

In [7]:
# take the component from the cache when present, otherwise extract it
if net_dat.get('wcc'):
    c_weak = net_dat['wcc']
else:
    c_weak = get_wcc(c_graph)

In [8]:
# size() is called without a weight, so it reports the number of edges
print 'The size (number of edges) of subgraph with largest weakly connected component is:', c_weak.size()

The size (number of edges) of subgraph with largest weakly connected component is: 1031478


#### Degree Centrality

Historically first and conceptually simplest is degree centrality, which is defined as the number of links incident upon a node (i.e. the number of ties that a node has). The degree can be interpreted in terms of the immediate risk of a node for catching whatever is flowing through the network (such as a virus, or some information). In the case of a directed network (where ties have direction), usually two separate measures of degree centrality are defined, namely in-degree and out-degree.

In [28]:
def deg_cents(G_wcc):
    """
    Compute both degree centralities of a directed graph.

    Returns a tuple (in-degree centrality, out-degree centrality); each
    element is whatever the corresponding networkx function returns for G_wcc.
    """
    return nx.in_degree_centrality(G_wcc), nx.out_degree_centrality(G_wcc)

In [10]:
# use the cached centralities only when both directions are available
if net_dat.get('deg_cent_in') and net_dat.get('deg_cent_out'):
    node_in_degree = net_dat['deg_cent_in']
    node_out_degree = net_dat['deg_cent_out']
else:
    node_in_degree, node_out_degree = deg_cents(c_weak)

In [11]:
# Counter.most_common(10) returns the ten (company, score) pairs with the
# highest centrality, best first
print "10 most central companies according to in-edges degree centrality:"
pprint(Counter(node_in_degree).most_common(10))

print "\n10 most central companies according to out-edges degree centrality:"
pprint(Counter(node_out_degree).most_common(10))

10 most central companies according to in-edges degree centrality:
[(u'Microsoft', 0.028508823049488333),
 (u'Reuters', 0.027040257348656144),
 ('Sony', 0.020489988111610992),
 ('Google', 0.01934777034429707),
 (u'IBM', 0.01806568917690389),
 (u'HSBC', 0.017040024242989345),
 (u'Vodafone', 0.01678360800951071),
 (u'Apple Inc.', 0.01673698687615096),
 (u'Nintendo', 0.01631739667591319),
 (u'Intel', 0.015897806475675423)]

10 most central companies according to out-edges degree centrality:
[('Telia Company', 0.012331289773654396),
 ('Hitachi', 0.012214736940255017),
 (u'Vodafone', 0.012098184106855637),
 ('Sony', 0.01191169957341663),
 ('Panasonic', 0.011398867106459357),
 (u'Toshiba', 0.011375556539779482),
 (u'Kyocera', 0.011352245973099605),
 ('Mitsubishi Electric', 0.010862724072822209),
 ('Comcast', 0.01074617123942283),
 ('Fujitsu', 0.010722860672742954)]


Technology companies have a lead when it comes to most in/out degree centrality.

In [29]:
def eigen_cents(G_wcc):
    """
    Compute eigenvector centrality for both edge directions.

    The first result is computed on G_wcc itself (in-edges), the second on the
    reversed graph (out-edges). Returns the tuple (in-edge centrality,
    out-edge centrality).
    """
    incoming = nx.eigenvector_centrality(G_wcc)
    # reversing the graph turns every out-edge into an in-edge
    reversed_wcc = G_wcc.reverse()
    outgoing = nx.eigenvector_centrality(reversed_wcc)
    return incoming, outgoing

In [13]:
# use the cached centralities only when both directions are available
if net_dat.get('eigen_cent_in') and net_dat.get('eigen_cent_out'):
    node_in_eigen = net_dat['eigen_cent_in']
    node_out_eigen = net_dat['eigen_cent_out']
else:
    node_in_eigen, node_out_eigen = eigen_cents(c_weak)

In [14]:
# ten highest-scoring companies per direction, best first
print "10 most central companies according to in-edges eigenvector centrality:"
pprint(Counter(node_in_eigen).most_common(10))

print "\n10 most central companies according to out-edges eigenvector centrality:"
pprint(Counter(node_out_eigen).most_common(10))

10 most central companies according to in-edges eigenvector centrality:
[('Whole Foods Market', 0.06513803920756327),
 ('Costco', 0.06467179872391232),
 (u'Lidl', 0.0645838642026889),
 (u'Aldi', 0.06457330500187025),
 (u'Albertsons', 0.06457029576072434),
 (u'Ahold Delhaize', 0.06455900783217165),
 (u'Giant-Carlisle', 0.06450263306524448),
 (u'Pavilions (supermarket)', 0.06448307712349564),
 (u'M\xe1s Club', 0.06445324850334076),
 (u'Walmart', 0.06444629182605319)]

10 most central companies according to out-edges eigenvector centrality:
[(u'Albertsons', 0.06500364713613616),
 (u'Delhaize Group', 0.06497264344789812),
 (u'Kroger', 0.06490845641815714),
 (u'SuperValu (United States)', 0.06471331197720417),
 ('Costco', 0.06469132155925905),
 ('Whole Foods Market', 0.06468862077172617),
 (u'Star Market', 0.06437887738082511),
 (u'Acme Markets', 0.0643407200103748),
 (u'Aldi', 0.06432459116870955),
 (u'Stop & Shop', 0.06431557975668435)]


After seeing the result for eigenvector centrality, it seemed odd that only retail and food companies show up. Once we took a closer look at the article of [Delhaize Group](https://en.wikipedia.org/wiki/Delhaize_Group#See_also) we noted that a lot of external links exist:
* Supermarket chains in the United States
* Top 20 companies in Belgium (BEL20 companies of Belgium)
* etc.

Eigenvector centrality is a measure of the influence of a node in a network. It assigns relative scores to all nodes in the network based on the concept that connections to high-scoring nodes contribute more to the score of the node in question than equal connections to low-scoring nodes. Since Delhaize Group has for example Stop & Shop as subsidiary these high eigenvector centrality nodes link to other high eigenvector centrality nodes. 

Thus the decision was made to exclude the sections:
* See also
* References 
* External links

They can be useful like in the case of [Microsoft](https://en.wikipedia.org/wiki/Microsoft#External_links) but for the majority these sections are rather misleading.

In [15]:
# store network data to file
network = {
    'graph': c_graph,
    'raw_dat': pickle_companies,
    'wcc': c_weak,
    'deg_cent_in': node_in_degree,
    'deg_cent_out': node_out_degree,
    'eigen_cent_in': node_in_eigen,
    'eigen_cent_out': node_out_eigen
}

In [16]:
# save the file for fast reprocessing
with open(network_f, 'wb') as f:
    pickle.dump(network, f)

In [17]:
# free up some memory
del network, c_graph, net_dat
gc.collect()

0

### 2.1. Alternative construction

The same strategy as above is used to get the data and construct the network from it. All of the calculations were saved in order to execute the final notebook much faster. There will be fewer links after rebuilding the network because of the skipped sections.

In [11]:
# load network data if available
net_dat_red = dict()
# load network data if available
if os.path.isfile(network_red_f):
    with open(network_red_f, 'rb') as f:
        net_dat_red = pickle.load(f)

In order to rebuild the whole network the wikitext is reparsed while the problematic sections are cut from the string. The current link list will then be intersected with the new list of all links on the page that is cross checked with the list of all links that were returned by the Wikipedia API.

This new list of links will also be populated to the dataframe.

In [53]:
if 'raw_dat' in net_dat_red and net_dat_red['raw_dat']:
    merged_companies = net_dat['raw_dat']
else:
    for c_name, comp in tqdm_notebook(merged_companies.iteritems(), desc='Links'):

        # don't add non companies
        if 'is_company' not in comp or not comp['is_company']:
            continue

        # parse wikitext, remove references
        wiki_raw_cleaned = re.sub(r'<ref.+?</ref>|<ref>.+?</ref>', '', comp['wiki_raw'])
        wiki_code = mwparserfromhell.parse(wiki_raw_cleaned, skip_style_tags=True)
        exclude_sections = [u'==See also==', u'==References==', u'==External links==']
        # cut the raw wiki text at the found section
        for e in exclude_sections:
            if e in wiki_code.filter_headings():
                # take the first part of the wikicode excluding the unwanted section
                wiki_raw_cleaned = wiki_raw_cleaned.split(e)[0]

        wiki_code = mwparserfromhell.parse(wiki_raw_cleaned, skip_style_tags=True)
        # go through each link in wikitext and extract
        links = set()
        for link in wiki_code.filter_wikilinks():
            # [[Template:Infobox company]]
            if re.match(r'\[\[.+:.+\]\]', unicode(link)):
                continue
            # [[T\xe4by]], [[Hertz|MHz]]
            matched_wiki_name = re.match(r'\[\[(.+?)(?:\|.+)?\]\]', unicode(link))
            if matched_wiki_name:
                if matched_wiki_name.group(1) in comp['all_links']:
                    links.add(matched_wiki_name.group(1))

        if c_name == 'Delhaize Group':
            print "Any link in sections without", exclude_sections, ":\n", links, "\n"
            print "All company links, all sections:\n", comp['links'], "\n"
            comp['links'] = comp['links'].intersection(links)
            print "Final company links without unwanted sections:\n", comp['links'], "\n"

        # list of links from specific sections intersected with old link list
        comp['links'] = comp['links'].intersection(links)
        comp['all_links'] = links

In [48]:
# take the reduced graph from the cache when present, otherwise rebuild it
# from the company data with the reduced link lists
if net_dat_red.get('graph'):
    c_graph_reduced = net_dat_red['graph']
else:
    c_graph_reduced = create_graph(merged_companies)

In [25]:
# report the size of the reduced company network
print "There are {0} nodes in the network.".format(len(c_graph_reduced.nodes()))
print "There are {0} edges in the network.".format(len(c_graph_reduced.edges()))

There are 56898 nodes in the network.
There are 150662 edges in the network.


As you can see, the number of links decreased dramatically from over a million to ca. 150 thousand. That is only about 15% of the original links.

#### Weakly Connected Component

The first two steps of creating the WCC and calculating some centralities will be like above. However new centrality measures are introduced and evaluated.

In [22]:
# take the component from the cache when present, otherwise extract it
if net_dat_red.get('wcc'):
    c_weak_red = net_dat_red['wcc']
else:
    c_weak_red = get_wcc(c_graph_reduced)

In [30]:
# size() is called without a weight, so it reports the number of edges
print 'The size (number of edges) of subgraph with largest weakly connected component is:', c_weak_red.size()

The size (number of edges) of subgraph with largest weakly connected component is: 149251


#### Degree Centrality

In [31]:
# NOTE: this rebinds `node_in_degree` / `node_out_degree`, which held the
# values of the full network before; from here on they describe the reduced one
if net_dat_red.get('deg_cent_in') and net_dat_red.get('deg_cent_out'):
    node_in_degree = net_dat_red['deg_cent_in']
    node_out_degree = net_dat_red['deg_cent_out']
else:
    node_in_degree, node_out_degree = deg_cents(c_weak_red)

In [35]:
# ten highest-scoring companies per direction, best first
print "10 most central companies according to in-edges degree centrality:"
pprint(Counter(node_in_degree).most_common(10))

print "\n10 most central companies according to out-edges degree centrality:"
pprint(Counter(node_out_degree).most_common(10))

10 most central companies according to in-edges degree centrality:
[(u'Microsoft', 0.022857725962396998),
 (u'IBM', 0.014439143855710605),
 (u'Google', 0.013265644531748258),
 (u'Apple Inc.', 0.011556417255542233),
 (u'Sony', 0.01089313502895482),
 (u'General Motors', 0.010306385366973647),
 (u'Intel', 0.009464527156305007),
 (u'BBC', 0.008954310058930074),
 (u'General Electric', 0.008903288349192582),
 (u'Hewlett-Packard', 0.008597158090767621)]

10 most central companies according to out-edges degree centrality:
[(u'Sony Interactive Entertainment', 0.0025255746320059186),
 (u'General Motors', 0.0022959769381871984),
 (u'Electronic Arts', 0.0020918900992372255),
 (u'The Blackstone Group', 0.0020663792443684787),
 (u'Participant Media', 0.0020408683894997323),
 (u'Bain Capital', 0.0020153575346309855),
 (u'Vodafone', 0.0019898466797622387),
 (u'Kohlberg Kravis Roberts', 0.0018878032602872522),
 (u'Sega development studios', 0.0018878032602872522),
 (u'Dell', 0.001836781550549759)]


For example Microsoft, Google and Apple still appear in the top 10 of degree centrality after the approach of the network generation was shifted. Now there is also a greater mix with not only technology companies.

At first we also had the Los Angeles Times and Reuters among the top 10 of degree centrality, but that was mainly due to the HTML markup in the reference links, so it is important to remove it. For example, there is a reference in the Ford Motor Company article quoted from the Los Angeles Times.

```
Following international condemnation of [[apartheid]], Ford divested from South Africa in 1988, and sold its stake in Samcor, although it licensed the use of its brand name to the company.<ref>{{cite news |url=http://articles.latimes.com/1987-06-15/business/fi-4242_1_south-africa |title=Ford Discussing Plans to Divest in South Africa : Firm Would Give 24% Stake to Workers, But Maintain a Presence |first1=Ralph |last1=Vartabedian |first2=Michael |last2=Parks |work=[[Los Angeles Times]] |date=June 15, 1987 |access-date=October 14, 2016}}</ref>
```

So you can see a lot can be influenced by the way the data is gained.

#### Eigenvector Centrality

In [33]:
# NOTE: this rebinds `node_in_eigen` / `node_out_eigen`, which held the values
# of the full network before; from here on they describe the reduced one
if net_dat_red.get('eigen_cent_in') and net_dat_red.get('eigen_cent_out'):
    node_in_eigen = net_dat_red['eigen_cent_in']
    node_out_eigen = net_dat_red['eigen_cent_out']
else:
    node_in_eigen, node_out_eigen = eigen_cents(c_weak_red)

In [36]:
# ten highest-scoring companies per direction, best first
print "10 most central companies according to in-edges eigenvector centrality:"
pprint(Counter(node_in_eigen).most_common(10))

print "\n10 most central companies according to out-edges eigenvector centrality:"
pprint(Counter(node_out_eigen).most_common(10))

10 most central companies according to in-edges eigenvector centrality:
[(u'Paramount Pictures', 0.27982289012922407),
 (u'Warner Bros.', 0.2562183935912465),
 (u'Metro-Goldwyn-Mayer', 0.2223053205147436),
 (u'Columbia Pictures', 0.20996826006680364),
 (u'The Walt Disney Company', 0.20632546831510437),
 (u'Universal Studios', 0.2029219038821539),
 (u'20th Century Fox', 0.20199603541820754),
 (u'Time Warner', 0.16383407029790323),
 (u'Viacom', 0.15654828077330127),
 (u'United Artists', 0.14581433181797088)]

10 most central companies according to out-edges eigenvector centrality:
[(u'Paramount Pictures', 0.1435390426673471),
 (u'Warner Bros.', 0.13091206710825676),
 (u'Metro-Goldwyn-Mayer', 0.12999764306368608),
 (u'HBO', 0.127090340059302),
 (u'StudioCanal', 0.1200066451120615),
 (u'Participant Media', 0.1176612428728507),
 (u'Sony Pictures Worldwide Acquisitions', 0.11629898460142575),
 (u'PolyGram Filmed Entertainment', 0.11557376690967804),
 (u'Screen Gems', 0.1147176932869096),
 (u

From retail and food stores the eigenvector centrality changed to entertainment companies. 

TODO: WHY?

Furthermore, there is a high number of technology corporations that generally have a high number 

In [37]:
# inspect the neighbours of the top-ranked company in the reduced graph
node = 'Paramount Pictures'
print "Links of", node, "are:\n", c_graph_reduced.neighbors(node)

Links of Paramount Pictures are:
[u'First National Pictures', u'Universal Studios', u'List of Paramount executives', u'Viacom Productions', u'CBS Home Entertainment', u'Gulf and Western Industries', u'Gary Sanchez Productions', u'Metro-Goldwyn-Mayer', u'Viacom (original)', u'Walt Disney Studios (division)', u'CBS Television Stations', u'Warner Bros.', u'Nordisk Film', u'Wanda Group', u'Di Bonaventura Pictures', u'Paramount Home Media Distribution', u'NBCUniversal', u'DreamWorks', u'Universal Music Group', u'Famous Players', u'Viacom Media Networks', u'Fake Empire Productions', u'CBS Television Distribution', u'Bad Robot Productions', u'Paramount Stations Group', u'Sony/ATV Music Publishing', u'Capital Cities Communications', u'Skydance Media', u'Cedar Fair', u'DreamWorks Animation', u'Image Entertainment', u'Paramount Famous Productions', u'United International Pictures', u'Republic Pictures', u'Spelling Television', u'Cruise/Wagner Productions', u'Famous Players Film Company', u'World

#### Betweenness Centrality

Betweenness is a centrality measure of a node within a graph. Betweenness centrality quantifies the number of times a node acts as a bridge along the shortest path between two other nodes. Compared to eigenvector centrality which accounts for the 'importance' of a node by taking into account the 'importance' of nodes to which it is pointing to (out-edges) or which are pointing at the node (in-edges) it takes a much longer time to compute.

In [29]:
# betweenness is only recomputed when the cache has no value for it
if net_dat_red.get('bet_cent'):
    node_cent = net_dat_red['bet_cent']
else:
    node_cent = nx.betweenness_centrality(c_weak_red)

SyntaxError: invalid syntax (<ipython-input-29-4413a934fd58>, line 1)

In [44]:
# ten companies with the highest betweenness, best first; needs `node_cent`
# from the cell above
print "Top 10 most central company according to betweenness centrality:"
pprint(Counter(node_cent).most_common(10))

Top 10 most central company according to betweenness centrality:


NameError: name 'node_dict' is not defined

#### Closeness Centrality

Closeness centrality is a measure that wasn't introduced in the [course](http://kurser.dtu.dk/course/02805) curriculum but we wanted to know more about it.



For some of the measures below an undirected version of the graph was created.

In [31]:
# take the undirected component from the cache when present, otherwise
# create an undirected version of the reduced component
if net_dat_red.get('wcc_undir'):
    c_weak_red_undir = net_dat_red['wcc_undir']
else:
    c_weak_red_undir = c_weak_red.to_undirected()

#### Communities


In [39]:
# compute the best partition
# `community` is the python-louvain package; the result is used below as a
# mapping whose values are community ids
partition = community.best_partition(c_weak_red_undir)

# report modularity
print "Modularity with Louvain algorithm:"
print community.modularity(partition, c_weak_red_undir)

# report number of communities
# (number of distinct community ids in the partition)
print "Number of communities found by Louvain algorithm:"
print len(set(partition.values()))

Modularity with Louvain algorithm:
0.682525868857
Number of communities found by Louvain algorithm:
74


In [40]:
def findCommunities(G):
    """
    Detect communities in G with the two-level Infomap algorithm.

    Every edge of G is handed to Infomap as a link. After the run each node
    receives a 'community' attribute holding its module index, and the number
    of top modules reported by Infomap is returned.
    """
    wrapper = infomap.Infomap("--two-level")
    for source, target in G.edges_iter():
        wrapper.addLink(source, target)

    wrapper.run()
    tree = wrapper.tree

    # original leaf index (the node id passed to addLink) -> module index
    communities = dict(
        (leaf.originalLeafIndex, leaf.moduleIndex()) for leaf in tree.leafIter()
    )

    nx.set_node_attributes(G, 'community', communities)
    return tree.numTopModules()

In [41]:
# the infomap function only accepts integers as node names
# the relabelled graph is a separate object; findCommunities annotates that
# one with the 'community' attribute
c_weak_red_infomap = nx.convert_node_labels_to_integers(c_weak_red_undir)
print "Number of communities found by Infomap algorithm:", findCommunities(c_weak_red_infomap)

Number of communities found by Infomap algorithm: 2079


In [43]:
# store network data to file
network_red = {
    'graph': c_graph_reduced,
    'raw_dat': merged_companies,
    'wcc': c_weak_red,
    'deg_cent_in': node_in_degree,
    'deg_cent_out': node_out_degree,
    'eigen_cent_in': node_in_eigen,
    'eigen_cent_out': node_out_eigen,
    #'bet_cent': node_cent,
    'wcc_undir': c_weak_red_undir
}

In [44]:
# save the file for fast reprocessing
with open(network_red_f, 'wb') as f:
    pickle.dump(network_red, f)

In [45]:
# produces a file that can be imported into the Gephi network visualization tool
nx.write_gexf(c_graph_reduced, gephi_f)

KeyError: <type 'datetime.datetime'>

## What if?

What if Google disappears, and with it all nodes that are directly connected to Google?