In [19]:
%matplotlib inline

In [8]:
# Imports and configuration for the notebook.
import os
# DATADIR is the "ClassPrep" directory located next to the current working
# directory (i.e. inside the working directory's parent).
DATADIR = os.path.join(os.getcwd(),"..", "ClassPrep")
# Sanity check: prints True when the data directory exists.
print(os.path.exists(DATADIR))
import csv
# NOTE(review): defaultdict, gzip, pickle, Image and clear_output are not used
# by any cell visible in this part of the notebook; they may be needed elsewhere.
from collections import defaultdict
import gzip
import pickle
import networkx as nx
from IPython.display import Image, clear_output
import warnings
# Silence every warning from this point on. This runs before the nxdrawing
# import below, so warnings raised while importing it are hidden as well.
warnings.simplefilter('ignore')
# NOTE(review): nxdrawing is not a standard-library or NetworkX module;
# presumably a local helper module for drawing graphs -- confirm it is on the path.
import nxdrawing as nxd

True


### Create a MultiGraph of 'To'/'From' Relationships

In [2]:
import re

# Case-insensitive pattern for one specific e-mail address (presumably the
# notebook author's own, going by the name). The dots are escaped so they
# match literal periods only.
re_me = re.compile(r"""Brian\.Chapman@utah\.edu""", flags=re.IGNORECASE)

### [``MultiGraph``](https://networkx.github.io/documentation/latest/reference/classes.multigraph.html)

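We're going to put the e-mail information into a NetworkX ``MultiDiGraph``, the directed version of ``MultiGraph``. A multigraph allows us to have multiple edges between the same pair of nodes, and making it directed keeps the direction of each message.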

In this graph nodes will be senders and receivers and edges will be particular messages.

In [6]:
# Directed multigraph: every row of the file adds one edge, so repeated
# correspondence between the same two addresses becomes parallel edges.
my_email = nx.MultiDiGraph()

# newline="" is what the csv module documentation asks for when opening a
# file for csv.reader, so that newlines embedded in fields are handled correctly.
with open(os.path.join(DATADIR, "my_emails_2017.txt"), "rt", newline="") as f:
    reader = csv.reader(f, delimiter="\t")

    for row in reader:
        try:
            # Addresses are lower-cased so that differently capitalised
            # spellings of the same address collapse into a single node.
            my_email.add_edge(row[0].lower(), row[1].lower())
        except IndexError as error:
            # A row with fewer than two columns cannot form an edge.
            # Loading stays best-effort: report the line and keep going.
            print("Skipping line %d: %s" % (reader.line_num, error))


## How large is this graph?


In [7]:
# Size of the e-mail graph, displayed as (number of nodes, number of edges).
node_total = my_email.number_of_nodes()
edge_total = my_email.number_of_edges()
node_total, edge_total

(4392, 41817)

## Challenge: To whom did you send the most e-mails?

#### Hint: `neighbors` or `successors`

## Challenge: From whom did you receive the most e-mails?

#### Hint: `predecessors`

### Connected Subgraphs

Graphs can consist of distinct components that are disconnected from each other. [connected_component_subgraphs](https://networkx.github.io/documentation/latest/reference/generated/networkx.algorithms.components.connected.connected_component_subgraphs.html?highlight=connected_component_subgraphs) creates distinct graphs for each connected component and returns them in a list. This is only defined for undirected graphs. (Note: this helper was removed in NetworkX 2.4; on newer versions the same subgraphs can be built with `G.subgraph(c).copy()` for each `c` in `nx.connected_components(G)`.)

We also use the [``sort``](https://docs.python.org/3/library/stdtypes.html#list.sort) method of the list to sort the subgraphs by the number of nodes in each graph.

>*key* specifies a function of one argument that is used to extract a comparison key from each list element (for example, key=str.lower). The key corresponding to each item in the list is calculated once and then used for the entire sorting process. The default value of None means that list items are sorted directly without calculating a separate key value.

* We use a list comprehension to keep the subgraphs that have more than two nodes.
* We use an [anonymous lambda function](https://docs.python.org/3/howto/functional.html#small-functions-and-the-lambda-expression) to do the sorting.

In [5]:
# ``nx.connected_component_subgraphs`` was removed in NetworkX 2.4. Building
# the subgraphs from ``nx.connected_components`` gives the same result and
# works on both older and newer releases.
# Connected components are only defined for undirected graphs, hence the
# conversion of the directed e-mail graph.
undirected_email = my_email.to_undirected()

# Keep only components with more than two nodes. The size test is done on the
# node set first, so small components are never materialised as graphs.
# ``.copy()`` makes each subgraph an independent graph rather than a view.
subgraphs = [
    undirected_email.subgraph(component).copy()
    for component in nx.connected_components(undirected_email)
    if len(component) > 2
]

# Smallest component first, largest last.
subgraphs.sort(key=lambda g: g.number_of_nodes())
print("The number of subgraphs is %d"%len(subgraphs))
print([g.number_of_nodes() for g in subgraphs])

The number of subgraphs is 7
[3, 3, 3, 3, 15, 44, 4409]


In [145]:
# Print every node of the graph ``sg``, one per line.
# NOTE(review): ``sg`` is not assigned in any cell visible in this notebook;
# it must be defined before this cell is run, otherwise a fresh-kernel
# "Run All" stops here with a NameError.
members = sg.nodes()
for member in members:
    print(member)

ccconlin9319@gmail.com
rebekah.hendon@utah.edu
karen.petersen.kunigk@cityacademyslc.org
pemac@intechopen.com
fowlkes@ics.uci.edu
Heidi.Lex@hsc.utah.edu
749197960-fpns.4hrr.wu39.75xn@property.booking.com
Ariel.Stewart@hsc.utah.edu
luz.claudio@mssm.edu
carrie.radmall@nurs.utah.edu
brittany@cs.montana.edu
soyoungan0@gmail.com
Ceola.Miller@hsc.utah.edu
Melanie.Mansuy@hsc.utah.edu
Tania.Velasquez@va.gov
Charlton.Park@hsc.utah.edu
Kathryn.Peterson@hsc.utah.edu
Andrew.Gerald.Smith@hsc.utah.edu
tianxin.yang@hsc.utah.edu
sbarlow@le.utah.gov
Richard.Vance@principalinvestigators.org
Gabriel.Fine@hsc.utah.edu
u0080243@umail.utah.edu
Joe.Breen@utah.edu
Kirsten.Mallik@hsc.utah.edu
Jeremy.Uffens@hsc.utah.edu
auto-message@eventbrite.com
nancy.parker@hsc.utah.edu
morris.maryjane@comcast.net
srey@asu.edu
Emily.Wilson@imail.org
vidnyan.siddamshetty@gmail.com
chris.wasden@business.utah.edu
David.Collingridge@imail.org
WILLIAM.AUFFERMANN@hsc.utah.edu
josh.quinton@icloud.com
Dory.Trimble@hsc.utah.edu
jproko

minjung0310@gmail.com
jacob.smith@hsc.utah.edu
Talmage.Shill@imail.org
Skyler.Jennings@hsc.utah.edu
support@gbhealthwatch.com
melanie.hooten@hsc.utah.edu
acctmgr@chpc.utah.edu
Darrel.Brodke@va.gov
dennis.parker@hsc.utah.edu
younghee.lee@utah.edu
seminars@info.ve10.com
Alex.Engar@utah.edu
steve.alder@utah.edu
Elizabeth.Rabon@hsc.utah.edu
Jeremiah.Alt@hsc.utah.edu
no-reply@dropboxmail.com
pay-request@noreply.utah.edu
alexis.farrer@utah.edu
hberty08@gmail.com
barrett@cs.byu.edu
ucair_inc@lists.hsc.utah.edu
sally.zuspan@hsc.utah.edu
David.Hiti@hci.utah.edu
melissa.l.seipp@aruplab.com
d-klabjan@northwestern.edu
Ali.Ahmed@imail2.org
nicholas.campbell91@yahoo.com
support@pagepress.org
pwrose@ucsd.edu
robert.schlaberg@aruplab.com
Jennifer.Majersik@hsc.utah.edu
jbchapman10@gmail.com
pws@caltech.edu
jeanmarie.mayer@hsc.utah.edu
alexbabumec@gmail.com
lalindra.desilva@utah.edu
Matthew.Stein@hsc.utah.edu
mrock@uhin.org
marta.heilbrun@hsc.utah.edu
jamiewoodcock38@icloud.com
Rong.Xiao@hsc.utah.edu
Lu

j.anderson@hsc.utah.edu
grant.cannon@hsc.utah.edu
civilengineeringjournals@gmail.com
jessica.baker@hci.utah.edu
carms38@hotmail.com
JulieB.Kraemer@hsc.utah.edu
Jim.Turnbull@hsc.utah.edu
keilbeck@genetics.utah.edu
contact@scientificfuture.com
snoww.griff@live.com
benjamin.brooke@hsc.utah.edu
danielle.groat@utah.edu
kjackson@samsi.info
Jen.Taggart@hci.utah.edu
chantal.babcock@hsc.utah.edu
George.Thomsen@imail.org
gabby.g.iorg@gmail.com
david.bull@hsc.utah.edu
lzhang6@hmc.psu.edu
Eugene.Payne@m.cc.utah.edu
Daniel.Denhalter@va.gov
yueqi.wang@utah.edu
yihan.dang@medizin.uni-leipzig.de
mahboobg@usc.edu
mary.beth.scholand@hsc.utah.edu
kristi.smock@aruplab.com
sarah.krstyen@gmail.com
doug.weatherbee@lightbend.com
Citiprogram-noreply@med.miami.edu
tianze.jiao@pharm.utah.edu
aln2dh@virginia.edu
MedicalImaging.submissions@sm-international.co
pickardS@pegus.com
maurae.prince@lightbend.com
craig.teerlink@hsc.utah.edu
tspark@snu.ac.kr
songj@illinois.edu
jwvh1010@gmail.com
register@continue.utah.edu


### With whom are my most frequent e-mail exchanges?

In [12]:
# ``subgraphs`` is sorted by node count, so the last entry is the largest
# connected component.
main_email = subgraphs[-1]
# NOTE(review): ``edges`` and ``mail_count_limit`` are not used in this cell;
# they are kept in case later cells rely on them.
edges = main_email.edges()
edge_count2 = {}
mail_count_limit = 35

# The addresses were lower-cased when the graph was built, so the comparison
# has to be case-insensitive. Testing the mixed-case literal directly never
# matches and leaves ``edge_count2`` empty.
my_address = "Brian.Chapman@utah.edu".lower()

for n in main_email.nodes():
    n_is_me = my_address in n.lower()
    for nn in main_email.neighbors(n):
        if n_is_me or my_address in nn.lower():
            # Sort the pair so (a, b) and (b, a) share one dictionary key.
            key = tuple(sorted((n, nn)))
            # Number of parallel edges (messages) between the two addresses.
            edge_count2[key] = main_email.number_of_edges(n, nn)

    

In [13]:
# (pair, count) entries ordered from the highest count to the lowest.
# ``sorted`` is stable, so ties keep the dictionary's order.
ec = sorted(edge_count2.items(), key=lambda entry: entry[1], reverse=True)

In [14]:
# Header row, padded to the same 40-character columns as the data rows.
header = "%s   %s\t %s" % ("Node1".ljust(40), "Node2".rjust(40), "count".ljust(10))
print(header)
print()

# Show at most the first 100 entries of ``ec``: the two addresses of each pair
# and the number of edges between them.
for (left, right), count in ec[:100]:
    print("%s \u21E8 %s\t% 3d" % (left.ljust(40), right.rjust(40), count))

Node1                                                                         Node2	 count     

Brian.Chapman@utah.edu                   ⇨                   wendy.chapman@utah.edu	 790
Brian.Chapman@utah.edu                   ⇨                John.Roberts@hsc.utah.edu	 580
Brian.Chapman@utah.edu                   ⇨               roberts@ucair.med.utah.edu	 542
Brian.Chapman@utah.edu                   ⇨                  barbara.saffel@utah.edu	 355
Brian.Chapman@utah.edu                   ⇨              Marta.Heilbrun@hsc.utah.edu	 339
Brian.Chapman@utah.edu                   ⇨              Matthew.Samore@hsc.utah.edu	 328
Brian.Chapman@utah.edu                   ⇨                   charlene.weir@utah.edu	 236
Brian.Chapman@utah.edu                   ⇨          cameron.waller@biochem.utah.edu	 200
Brian.Chapman@utah.edu                   ⇨               Karen.Eilbeck@hsc.utah.edu	 146
Brian.Chapman@utah.edu                   ⇨                     mark.keller@utah.edu	 129
Brian.Chapman

## Challenge: Select a random subset of the graph to get a better visualization

#### Hints:

1. Use random.shuffle and slicing
1. Use nx.subgraph

## Challenge: Convert the subgraph to a Graph and Redraw the subgraph