In [1]:
import random
import re
from urllib.parse import quote

import requests
from watsongraph.conceptmodel import ConceptModel

In [2]:
# When we create a new ConceptModel we pass a list of concepts to be initialized.
# We can use the concepts() method to check which concepts we have inserted so far.
ibm = ConceptModel(['IBM'])
ibm.concepts()

['IBM']

In [3]:
# All of the graph-expansion methods in this library pass calls to the IBM Watson API.
# To keep the information firehose at a manageable level the API defines a numerical "limit" and an article-view "level".
# 0 is the highest level, corresponding with the most important articles. Levels range from 0 to 5.
# The limit on the other hand is theoretically infinite. Results are returned in order of importance!
# Let's take our graph (really only a single node) and call explode() on it.
# Explode augments every node in the graph with a new query. How many results do we get from a level 0 query?
ibm.explode(limit=2000, level=0)
len(ibm.concepts())

37

In [4]:
# Let's list the relations in order of relevance.
# Each edge comes back as a (correlation, concept, concept) tuple.
ibm.edges()

[(0.89564085, 'IBM', 'Digital Equipment Corporation'),
 (0.8213564, 'IBM', 'X86'),
 (0.8081631, 'IBM', 'Fortran'),
 (0.80571836, 'IBM', 'Solaris (operating system)'),
 (0.803906, 'IBM', 'SQL'),
 (0.79933375, 'Supercomputer', 'IBM'),
 (0.79717726, 'IBM', 'X86-64'),
 (0.79349726, 'Advanced Micro Devices', 'IBM'),
 (0.780642, 'IBM', 'Sun Microsystems'),
 (0.7744718, 'IBM', 'Oracle Corporation'),
 (0.7431917, 'IBM', 'Operating system'),
 (0.7338766, 'IBM', 'Microprocessor'),
 (0.7300315, 'IBM', 'Unix'),
 (0.6878544, 'IBM', 'Compiler'),
 (0.6814177, 'IBM', 'Cloud computing'),
 (0.65513045, 'IBM', 'Berkeley Software Distribution'),
 (0.6541496, 'IBM', 'Intel'),
 (0.6436416, 'IBM', 'ARM architecture'),
 (0.62924486, 'Server (computing)', 'IBM'),
 (0.6270959, 'IBM', 'Hewlett-Packard'),
 (0.62568736, 'IBM', 'FreeBSD'),
 (0.6176665, 'IBM', 'Central processing unit'),
 (0.60758543, 'IBM', 'X Window System'),
 (0.60449404, 'Java (programming language)', 'IBM'),
 (0.59871125, 'IBM', 'MySQL'),
 (0.5

In [8]:
# The edges() object method returns (correlation, concept, concept) tuples.
# Let's keep only the edges whose correlation clears a certain cutoff---say, 0.65.
# A list comprehension (rather than filter()) keeps relevant_edges reusable:
# a filter object is a one-shot iterator and would be empty after being displayed once.
relevant_edges = [edge for edge in ibm.edges() if edge[0] > 0.65]
relevant_edges

[(0.89564085, 'IBM', 'Digital Equipment Corporation'),
 (0.8213564, 'IBM', 'X86'),
 (0.8081631, 'IBM', 'Fortran'),
 (0.80571836, 'IBM', 'Solaris (operating system)'),
 (0.803906, 'IBM', 'SQL'),
 (0.79933375, 'Supercomputer', 'IBM'),
 (0.79717726, 'IBM', 'X86-64'),
 (0.79349726, 'Advanced Micro Devices', 'IBM'),
 (0.780642, 'IBM', 'Sun Microsystems'),
 (0.7744718, 'IBM', 'Oracle Corporation'),
 (0.7431917, 'IBM', 'Operating system'),
 (0.7338766, 'IBM', 'Microprocessor'),
 (0.7300315, 'IBM', 'Unix'),
 (0.6878544, 'IBM', 'Compiler'),
 (0.6814177, 'IBM', 'Cloud computing'),
 (0.65513045, 'IBM', 'Berkeley Software Distribution'),
 (0.6541496, 'IBM', 'Intel')]

In [2]:
# Let's try playing with levels a bit.
# Same limit as before, but this time with level=1 instead of level=0.
microsoft = ConceptModel(['Microsoft'])
microsoft.explode(limit=2000, level=1)
len(microsoft.concepts())

555

In [11]:
# Now the other extreme: level=5, again with a limit of 2000.
apple = ConceptModel(['Apple Inc.'])
apple.explode(limit=2000, level=5)
len(apple.concepts())

2000

In [12]:
# We maxed out our allowance! Let's check out the top 20 edges to see what we got.
apple.edges()[:20]

[(0.99995303, 'Apple Inc.', 'Apple Corporation'),
 (0.999906, 'Apple Computers Incorporated', 'Apple Inc.'),
 (0.99985904, "Apple's", 'Apple Inc.'),
 (0.99981207, 'Apple Inc.', 'Apple Inc. Slogans'),
 (0.9997651, 'Apple Inc.', 'Spruce Text List'),
 (0.99971807, 'Apple, inc.', 'Apple Inc.'),
 (0.9996711, 'Apple Inc.', 'Big fruit'),
 (0.9995853, 'Apple Inc.', 'FireWave'),
 (0.9995427, 'AppleShare IP Migration', 'Apple Inc.'),
 (0.99950033, 'Apple Inc.', 'QTSS Publisher'),
 (0.9994586, 'Apple Inc.', 'Nightmare 6'),
 (0.99941695, 'Apple Inc.', 'Machspeed'),
 (0.9993754, 'Apple Inc.', 'Apple Pugetsound Program Library Exchange'),
 (0.9993339, 'The Sheffield Institute for the Recording Arts', 'Apple Inc.'),
 (0.99929255, 'Nidomain', 'Apple Inc.'),
 (0.9992513, '/// Cheers!', 'Apple Inc.'),
 (0.99921036, 'Val Golding', 'Apple Inc.'),
 (0.9991697, 'Apple Inc.', 'William Martens'),
 (0.9991291, 'Apple Inc.', 'Cards (iOS)'),
 (0.9990886, 'Power Macintosh (second generation)', 'Apple Inc.')]

In [13]:
# When you drill this deep some of the results stop making much sense.
# Apple Corporation, the top result, is just a redirect to Apple Inc.; so are several others.
# The Sheffield Institute, on the other hand, is a minimally relevant article dominated by "Apple certification".
# Compare this result to the much better Microsoft result:
microsoft.edges()[:20]

[(0.996133, 'Microsoft', 'Windows Live'),
 (0.9779484, '.NET Framework version history', 'Microsoft'),
 (0.97519916, 'Microsoft', 'Microsoft Office 2007'),
 (0.97378683, 'Windows Live Messenger', 'Microsoft'),
 (0.9682485, 'Microsoft', 'List of Microsoft Windows components'),
 (0.96686125, 'Windows Server 2008', 'Microsoft'),
 (0.9649106, 'Microsoft', 'Windows Presentation Foundation'),
 (0.96461135, 'Microsoft Exchange Server', 'Microsoft'),
 (0.96434224, 'Microsoft', 'Windows NT 4.0'),
 (0.9642824, 'Microsoft', 'Microsoft Visual Studio'),
 (0.9629393, 'Hotmail', 'Microsoft'),
 (0.9623143, 'Windows Server 2003', 'Microsoft'),
 (0.9609486, 'Features new to Windows Vista', 'Microsoft'),
 (0.9600303, 'Microsoft SharePoint', 'Microsoft'),
 (0.95917296, 'Windows 8', 'Microsoft'),
 (0.9589959, 'Active Directory', 'Microsoft'),
 (0.957729, 'Microsoft', 'Comparison of Microsoft Windows versions'),
 (0.957582, 'List of features removed in Windows Vista', 'Microsoft'),
 (0.9536003, 'Microsoft',

In [20]:
# How many of these are Windows products?
# Build a real list (not a one-shot filter iterator) so windows_products
# still holds the matches after we have counted them.
windows_products = [concept for concept in microsoft.concepts() if 'Windows' in concept]
len(windows_products)

35

In [38]:
# Remember that every concept in the graph is backed by a Wikipedia page.
# We can crawl these pages to generate all sorts of interesting results.
# For instance, let's filter our results down to relationships with other companies.
# We do this by searching for "companies" in the articles' categorization.
# WARNING: This requires requesting pages on Wikipedia 555 times, which takes about two minutes to process.
def categories_snapshot(concept, timeout=30):
    """Return the HTML of a concept's Wikipedia page from its category box onwards.

    Args:
        concept: Article title, e.g. 'Sun Microsystems'.
        timeout: Seconds to wait for Wikipedia before requests gives up.

    Returns:
        The page text starting at the ``<div id="catlinks" ...>`` element, or
        an empty string when the page has no such element.
    """
    # Percent-encode the title: characters such as '?' or '#' would otherwise
    # be parsed as a query string or fragment and fetch the wrong page.
    title = quote(concept.replace(' ', '_'))
    dat = requests.get('https://en.wikipedia.org/wiki/' + title, timeout=timeout).text
    # Accept either quoting style for the id attribute.
    # NOTE(review): newer MediaWiki output appears to use double quotes -- confirm.
    marker = re.search(r'<div id=["\']catlinks["\']', dat)
    # A missing marker must yield '' rather than a slice from index -1
    # (which is what str.find() would have produced).
    return dat[marker.start():] if marker else ''

def select(t, root='Microsoft'):
    """Return the concept at the other end of an edge from ``root``.

    Args:
        t: An edge tuple of the form (correlation, concept, concept).
        root: The concept whose partner we want; defaults to 'Microsoft'.

    Returns:
        ``t[2]`` when ``t[1]`` is ``root``, otherwise ``t[1]``.
    """
    return t[2] if t[1] == root else t[1]

# Pair every neighbour of Microsoft with the correlation of its edge...
top_555 = [(select(e), e[0]) for e in microsoft.edges()]
# ...then keep the pairs whose Wikipedia category box mentions "companies".
top_companies = [
    pair
    for pair in top_555
    if 'companies' in categories_snapshot(pair[0])
]
top_companies

[('Borland', 0.8774125),
 ('Citrix Systems', 0.8654945),
 ('Novell', 0.85103315),
 ('VMware', 0.8332759),
 ('Sybase', 0.80411357),
 ('Netscape', 0.7864997),
 ('SUSE Linux distributions', 0.7637935),
 ('Lotus Software', 0.7480776),
 ('Symantec', 0.7341511),
 ('Corel', 0.7205867),
 ('Palm, Inc.', 0.7023346),
 ('SCO Group', 0.6792166),
 ('List of corporations by market capitalization', 0.6557007),
 ('Macromedia', 0.62072474),
 ('Bungie', 0.61561286),
 ('CA Technologies', 0.6048778),
 ('McAfee', 0.60348326),
 ('Apache Software Foundation', 0.6001845),
 ('Outline of Apple Inc.', 0.6001148),
 ('Red Hat', 0.5986523),
 ('Oracle Corporation', 0.57815367),
 ('Sun Microsystems', 0.5575713),
 ('Microsoft Studios (game studio)', 0.55289716),
 ('Compaq', 0.55195856),
 ('Salesforce.com', 0.55179834),
 ('Boxee', 0.53604156),
 ('Intuit', 0.5294857),
 ('Adobe Systems', 0.5262806)]

In [4]:
# Not bad!
# So far we've stuck to ConceptModels constructed around a single particular node.
# Let's explore the other two exploration methodologies.
# Although we started with explode(), augment() is actually the simplest of the three.
# It simply takes the concept you give it, runs it by the API, and merges the resulting graph into the existing one.
# That concept need not already be present in the graph!
# Note that both of these methods have the same level and limit parameters as explode().
ibm.augment('IBM Watson')
len(ibm.concepts())

53

In [5]:
# We started at 37 nodes; now we've gained 16 new ones, for 53 in total.
# Which of these nodes are directly connected to the IBM node?
# We can figure this out using the neighborhood() method, which lists everything directly connected to the chosen concept.
ibm.neighborhood('IBM')

[('Advanced Micro Devices', 0.79349726),
 ('Application programming interface', 0.58770674),
 ('Server (computing)', 0.62924486),
 ('Supercomputer', 0.79933375),
 ('Java (programming language)', 0.60449404),
 ('Solaris (operating system)', 0.80571836),
 ('Berkeley Software Distribution', 0.65513045),
 ('Unix', 0.7300315),
 ('MySQL', 0.59871125),
 ('SQL', 0.803906),
 ('Intel', 0.6541496),
 ('.NET Framework', 0.57220286),
 ('X Window System', 0.60758543),
 ('IBM Watson', 0.91326),
 ('Operating system', 0.7431917),
 ('Object-oriented programming', 0.5565874),
 ('X86', 0.8213564),
 ('FreeBSD', 0.62568736),
 ('C (programming language)', 0.5316219),
 ('Unix-like', 0.56133187),
 ('Hard disk drive', 0.5598469),
 ('ARM architecture', 0.6436416),
 ('Graphical user interface', 0.50487167),
 ('Central processing unit', 0.6176665),
 ('Microprocessor', 0.7338766),
 ('Compiler', 0.6878544),
 ('Linux', 0.5351206),
 ('Cloud computing', 0.6814177),
 ('Programming language', 0.5241717),
 ('Hewlett-Packar

In [13]:
# Which concepts are neighbours of 'IBM Watson' but not of 'IBM' itself?
ibm_n = [t[0] for t in ibm.neighborhood('IBM')]
watson_n = [t[0] for t in ibm.neighborhood('IBM Watson')]
# Do the membership tests against a set instead of rescanning both lists for every concept.
watson_only = set(watson_n) - set(ibm_n)
# Iterate over concepts() so the result keeps the model's own ordering.
watsonian_only_club = [c for c in ibm.concepts() if c in watson_only]
watsonian_only_club

['Artificial intelligence',
 'Association for Computing Machinery',
 'Carnegie Mellon University',
 'Cognition',
 'Consciousness',
 'Game show',
 'Index of robotics articles',
 'Institute of Electrical and Electronics Engineers',
 "List of minor The Hitchhiker's Guide to the Galaxy characters",
 'Metadata',
 'Rensselaer Polytechnic Institute',
 'Semantics',
 'University of Massachusetts Amherst',
 'Web search engine',
 'Wikipedia']

In [18]:
# Let's go back to our IBM graph and use another new command, expand().
# Expand works by expanding every concept with fewer than n edges, where n is an optional parameter defaulting to 1.
# So e.g. the following obviously does nothing, since no concept can have fewer than 0 edges:
ibm.expand(n=0)

In [19]:
# Using the default argument (n=1), on the other hand...
ibm.expand()

In [20]:
# A lot of new stuff!
# Augment can net us more results much more quickly, but it only works on a single node.
# Expand, on the other hand, works on as many as you let it.
# And explode works on *ALL* of them.
len(ibm.concepts())

255

In [22]:
# Of course at this point we are expanding concepts like "Wikipedia" and "Game show".
# So our results start to get pretty far away from where we started!
# Here are the first 20 edges:
ibm.edges()[:20]

[(0.9824447,
  "List of minor The Hitchhiker's Guide to the Galaxy characters",
  "Places in The Hitchhiker's Guide to the Galaxy"),
 (0.9795031, 'Advanced Micro Devices', 'X86-64'),
 (0.9777767, 'C Sharp (programming language)', '.NET Framework'),
 (0.97479653, 'MySQL', 'PHP'),
 (0.9732626, 'Troy, New York', 'Rensselaer Polytechnic Institute'),
 (0.9697869, 'X86', 'X86-64'),
 (0.96823686, 'Berkeley Software Distribution', 'FreeBSD'),
 (0.9647193, 'Sun Microsystems', 'Solaris (operating system)'),
 (0.9644456, 'Semantics', 'List of linguists'),
 (0.9643342, 'MySQL', 'SQL'),
 (0.9635352, 'Graphical user interface', 'X Window System'),
 (0.9593459, 'Advanced Micro Devices', 'X86'),
 (0.95637155, 'Semantics', 'Syntax'),
 (0.95279807, 'Object-oriented programming', 'Ruby (programming language)'),
 (0.95201236, 'Robotics', 'Index of robotics articles'),
 (0.95163786, 'Berkeley Software Distribution', 'Unix'),
 (0.94865924, 'Oracle Corporation', 'SQL'),
 (0.9457252, 'FreeBSD', 'Solaris (oper

In [12]:
# "Six degrees of Wikipedia" is a popular and somewhat research-supported idea that all Wikipedia articles are within six
# hops of one another, at most.
# To illustrate how far Watson's cognitive graph can wander, on the other hand, let's take a random 3-step walk. Fun!
# Try re-running this code block yourself. Where do you end up?
def jump(concept):
    """Take one random hop away from ``concept``.

    Builds a fresh ConceptModel seeded with ``concept``, explodes it with the
    default arguments, and returns one of the resulting concepts chosen
    uniformly at random.

    Args:
        concept: Label of the concept to start from.

    Returns:
        A randomly chosen entry of the exploded model's concepts().

    Raises:
        IndexError: From random.choice if concepts() comes back empty.
    """
    model = ConceptModel([concept])
    model.explode()
    # Fetch the concept list once and let the standard library do the pick.
    return random.choice(model.concepts())

# Chain three jumps, starting from 'IBM'.
jump(jump(jump('IBM')))

'Windows 7'

In [14]:
# Repeat the three-jump walk from 'IBM' five times and collect where each one lands.
kinda_random = [jump(jump(jump('IBM'))) for _ in range(5)]
kinda_random

['Windows 7',
 'X Window System',
 'Sun Microsystems',
 'Cross-platform',
 'X Window System']

In [15]:
# Daily page views are a bit of sugar built into the watson-graph library.
# This data is based on a 30-day average, and is generated by a call against the appropriate Wikipedia API.
# Note that because they add significant overhead you have to populate them yourself with set_view_counts()!
# (Repeated labels in kinda_random collapse: the five walk results above yield four concepts below.)
kr = ConceptModel(kinda_random)
kr.set_view_counts()
kr.concepts_by_view_count()

[(3286, 'Windows 7'),
 (1071, 'Sun Microsystems'),
 (910, 'X Window System'),
 (763, 'Cross-platform')]

In [4]:
# There are two ways of removing concepts from a graph.
# The simple way is to just remove one by label, with remove().
microsoft.add('Microsoft')
microsoft.remove('Microsoft')
len(microsoft.concepts())

554

In [8]:
# The more complex but "funner" way is with abridge(), which is basically the opposite of augment() used above.
# Remember how we defined the microsoft model? (By exploding 'Microsoft' with this same limit and level.)
# The following result shouldn't come as a surprise!
microsoft.abridge('Microsoft', limit=2000, level=1)
len(microsoft.concepts())

0

In [16]:
# Just one last thing!
# This library provides default bindings for saving to and loading from JSON.
# These are in the form of the complementary to_json() and from_json() object methods.
kr.to_json()

{'directed': False,
 'graph': {'name': '()'},
 'links': [],
 'multigraph': False,
 'nodes': [{'id': 'Sun Microsystems', 'relevance': 0.0, 'view_count': 1071},
  {'id': 'Cross-platform', 'relevance': 0.0, 'view_count': 763},
  {'id': 'X Window System', 'relevance': 0.0, 'view_count': 910},
  {'id': 'Windows 7', 'relevance': 0.0, 'view_count': 3286}]}