# Building the Knowledge Graph and Performing Community Detection

In [None]:
#Uncomment to install workbook requirements
#!pip install -r requirements.txt
#!pip install en_core_sci_sm-0.5.1.tar.gz

In [None]:
import igraph
import csv
import scispacy
import spacy

## Designing a knowledge graph schema

In [None]:

# Read every CSV row into memory. Later cells treat row[0] as the abstract ID
# and row[1] as the abstract text.
# newline='' hands newline handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader; without it,
# newlines embedded in quoted fields may be misread.
with open('./data/20k_abstracts_clean.csv', 'r', newline='') as c:
    data = list(csv.reader(c))


In [None]:
# Load the spaCy pipeline named "en_core_sci_sm". The package must already be
# installed (see the commented-out pip commands in the first cell).
nlp = spacy.load("en_core_sci_sm")

In [None]:
# Sanity check: run the pipeline on the text column of the first row and
# print the entity spans it finds.
text = data[0][1]
doc = nlp(text)
print(list(doc.ents))

In [None]:
# Extract the entity spans of every abstract as [abstract_id, spans] pairs.
# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per row, and yields the Docs in input order, so zip() re-pairs each Doc with
# the row it came from.
abstract_entities = [
    [row[0], doc.ents]
    for row, doc in zip(data, nlp.pipe(row[1] for row in data))
]

In [None]:
# Replace each entity span with its lower-cased string form, keeping the
# [abstract_id, entities] pairing.
abstract_entities = [
    [abstract_id, [str(span).lower() for span in spans]]
    for abstract_id, spans in abstract_entities
]
print(abstract_entities[:5])

In [None]:
# Per-abstract entity lists with the abstract IDs dropped.
all_entities = [ents for _, ents in abstract_entities]

In [None]:
import itertools
# Flatten the per-abstract entity lists into one stream of entity strings.
# NOTE: chain.from_iterable returns a lazy, single-use iterator; after one
# full pass over it (e.g. by Counter) it yields nothing more, so this cell
# must be re-run before `entities` can be consumed again.
entities = itertools.chain.from_iterable(all_entities)

In [None]:
from collections import Counter

# Count how often each entity occurs across all abstracts, most frequent first
# (ties keep first-seen order).
# The flattened stream is rebuilt from all_entities here instead of reading
# the one-shot `entities` iterator, so re-running this cell no longer yields
# an empty count.
entity_freq = dict(
    Counter(itertools.chain.from_iterable(all_entities)).most_common()
)
print(entity_freq)

In [None]:
# Entities seen more than 100 times, shown against the number of distinct
# entities overall.
high_freq = {
    ent: count for ent, count in entity_freq.items() if count > 100
}
print(len(high_freq))
print(len(entity_freq))

In [None]:
# Entities that occur exactly once in the whole corpus.
low_freq = {
    ent: count for ent, count in entity_freq.items() if count == 1
}
print(len(low_freq))

In [None]:
# Terms to discard: those seen only once and those seen more than 100 times.
removed_terms = [
    ent for ent, count in entity_freq.items() if count == 1 or count > 100
]

In [None]:
# Drop the removed terms from every abstract's entity list.
# removed_terms is a list, so `in` on it scans linearly for every entity;
# a set built once makes each membership test constant time.
removed_lookup = set(removed_terms)
abstract_entities = [
    [abstract_id, [ent for ent in ents if ent not in removed_lookup]]
    for abstract_id, ents in abstract_entities
]
print(abstract_entities[0])

## Constructing the knowledge graph

In [None]:
terms = [ents for _, ents in abstract_entities]  # entity lists only, no abstract IDs
# Distinct terms in sorted order. Iteration order of a set of strings can
# differ between interpreter runs, and these positions become the term vertex
# IDs, so sorting keeps the IDs reproducible from run to run.
unique_terms = sorted(set(itertools.chain.from_iterable(terms)))

In [None]:
# Number the terms consecutively starting at len(data), so term IDs come
# after the abstract IDs (assumes abstract IDs are 0..len(data)-1 -- TODO
# confirm against the CSV).
term_ids = dict(zip(unique_terms, itertools.count(len(data))))

In [None]:
# Spot-check the ID given to the term 'ige'; raises KeyError if that term is
# not among the kept terms.
print(term_ids['ige'])

In [None]:
# Build one [term_id, abstract_id, frequency] triple per distinct term in each
# abstract, where frequency is how many times the term occurs in that
# abstract. The per-abstract list gets its own name so the module-level
# `terms` list is not overwritten by the iteration.
edgelist = [
    [int(term_ids[term]), int(abstract_id), freq]
    for abstract_id, terms_in_abstract in abstract_entities
    for term, freq in Counter(terms_in_abstract).items()
]

print(edgelist[:10])
# Sanity check on known data: 'ige' is expected four times in abstract 0.
assert [term_ids['ige'], 0, 4] in edgelist


In [None]:
# Start from an empty directed graph; vertices and edges are added below.
g = igraph.Graph(directed=True)

In [None]:
# One vertex per abstract plus one per kept term. The attribute lists set in
# the following cells put abstracts first and terms after them.
g.add_vertices(len(term_ids) + len(data))

In [None]:
# Vertex 'text' attribute: abstract texts first, then the terms in term_ids
# insertion order (the same order in which their IDs were assigned).
text = [row[1] for row in data] + list(term_ids)
g.vs['text'] = text

In [None]:
# Spot-check: the vertex at the ID of 'ige' should carry the text 'ige'.
print(g.vs[term_ids['ige']]['text'])

In [None]:
# Vertex 'type' attribute, laid out like 'text': abstracts first, then terms.
types = ['abstract'] * len(data) + ['term'] * len(term_ids)
g.vs['type'] = types

In [None]:
# Spot-check: the vertex at the ID of 'ige' should be typed 'term'.
print(g.vs[term_ids['ige']]['type'])

In [None]:
# Split each [term_id, abstract_id, frequency] triple into its endpoint pair
# and its frequency, keeping both lists in edgelist order.
edges = [triple[:2] for triple in edgelist]
frequencies = [triple[2] for triple in edgelist]


In [None]:
# Add the term -> abstract edges, then attach the per-edge frequency.
# `frequencies` was built in the same order as `edges`, so the values line up
# (the graph had no edges before this call).
g.add_edges(edges)
g.es['frequency'] = frequencies

# Knowledge graph analysis and community detection

## Examining knowledge graph structure

In [None]:
# Number of vertices, then number of edges.
print(len(g.vs))
print(len(g.es))

In [None]:
# Weakly connected components, i.e. edge direction is ignored.
# NOTE(review): newer python-igraph releases appear to prefer
# Graph.connected_components() over clusters() -- confirm against the pinned
# version before changing.
connected_components = g.clusters(mode='weak')
print(connected_components)

In [None]:
# Split the vertices by their 'type' attribute.
abstract_nodes = g.vs.select(type_eq='abstract')
term_nodes = g.vs.select(type_eq='term')

In [None]:
# Degree of every abstract vertex and of every term vertex, as two lists.
abstract_degree = g.degree(abstract_nodes)
term_degree = g.degree(term_nodes)

In [None]:
import matplotlib.pyplot as plt
# Histogram of abstract-vertex degrees, saved under charts/.
# The charts/ directory must already exist; savefig does not create it.
plt.hist(abstract_degree, bins=20, edgecolor='black')
plt.xlabel('Abstract Node Degree')
plt.ylabel('Frequency')
plt.savefig('charts/abstract_degree.jpg')

In [None]:
# Histogram of term-vertex degrees, saved under charts/ (which must exist).
plt.hist(term_degree, bins=20, edgecolor='black')
plt.xlabel('Term Node Degree')
plt.ylabel('Frequency')
plt.savefig('charts/term_node_degree.jpg')

## Identifying abstracts of interest

In [None]:
# Find the vertex whose text is exactly 'yoga' (IndexError if there is none).
yoga_node_id = g.vs.select(text_eq='yoga')[0].index
# Every edge joins a term to an abstract, so the neighbours of the yoga term
# vertex are the abstracts that mention it.
yoga_abstract_nodes = g.neighbors(yoga_node_id)
yoga_abstracts = [g.vs[node]['text'] for node in yoga_abstract_nodes]
print(yoga_abstracts)

In [None]:
# Collect the distinct neighbours of the yoga abstracts: the terms that
# co-occur with 'yoga' in at least one abstract (the yoga vertex itself is
# among them).
related_term_nodes = {
    neighbor
    for abstract_node in yoga_abstract_nodes
    for neighbor in g.neighbors(abstract_node)
}
related_terms = g.vs(related_term_nodes)['text']
print(related_terms)

In [None]:
# How many distinct vertices neighbour the yoga abstracts.
print(len(related_terms))

## Identifying fields with Community Detection

In [None]:
# Undirected copy of the graph for community detection.
# NOTE(review): called with default arguments -- check whether the 'frequency'
# edge attribute survives the conversion if weights are ever needed.
g_u = g.as_undirected()

In [None]:
# Multilevel community detection on the undirected graph; no edge weights are
# passed. Prints the number of communities found.
# NOTE(review): presumably the partition can vary between runs -- confirm
# before relying on specific community indices.
community_membership = g_u.community_multilevel()
print(len(community_membership))

In [None]:
# Report how many vertices each community contains.
for i, community in enumerate(community_membership):
    print(f'Community: {i}, size: {len(community)}')

In [None]:
# Pick the community with the fewest vertices. min() returns the first
# smallest one, the same element the stable sort put at index 0, without
# copying and sorting every community.
smallest_community = min(community_membership, key=len)
print(smallest_community)

In [None]:
# Keep only the term vertices of the smallest community. The vertex indices
# from the undirected copy are used to index the original graph.
community_nodes = g.vs[smallest_community].select(type_eq='term')

In [None]:
# Text of the term vertices in the smallest community.
community_terms = community_nodes['text']
print(community_terms)

## Code from this section

In [1]:
%%writefile build_know_graph.py
""" 
Name:       build_know_graph.py
Author(s):  Gary Hutson & Matt Jackson on behalf of Packt publishing
Date:       03/02/2022
Usage:      build_know_graph.py
"""
import igraph
import csv
import scispacy
import spacy
import itertools
from collections import Counter
import matplotlib.pyplot as plt

# Load in data: every CSV row is kept in memory; row[0] is read below as the
# abstract ID and row[1] as the abstract text.
# newline='' hands newline handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
with open('./data/20k_abstracts_clean.csv', 'r', newline='') as c:
    data = list(csv.reader(c))

# Load in spacy NLP library
# The "en_core_sci_sm" pipeline package must already be installed.
nlp = spacy.load("en_core_sci_sm")
# Sanity check: entities found in the text column of the first row.
text = data[0][1]
doc = nlp(text)
print(list(doc.ents))

# Get abstract entities as [abstract_id, [lower-cased entity strings]] pairs.
# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per row and yields the Docs in input order, so zip() re-pairs each Doc with
# its row. Converting the spans to strings in the same pass avoids keeping
# every parsed Doc alive in an intermediate list.
abstract_entities = [
    [row[0], [str(ent).lower() for ent in doc.ents]]
    for row, doc in zip(data, nlp.pipe(row[1] for row in data))
]
print(abstract_entities[:5])

# Get all entities: the per-abstract entity lists without their IDs
all_entities = [ents for _, ents in abstract_entities]

# Flatten them lazily into a single stream of entity strings
entities = itertools.chain.from_iterable(all_entities)

# Count occurrences and order the result from most to least frequent
entity_freq = dict(Counter(entities).most_common())
print(entity_freq)

# Entities seen more than 100 times, against the number of distinct entities
high_freq = {
    ent: count for ent, count in entity_freq.items() if count > 100
}
print(len(high_freq))
print(len(entity_freq))

# Entities seen exactly once
low_freq = {
    ent: count for ent, count in entity_freq.items() if count == 1
}
print(len(low_freq))

# Remove some terms: those seen only once or more than 100 times
removed_terms = [ent for ent, value in entity_freq.items() if value > 100 or value == 1]
# removed_terms is a list, so `in` on it scans linearly for every entity;
# a set built once makes each membership test constant time.
removed_lookup = set(removed_terms)
abstract_entities = [
    [abstract_id, [ent for ent in ents if ent not in removed_lookup]]
    for abstract_id, ents in abstract_entities
]
print(abstract_entities[0])

# Constructing the knowledge graph
terms = [abstract[1] for abstract in abstract_entities]  # get just ents no abstract IDs
# Distinct terms in sorted order. Iteration order of a set of strings can
# differ between interpreter runs, and these positions become the term vertex
# IDs, so sorting keeps the IDs reproducible from run to run.
unique_terms = sorted(set(itertools.chain.from_iterable(terms)))

# Term IDs start at len(data) so they come after the abstract IDs (assumes
# abstract IDs are 0..len(data)-1 -- TODO confirm against the CSV).
term_ids = {term: i for i, term in enumerate(unique_terms, len(data))}
print(term_ids['ige'])

# Create the edgelists: one [term_id, abstract_id, frequency] triple per
# distinct term in each abstract, where frequency is the number of times the
# term occurs in that abstract. The per-abstract list gets its own name so the
# module-level `terms` list is not overwritten by the iteration.
edgelist = [
    [int(term_ids[term]), int(abstract_id), freq]
    for abstract_id, terms_in_abstract in abstract_entities
    for term, freq in Counter(terms_in_abstract).items()
]

print(edgelist[:10])
# Sanity check on known data: 'ige' is expected four times in abstract 0.
assert [term_ids['ige'], 0, 4] in edgelist

# Instantiate igraph as a directed graph
g = igraph.Graph(directed=True)
# Add vertices: one per abstract plus one per kept term
g.add_vertices(len(term_ids) + len(data))
# Vertex text: abstract texts first, then the terms in the order their IDs
# were assigned
text = [row[1] for row in data] + list(term_ids)
g.vs['text'] = text
print(g.vs[term_ids['ige']]['text'])
# Vertex type follows the same abstract-then-term layout
types = ['abstract'] * len(data) + ['term'] * len(term_ids)
g.vs['type'] = types
print(g.vs[term_ids['ige']]['type'])
# Split the triples into endpoint pairs and the matching frequency list
edges = [triple[:2] for triple in edgelist]
frequencies = [triple[2] for triple in edgelist]
g.add_edges(edges)
g.es['frequency'] = frequencies
# Analyze and apply community detection
# Number of vertices, then number of edges.
print(len(g.vs))
print(len(g.es))
# Find weakly connected components (edge direction is ignored)
# NOTE(review): newer python-igraph releases appear to prefer
# Graph.connected_components() over clusters() -- confirm against the pinned
# version before changing.
connected_components = g.clusters(mode='weak')
print(connected_components)
# Get the abstract and term nodes by their 'type' attribute
abstract_nodes = g.vs.select(type_eq='abstract')
term_nodes = g.vs.select(type_eq='term')
# Get the degrees of each group as two lists
abstract_degree = g.degree(abstract_nodes)
term_degree = g.degree(term_nodes)

# Do some plotting
# Each histogram gets its own figure. If plt.show() does not block and close
# the window (e.g. on a non-interactive backend), the second plt.hist call
# would otherwise draw onto the same axes and the saved term chart would also
# contain the abstract histogram.
# The charts/ directory must already exist; savefig does not create it.

# Abstract degree first
plt.figure()
plt.hist(abstract_degree, bins=20, edgecolor='black')
plt.xlabel('Abstract Node Degree')
plt.ylabel('Frequency')
plt.savefig('charts/abstract_degree.jpg')
plt.show()

# Term degree second
plt.figure()
plt.hist(term_degree, bins=20, edgecolor='black')
plt.xlabel('Term Node Degree')
plt.ylabel('Frequency')
plt.savefig('charts/term_node_degree.jpg')

# Identify abstracts of interest
# Find the vertex whose text is exactly 'yoga' (IndexError if there is none).
yoga_node_id = g.vs.select(text_eq='yoga')[0].index
# Every edge joins a term to an abstract, so the neighbours of the yoga term
# vertex are the abstracts that mention it.
yoga_abstract_nodes = g.neighbors(yoga_node_id)
yoga_abstracts = [g.vs[node]['text'] for node in yoga_abstract_nodes]
print(yoga_abstracts)

# Distinct neighbours of those abstracts: the terms that co-occur with 'yoga'
# (the yoga vertex itself is among them).
related_term_nodes = {
    neighbor
    for abstract_node in yoga_abstract_nodes
    for neighbor in g.neighbors(abstract_node)
}
related_terms = g.vs(related_term_nodes)['text']
print(related_terms)
print(len(related_terms))

# Identifying fields with Community Detection
# Detection runs on an undirected copy of the graph; no edge weights are
# passed to community_multilevel.
g_u = g.as_undirected()
community_membership = g_u.community_multilevel()
print(len(community_membership))

# Loop through communities and return their sizings
for i, community in enumerate(community_membership):
    print(f'Community: {i}, size: {len(community)}')

# Get the smallest community. min() returns the first smallest one, the same
# element the stable sort put at index 0, without copying and sorting every
# community.
smallest_community = min(community_membership, key=len)
print(smallest_community)

# Get the community nodes: only the term vertices of that community. The
# vertex indices from the undirected copy are used to index the original graph.
community_nodes = g.vs[smallest_community].select(type_eq='term')
community_terms = community_nodes['text']
print(community_terms)

Writing build_know_graph.py
