In [None]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from newspaper import Article
import spacy
import neuralcoref

# Load the spaCy 'en_core_web_lg' model; this `nlp` object is the pipeline
# that get_relations() calls on article text.
nlp = spacy.load('en_core_web_lg')
# Add neuralcoref to the pipeline; get_relations() reads the
# `_.coref_resolved` extension on the documents this pipeline produces.
neuralcoref.add_to_pipe(nlp)


In [None]:
# Read the sample articles and give the columns the names used by later cells.
article = pd.read_csv('sample_data.csv')
article.columns = ['idx', 'tags', 'text', 'genre', 'cluster']
# Keep the first 100 rows, then drop any of those rows with a missing value.
article = article.iloc[:100].dropna()
article.head(5)

In [None]:
# Pull three columns out of the frame as plain Python lists: the `tags`
# column is used as titles, `text` as synopses, and `genre` as genres.
titles = article['tags'].tolist()
synopses = article['text'].tolist()
genres = article['genre'].tolist()

# Report how many entries each list holds.
print(len(titles), 'titles')
print(len(synopses), 'synopses')
print(len(genres), 'genres')

In [None]:
# Show the first title and keep the matching synopsis in `text`
# for ad-hoc inspection.
print ("titles: ", titles[0])
text = synopses[0]

In [None]:
def get_relations(id, text):
    """Extract (title, relation, object) rows from one article's text.

    The text is cleaned with two regex substitutions, run through the
    module-level ``nlp`` pipeline, replaced by its coreference-resolved form,
    and split into sentences. Each sentence is parsed again on its own, and
    every named entity labelled ``GPE``, ``PERSON`` or ``TIME`` produces one
    row the first time its text is seen for that label.

    Parameters
    ----------
    id
        Value stored in the ``title`` column of every row. The name shadows
        the ``id`` builtin but is kept so existing callers keep working.
    text : str
        Raw article text.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``relation`` and ``object``. ``relation`` is
        ``"occurs at"`` for GPE, ``"is about"`` for PERSON and ``"at time"``
        for TIME entities. The frame is empty when no such entity is found.
    """
    print ("Working for title: ", id)
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    doc = nlp(text)
    doc = nlp(doc._.coref_resolved)  # resolve coreference clusters
    # Span.text instead of the deprecated Span.string; identical once stripped.
    sentences = [sent.text.strip() for sent in doc.sents]

    # Entity label -> (relation text, entity texts already emitted for it).
    tracked = {
        "GPE": ("occurs at", set()),
        "PERSON": ("is about", set()),
        "TIME": ("at time", set()),
    }
    ent_pairs = []
    for sentence in sentences:
        if not sentence:
            # Nothing to extract from an empty sentence.
            continue
        sent = nlp(sentence)
        # Merge every entity / noun chunk into a single token; filter_spans
        # resolves overlaps between the two span lists first.
        spans = spacy.util.filter_spans(list(sent.ents) + list(sent.noun_chunks))
        with sent.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        for ent in sent.ents:
            if ent.label_ not in tracked:
                continue
            relation, seen = tracked[ent.label_]
            if ent.text in seen:
                # Emit each entity text only once per label.
                continue
            seen.add(ent.text)
            ent_pairs.append({ "title": id, "relation": relation, "object": ent.text })

    pairs = pd.DataFrame(ent_pairs, columns=['title',
                     'relation', 'object'])

    print('Entity pairs extracted:', str(len(ent_pairs)))
    return pairs
#         print ("it happened at: ", where)
#         print ("around: ", when)
#         print ("with: ", who)
#         print ("relation", relation)



In [None]:
import networkx as nx
import matplotlib.pyplot as plt


def draw_kg(pairs):
    """Draw the whole knowledge graph described by ``pairs``.

    ``pairs`` is a DataFrame with ``title``, ``relation`` and ``object``
    columns. Each row becomes a directed edge from ``title`` to ``object``,
    labelled with ``relation``. Nodes are sized in proportion to their
    degree and positioned with a spring layout. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    graph = nx.from_pandas_edgelist(
        pairs, 'title', 'object', create_using=nx.MultiDiGraph())
    degrees = nx.degree(graph)
    positions = nx.spring_layout(graph, k=0.15, iterations=20)

    # One size per node, in the order the degree view yields them.
    sizes = [int(degree) * 500 for _, degree in degrees]

    plt.figure(num=None, figsize=(120, 90), dpi=80)
    nx.draw_networkx(
        graph,
        pos=positions,
        node_size=sizes,
        node_color='white',
        edgecolors='black',
        edge_color='red',
        linewidths=1.5,
        arrowsize=20,
    )

    # Map each (title, object) pair to its relation; when a pair occurs
    # more than once the last relation wins.
    edge_labels = {}
    for source, target, relation in zip(pairs.title, pairs.object,
                                        pairs['relation'].tolist()):
        edge_labels[(source, target)] = relation
    nx.draw_networkx_edge_labels(graph, pos=positions,
                                 edge_labels=edge_labels, font_color='red')
    #plt.axis('off')
    plt.show()

In [None]:
def filter_graph(pairs, node):
    """Draw the part of the knowledge graph reachable from ``node``.

    ``pairs`` is a DataFrame with ``title``, ``relation`` and ``object``
    columns; each row is a directed edge from ``title`` to ``object``
    labelled with ``relation``. The drawn subgraph contains ``node`` itself
    plus every node found by a depth-first search starting at ``node``.
    Positions come from a random layout of the full graph. The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    k_graph = nx.from_pandas_edgelist(pairs, 'title', 'object',
            create_using=nx.MultiDiGraph())
    successors = nx.dfs_successors(k_graph, node)
    # Always keep the start node: dfs_successors only lists nodes that have
    # successors, so a start node without outgoing edges would otherwise be
    # missing and the figure would be empty.
    nodes = [node]
    for parent, children in successors.items():
        nodes.append(parent)
        nodes.extend(children)
    subgraph = k_graph.subgraph(nodes)
    # Laid out over the full graph, so every subgraph node has a position.
    layout = nx.random_layout(k_graph)
    nx.draw_networkx(
        subgraph,
        node_size=1000,
        arrowsize=20,
        linewidths=1.5,
        pos=layout,
        edge_color='red',
        edgecolors='black',
        node_color='white'
        )
    # Map each (title, object) pair to its relation; when a pair occurs
    # more than once the last relation wins.
    labels = dict(zip(zip(pairs.title, pairs.object),
                      pairs['relation'].tolist()))
    # Keep only the labels for edges that are part of the subgraph.
    sublabels = {edge: labels[edge]
                 for edge in subgraph.out_edges(data=False)}
    nx.draw_networkx_edge_labels(subgraph, pos=layout, edge_labels=sublabels,
                                font_color='red')
    plt.axis('off')
    plt.show()

In [None]:
# Extract relations for the first MAX_ARTICLES articles and combine them into
# one frame. Raise MAX_ARTICLES (e.g. to len(titles)) to process more.
MAX_ARTICLES = 10

pair_frames = []
# Clamp to the number of available articles so a short dataset does not
# raise IndexError.
for index in range(min(MAX_ARTICLES, len(titles))):
    pairs = get_relations(titles[index], synopses[index])
    pair_frames.append(pairs)

# Concatenate once instead of growing the frame with DataFrame.append on
# every iteration (quadratic, and removed in pandas 2.0).
if pair_frames:
    all_pairs = pd.concat(pair_frames, ignore_index=True)
else:
    all_pairs = pd.DataFrame([], columns=['title',
                         'relation', 'object'])
print (all_pairs)

In [None]:
# Draw the full graph built from every extracted pair.
draw_kg(all_pairs)
# Alternative view, limited to what is reachable from a single node:
# filter_graph(pairs, 'Congress')