# Setup

In [1]:
from py2neo import Graph
from py2neo.bulk import create_nodes, create_relationships
import pandas as pd
import os
import json
from collections import Counter
from itertools import islice
import spacy
import pytextrank

# Connection settings come from the environment so no credentials are stored in the notebook.
# NOTE(review): os.getenv returns None for unset variables — confirm how Graph treats a
# None URI/credentials before running destructive cells such as reset(graph).
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_LOCAL_PASSWORD")
uri = os.getenv("NEO4J_LOCAL_URI")

# Shared py2neo handle used by the import cells below.
graph = Graph(uri, auth=(user, password))

## Utility Functions

In [2]:
def reset(graph):
    """Delete every node in the database together with its relationships.

    Args:
        graph: object exposing ``run(cypher)`` (the py2neo ``Graph`` in this notebook).
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    graph.run(wipe_everything)

In [3]:
def create_index(graph, name, label, attrs, fulltext=False, debug=False):
    """Create a Neo4j index if it does not exist yet.

    Args:
        graph: object exposing ``run(cypher)``.
        name: index name, interpolated verbatim into the statement.
        label: node label the index applies to, interpolated verbatim.
        attrs: iterable of property names; each is rendered as ``n.<attr>``.
        fulltext: when true build a FULLTEXT index, otherwise a regular index.
        debug: when true print the generated Cypher before running it.

    Note: the statement is assembled by string formatting, so only pass
    trusted identifiers.
    """
    props = ",".join([f'n.{attr}' for attr in attrs])

    if not fulltext:
        statement = f"""
        CREATE INDEX {name} IF NOT EXISTS
        FOR (n:{label}) ON
        ({props})
        """
    else:
        statement = f"""
        CREATE FULLTEXT INDEX {name} IF NOT EXISTS
        FOR (n:{label}) ON EACH
        [{props}]
        """

    if debug:
        print(statement)

    graph.run(statement)

In [4]:
def periodic_input(graph, data, labels, batch_size=1000):
    """Create nodes from ``data`` in chunks of at most ``batch_size`` records.

    Args:
        graph: object exposing ``auto()``; a fresh ``graph.auto()`` is passed
            to ``create_nodes`` for every chunk.
        data: iterable of node property records.
        labels: labels forwarded to ``create_nodes`` for every chunk.
        batch_size: maximum number of records sent per call.
    """
    records = iter(data)
    # iter(callable, sentinel) keeps pulling chunks until an empty list comes back.
    for chunk in iter(lambda: list(islice(records, batch_size)), []):
        create_nodes(graph.auto(), chunk, labels=labels)

# Data Import

In [5]:
# WARNING: deletes every node and relationship in the connected database before the import starts.
reset(graph)

## Occupations

In [6]:
# Load the ESCO occupation concepts and keep only the fields stored on each node.
esco_occupations = pd.read_csv("../data/ESCO/occupations_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "isco_code": row.iscoGroup,
        "description": row.description,
    }
    for row in esco_occupations.itertuples()
]

In [7]:
# Create the occupation nodes, then index them by URI (exact lookup) and by label text (full-text).
# NOTE(review): the records come from the ESCO occupations file but the second label is "ONET" —
# confirm this label is intended.
periodic_input(graph, data, {"Occupation", "ONET"})
create_index(graph,"Occupation","Occupation",["uri"])
create_index(graph,"OccupationFT","Occupation",["preferred_label", "alt_label"], fulltext=True)

## ISCO GROUP

In [8]:
# Load the ISCO group concepts and keep only the fields stored on each node.
isco_group = pd.read_csv("../data/ESCO/ISCOGroups_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in isco_group.itertuples()
]

In [9]:
# ISCO groups are stored with both the ISCO and the Occupation label; the BROADER_THEN import
# further down matches both endpoints on the Occupation label.
periodic_input(graph, data, {"ISCO","Occupation"})
create_index(graph,"ISCO","ISCO",["uri"])

## Occupation-Occupation Relation

In [10]:
# Each row becomes a (start key, relationship properties, end key) triple: broader URI -> concept URI.
occupation_relation = pd.read_csv("../data/ESCO/broaderRelationsOccPillar.csv")
data = [
    (row.broaderUri, {}, row.conceptUri)
    for row in occupation_relation.itertuples()
]

In [12]:
# Connect the broader occupation concept to the narrower one, matching both ends by URI.
# NOTE(review): "BROADER_THEN" looks like a typo for "BROADER_THAN"; kept as-is because
# queries elsewhere may depend on the relationship type.
create_relationships(
    graph.auto(),
    data,
    "BROADER_THEN",
    start_node_key=("Occupation", "uri"),
    end_node_key=("Occupation", "uri"),
)

## Skills

In [143]:
# Load the ESCO skill concepts and keep only the fields stored on each node.
esco_skill = pd.read_csv("../data/ESCO/skills_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in esco_skill.itertuples()
]

In [144]:
# Create the skill nodes, then index them by URI (exact lookup) and by label text (full-text).
periodic_input(graph, data, {"Skill"})
create_index(graph,"Skill","Skill",["uri"])
create_index(graph,"SkillFT","Skill",["preferred_label", "alt_label"], fulltext=True)

## Skill Group

In [145]:
# Load the ESCO skill group concepts and keep only the fields stored on each node.
skill_group = pd.read_csv("../data/ESCO/skillGroups_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in skill_group.itertuples()
]

In [146]:
# Skill groups are stored with both the Skill_Group and the Skill label.
periodic_input(graph, data, {"Skill_Group", "Skill"})
# Index names must be unique: "Skill" is already used by the Skill(uri) index created for the
# skill nodes, and with IF NOT EXISTS a name clash silently skips creation, so the
# Skill_Group(uri) index was never built under the old name.
create_index(graph,"Skill_Group","Skill_Group",["uri"])
create_index(graph,"SkillGroupFT","Skill_Group",["preferred_label", "alt_label"], fulltext=True)

## Skill-Skill Relation

In [147]:
# Each row becomes a (start key, relationship properties, end key) triple: broader URI -> concept URI.
skill_relation = pd.read_csv("../data/ESCO/broaderRelationsSkillPillar.csv")
data = [
    (row.broaderUri, {}, row.conceptUri)
    for row in skill_relation.itertuples()
]

In [148]:
# Connect the broader skill concept to the narrower one, matching both ends by URI on the Skill label.
# NOTE(review): "BROADER_THEN" looks like a typo for "BROADER_THAN"; kept as-is because
# queries elsewhere may depend on the relationship type.
create_relationships(
    graph.auto(),
    data,
    "BROADER_THEN",
    start_node_key=("Skill", "uri"),
    end_node_key=("Skill", "uri"),
)

## Occupation-Skill Relation

In [149]:
# Occupation -> skill triples; the CSV's skillType value is stored as the relationship's "type" property.
occ_skill_relation = pd.read_csv("../data/ESCO/occupationSkillRelations.csv")
data = [
    (row.occupationUri, {"type": row.skillType}, row.skillUri)
    for row in occ_skill_relation.itertuples()
]

In [150]:
# Connect each occupation to the skills listed for it, matching both ends by URI.
create_relationships(
    graph.auto(),
    data,
    "REQUIRE",
    start_node_key=("Occupation", "uri"),
    end_node_key=("Skill", "uri"),
)

## Courses

In [6]:
def remove_html_tags(text):
    """Replace every HTML tag and every newline in ``text`` with a single space.

    Tags are replaced rather than deleted so that words separated only by
    markup do not get glued together. A tag may span several lines; it is
    still replaced as one unit.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so consecutive tags
        leave consecutive spaces.
    """
    import re
    # `[^>]*` also matches newlines, whereas the previous `.*?` stopped at a line break and
    # left tags that wrap across lines in the text.
    tag_or_newline = re.compile(r'<[^>]*>|\n')
    return tag_or_newline.sub(' ', text)

def normalize(xs):
    """Lower-case each string and collapse every whitespace run to one space.

    Leading and trailing whitespace is dropped as part of the collapse.

    Args:
        xs: iterable of strings.

    Returns:
        A new list with one normalized string per input element.
    """
    cleaned = []
    for raw in xs:
        words = raw.lower().split()
        cleaned.append(" ".join(words))
    return cleaned

In [7]:
# Load the raw course catalogue. The encoding is given explicitly because the file contains
# non-ASCII text and the platform default encoding is not UTF-8 everywhere.
with open("../data/COURSE/go1contents_2604_reg.json", "r", encoding="utf-8") as f:
    courses = json.load(f)

# Clean the free-text fields in place: tags are lower-cased and whitespace-collapsed,
# descriptions have HTML tags and newlines replaced by spaces.
for c in courses:
    c["tags"] = normalize(c["tags"])
    c["description"] = remove_html_tags(c["description"])

In [8]:
# Node records for courses: the course id is stored under "uri" like the ESCO concepts.
data = [
    {
        "preferred_label": course["title"],
        "uri": course["id"],
        "description": course["description"],
    }
    for course in courses
]

In [9]:
# Create the course nodes, then index them by id ("uri") and by title (full-text).
periodic_input(graph, data, {"Course"})
create_index(graph,"Course","Course",["uri"])
create_index(graph,"CourseFT","Course",["preferred_label"], fulltext=True)

## Topics

In [10]:
# One row per course holding the "value" of each of its topics; courses without
# attributes/topics yield an empty row.
# NOTE(review): the fixed three column names assume no course has more than three topics — TODO confirm.
topics = [
    [topic["value"] for topic in course.get("attributes", {}).get("topics", [])]
    for course in courses
]
topic_df = pd.DataFrame(topics, columns=["topic_l1","topic_l2","topic_l3"])

In [11]:
def import_topics_neo4j(df,col):
    """Create one node per distinct non-null value of ``df[col]`` and index it.

    The column name doubles as the node label and as the index name. Nodes
    carry a single ``preferred_label`` property. Uses the notebook-level
    ``graph`` handle.

    Args:
        df: frame holding the topic columns.
        col: name of the column to import.
    """
    distinct_values = df[col].drop_duplicates().dropna()
    records = []
    for value in distinct_values:
        records.append({"preferred_label": value})

    periodic_input(graph, records, {col})
    create_index(graph, col, col, ["preferred_label"])

In [12]:
# Write each topic level to Neo4j; every level becomes its own node label
# (topic_l1, topic_l2, topic_l3) with an index on preferred_label.
import_topics_neo4j(topic_df,'topic_l1')
import_topics_neo4j(topic_df,'topic_l2')
import_topics_neo4j(topic_df,'topic_l3')

### Topic l1 - Topic l2 Relation

In [13]:
# Distinct (level 1, level 2) topic pairs; rows where either level is missing are dropped.
topic12_df_drp_dup = (
    topic_df[["topic_l1", "topic_l2"]]
    .drop_duplicates()
    .dropna()
)

data = [
    (pair.topic_l1, {}, pair.topic_l2)
    for pair in topic12_df_drp_dup.itertuples()
]

In [14]:
# Connect each level-1 topic to its level-2 topics, matching both ends by preferred_label.
create_relationships(
    graph.auto(),
    data,
    "HAS_SUBTOPIC_OF",
    start_node_key=("topic_l1", "preferred_label"),
    end_node_key=("topic_l2", "preferred_label"),
)

### Topic l2 - Topic l3 Relation

In [15]:
# Distinct (level 2, level 3) topic pairs; rows where either level is missing are dropped.
topic23_df_drp_dup = (
    topic_df[["topic_l2", "topic_l3"]]
    .drop_duplicates()
    .dropna()
)

data = [
    (pair.topic_l2, {}, pair.topic_l3)
    for pair in topic23_df_drp_dup.itertuples()
]

In [16]:
# Connect each level-2 topic to its level-3 topics, matching both ends by preferred_label.
create_relationships(
    graph.auto(),
    data,
    "HAS_SUBTOPIC_OF",
    start_node_key=("topic_l2", "preferred_label"),
    end_node_key=("topic_l3", "preferred_label"),
)

### Additional: Full-Text Index over Topics

In [17]:
# Topic nodes are labelled topic_l1 / topic_l2 / topic_l3 and keep their text in
# `preferred_label`. No node has a `Topic` label or a topic_l* property, so an index defined
# on those matches nothing. A full-text index may cover several labels joined with `|`.
# If a "TopicFT" index from an earlier run already exists, drop it first
# (DROP INDEX TopicFT); IF NOT EXISTS would otherwise keep the old definition.
create_index(graph,"TopicFT","topic_l1|topic_l2|topic_l3",["preferred_label"], fulltext=True)

## Course-Topic Relation

In [52]:
# Pair every course id with its level-1 topic (positional: one topic_df row per course).
course_topic = pd.DataFrame({'course_uri':[x["id"] for x in courses], 'topic_preferred_label':topic_df['topic_l1']})

# Courses without any topic have a null topic_l1. Such rows cannot match a topic_l1 node,
# so they are left out of the relationship payload instead of being sent with a null end key.
data =  [(x.course_uri, {}, x.topic_preferred_label)
    for x in course_topic.dropna(subset=['topic_preferred_label']).itertuples()]

In [56]:
# Connect each course (matched by uri) to its level-1 topic (matched by preferred_label).
create_relationships(
    graph.auto(),
    data,
    "HAS_TOPIC_OF",
    start_node_key=("Course", "uri"),
    end_node_key=("topic_l1", "preferred_label"),
)

## Tags

In [211]:
# Inspect the ESCO skills frame loaded in the Skills section (the display is truncated by pandas).
esco_skill

Unnamed: 0,conceptType,conceptUri,skillType,reuseLevel,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,scopeNote,definition,inScheme,description
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0005c151-5b5a...,skill/competence,sector-specific,manage musical staff,manage staff of music\ncoordinate duties of mu...,,released,2016-12-20T17:43:43Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Assign and manage staff tasks in areas such as...
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00064735-8fad...,skill/competence,occupation-specific,supervise correctional procedures,oversee prison procedures\nmanage correctional...,,released,2016-12-20T20:17:49Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Supervise the operations of a correctional fac...
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000709ed-2be5...,skill/competence,sector-specific,apply anti-oppressive practices,apply non-oppressive practices\napply an anti-...,,released,2016-12-20T19:18:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Identify oppression in societies, economies, c..."
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0007bdc2-dd15...,skill/competence,sector-specific,control compliance of railway vehicles regulat...,monitoring of compliance with railway vehicles...,,released,2016-12-20T20:02:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Inspect rolling stock, components and systems ..."
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00090cc1-1f27...,skill/competence,cross-sector,identify available services,establish available services\nidentify availab...,,released,2016-12-20T20:15:17Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Identify the different services available for ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13886,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,skill/competence,sector-specific,remediate healthcare user's occupational perfo...,restore healthcare user's occupational perform...,,released,2016-12-20T19:25:53Z,,,http://data.europa.eu/esco/concept-scheme/memb...,"Remediate or restore the cognitive, sensorimot..."
13887,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0b074-5a76...,skill/competence,sector-specific,install transport equipment lighting,install transport equipment illumination\nfix ...,,released,2016-12-20T20:03:21Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Install lighting elements in transport equipme...
13888,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,knowledge,sector-specific,natural language processing,natural language processing\nNLP,,released,2016-08-04T15:19:37Z,,,http://data.europa.eu/esco/concept-scheme/skil...,The technologies which enable ICT devices to u...
13889,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff5bc45-b506...,skill/competence,cross-sector,coordinate construction activities,reviewing construction progress\nconstruction ...,,released,2016-12-20T18:22:35Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Coordinate the activities of several construct...


In [None]:
# NOTE(review): unexecuted scratch cell. PhraseMatcher is only imported in the following cell,
# and this call passes no vocab although the following cell constructs it with one —
# presumably this raises on a fresh kernel; consider deleting the cell.
PhraseMatcher()

In [210]:
# Experiment: case-insensitive phrase matching with spaCy's PhraseMatcher on a toy term list.
# NOTE(review): spacy is already imported in the first cell; these imports belong there too.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
# attr="LOWER" is why "angela Merkel" in the text below matches the term "Angela Merkel".
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("German Chancellor angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
# Each match is a (match_id, start, end) triple; start/end slice the doc into the matched span.
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

angela Merkel
Barack Obama
Washington, D.C.


In [207]:
# Raw matcher output: (match_id, start, end) triples from the PhraseMatcher experiment.
matches

[(3766102292120407359, 2, 4),
 (3766102292120407359, 7, 9),
 (3766102292120407359, 19, 22)]

In [205]:
# Record the spaCy version the experiments in this notebook were run with.
spacy.__version__

'3.3.0'

In [169]:
# Flatten the per-course tag lists (duplicates kept so they can be counted) and derive the distinct set.
tags = [
    tag
    for course in courses
    for tag in course["tags"]
]
unique_tags = set(tags)

In [204]:
# Inspect the distinct tags. NOTE(review): this prints the whole set; a small sample would keep the output readable.
unique_tags

{'windows forms',
 'what is sharepoint training?',
 'label',
 'hays-resminingxresenergy',
 'debt recovery',
 'anything of value',
 'dr lindsay peer',
 'régime d’importation',
 'enjoyment',
 'max brown',
 'is microsoft project easy to use?',
 'microsoft project training for construction',
 'unhappiness',
 'travail à domicile',
 'sharepoint training outline',
 'process',
 'high-risk',
 'credit',
 'sales &amp; customer skills',
 'disziplinarische macht',
 'work site inspection',
 '#effectiveness',
 'vapors',
 'fsb',
 'unwilling',
 'pneumatic',
 'supplied-air respirators',
 'powerpoint narration video',
 'how do you automatically advance slides in powerpoint?',
 'workpalce bullying',
 'extension cords',
 'anticipating needs',
 'employee conduct',
 'producivity',
 'microsoft planner for construction',
 'california sexual harassment training requirements',
 'e-harassment',
 'https://i.imgur.com/09nb2k4.jpg.jpg',
 'how do i do an advanced search in outlook 2010?',
 'microsoft identity',
 'tim

In [174]:
# One Tag node per distinct tag, indexed by its text.
data = [{"preferred_label": tag} for tag in unique_tags]
periodic_input(graph, data, {"Tag"})
create_index(graph, "Tag", "Tag", ["preferred_label"])

In [182]:
# One (course id, relationship properties, tag) triple per tag occurrence on a course.
course_tags = [
    (course["id"], {}, tag)
    for course in courses
    for tag in course["tags"]
]

In [184]:
# Connect each course (matched by uri) to its Tag nodes (matched by preferred_label).
create_relationships(
    graph.auto(),
    course_tags,
    "TAG",
    start_node_key=("Course", "uri"),
    end_node_key=("Tag", "preferred_label"),
)

In [191]:
# Rebuild the spaCy pipeline and append the "textrank" component; it is available because
# pytextrank is imported in the first cell. This replaces the `nlp` object from the
# PhraseMatcher experiment.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7f6d967a0e50>

In [198]:
# Sample course description used to try out key-phrase extraction.
txt="""
Course OverviewArtificial Intelligence has become an important and integral part of many industries, revolutionizing sectors such as Banking, Medicine, Transportation, etc. Recently, small businesses have been leveraging AI to scale up and be more efficient. This course is your stepping stone to mastering the power of AI for your own business.This course will teach you to approach AI from a leader’s perspective using practical, data-driven methods to identify and quantify business opportunities. You will learn to use several varieties of machine learning techniques, improving the capability of your business to deliver better and faster solutions to its customers and clients.By the end of the course, you will have the skills to improve and innovate the services of any business using the power of AI.Target AudienceIf you are a manager, analyst, developer, or consultant interested in leveraging the power of AI for business, then this course is for you. No prior knowledge of AI is required.This video is part of the course  Hands-On Artificial Intelligence for Small Businesses.The aim of this video is to provide mathematical background that answers the question, “How do we know if our clustering is good?”Understand Bayes’ RuleApply Bayes’ Rule to reasoning about clustersSimplify Bayes’ Rule for the clustering case
"""
# Run the pipeline and print the phrases exposed by the textrank component via doc._.phrases.
doc = nlp(txt)
for phrase in doc._.phrases:
    print(phrase.text)

business opportunities
business
small businesses
Bayes
AI
clients
Transportation
machine learning techniques
Small Businesses
good?”Understand Bayes’ RuleApply Bayes’ Rule
many industries
your own business
Medicine
Bayes’ Rule
Course OverviewArtificial Intelligence
Banking
several varieties
AI.Target
any business
your business
better and faster solutions
mathematical background
part
the clustering case
Hands-On Artificial Intelligence
consultant
Banking, Medicine, Transportation
practical, data-driven methods
developer
, revolutionizing sectors
analyst
the power
an important and integral part
its customers
This course
the course
this course
your stepping stone
the capability
a leader’s perspective
our clustering
the services
the question
No prior knowledge
the skills
This video
a manager
this video
The aim
the end
You
that
we
you


In [197]:
# Spot-check one cleaned course description.
courses[101]["description"]

'Course OverviewThis video covers some of the key points around avoiding having to escalate a call or conversation to a manager or a supervisor. Knowing a phrase or two may help reduce the need to escalate a call and help build confidence for the customer and yourself!'

In [189]:
# Look up the full record of a single course by its id.
[x for x in courses if x["id"]==12498735]

[{'id': 12498735,
  'type': 'video',
  'title': 'Using Bayesian Inference',
  'published': True,
  'description': 'Course OverviewArtificial Intelligence has become an important and integral part of many industries, revolutionizing sectors such as Banking, Medicine, Transportation, etc. Recently, small businesses have been leveraging AI to scale up and be more efficient. This course is your stepping stone to mastering the power of AI for your own business.This course will teach you to approach AI from a leader’s perspective using practical, data-driven methods to identify and quantify business opportunities. You will learn to use several varieties of machine learning techniques, improving the capability of your business to deliver better and faster solutions to its customers and clients.By the end of the course, you will have the skills to improve and innovate the services of any business using the power of AI.Target AudienceIf you are a manager, analyst, developer, or consultant inter

In [173]:
# The 50 most frequent tags across all courses, with their occurrence counts.
Counter(tags).most_common(50)

[('it skills', 15798),
 ('business skills', 7475),
 ('business', 3711),
 ('programming', 3424),
 ('global premium', 3071),
 ('adobe photoshop', 2880),
 ('technology skills', 2814),
 ('fundamentals', 2525),
 ('portrait', 2459),
 ('communication skills', 2189),
 ('lighting', 2143),
 ('business basics', 2011),
 ('marketing &amp; sales', 1971),
 ('communication', 1876),
 ('compliance', 1845),
 ('leadership', 1679),
 ('beginner', 1649),
 ('network security', 1593),
 ('wedding', 1483),
 ('management', 1450),
 ('finance', 1450),
 ('it software', 1443),
 ('personal development', 1321),
 ('microsoft products', 1303),
 ('camera guides', 1299),
 ('commercial', 1263),
 ('workplace safety', 1183),
 ('cyber security', 1167),
 ('family &amp; lifestyle', 1130),
 ('safety training', 1068),
 ('developer', 1066),
 ('safety and regulatory compliance training', 1042),
 ('self-improvement', 968),
 ('human resources', 956),
 ('web design', 928),
 ('project management', 899),
 ('health &amp; safety', 890),
 (