# Setup

In [9]:
from py2neo import Graph
from py2neo.bulk import create_nodes, create_relationships
import pandas as pd
import numpy as np
import os
import json
from collections import Counter
from itertools import islice
import spacy
import pytextrank
import time
from pandas import json_normalize
from collections import Counter
#from top2vec import Top2Vec
import re
from itertools import cycle
from more_itertools import collapse, pairwise

In [None]:
# Neo4j credentials come from the environment; os.getenv yields None for a
# variable that is not set.
user = os.getenv('NEO4J_USER')
password = os.getenv('NEO4J_PASSWORD')
# Bolt endpoint of the Neo4j server (hard-coded to localhost).
uri = 'bolt://localhost:7687'

# Shared py2neo handle used by the import cells below.
graph = Graph(uri, auth=(user, password))

## Utility Functions

In [None]:
def reset(graph):
    """Delete every node, together with its relationships, from the database.

    Args:
        graph: Object exposing ``run(cypher)``; in this notebook the py2neo
            ``Graph`` handle.
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    graph.run(wipe_everything)

In [None]:
def create_index(graph, name, label, attrs, fulltext=False, debug=False):
    """Build a ``CREATE ... INDEX ... IF NOT EXISTS`` statement and run it.

    Args:
        graph: Object exposing ``run(cypher)``.
        name: Index name, interpolated verbatim into the statement.
        label: Node label the index is declared for.
        attrs: Iterable of property names; each is emitted as ``n.<attr>``.
        fulltext: When true, a ``FULLTEXT`` index ``ON EACH [...]`` is
            declared; otherwise a regular index ``ON (...)``.
        debug: When true, the statement is printed before it is executed.

    Note:
        All arguments are inserted into the Cypher text without escaping, so
        they must come from trusted code.
    """
    props = ",".join(f'n.{attr}' for attr in attrs)

    if not fulltext:
        cypher = f"""
        CREATE INDEX {name} IF NOT EXISTS
        FOR (n:{label}) ON
        ({props})
        """
    else:
        cypher = f"""
        CREATE FULLTEXT INDEX {name} IF NOT EXISTS
        FOR (n:{label}) ON EACH
        [{props}]
        """

    if debug:
        print(cypher)

    graph.run(cypher)

In [None]:
def periodic_input(graph, data, labels, batch_size=1000):
    """Insert ``data`` as nodes in consecutive batches.

    Args:
        graph: py2neo ``Graph``; ``graph.auto()`` is called once per batch.
        data: Iterable of node-property records handed to ``create_nodes``.
        labels: Labels applied to every created node.
        batch_size: Maximum number of records sent per ``create_nodes`` call.
    """
    stream = iter(data)
    batch = list(islice(stream, batch_size))
    # An empty slice means the stream is exhausted.
    while batch:
        create_nodes(graph.auto(), batch, labels=labels)
        batch = list(islice(stream, batch_size))

# Data Import

In [None]:
# Empty the database (every node and relationship) before importing.
reset(graph)

## Occupations

In [None]:
# Load the ESCO occupations table and turn each row into a node-property dict.
esco_occupations = pd.read_csv("../data/ESCO/occupations_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "isco_code": row.iscoGroup,
        "description": row.description,
    }
    for row in esco_occupations.itertuples()
]

In [None]:
# Insert the occupation records in batches as nodes labelled Occupation and ONET.
periodic_input(graph, data, {"Occupation", "ONET"})
# Regular index on uri, plus a full-text index over both label properties.
create_index(graph,"Occupation","Occupation",["uri"])
create_index(graph,"OccupationFT","Occupation",["preferred_label", "alt_label"], fulltext=True)

## ISCO GROUP

In [None]:
# Load the ISCO groups table and turn each row into a node-property dict.
isco_group = pd.read_csv("../data/ESCO/ISCOGroups_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in isco_group.itertuples()
]

In [None]:
# ISCO group nodes carry both the ISCO and the Occupation label.
periodic_input(graph, data, {"ISCO","Occupation"})
# Regular index on the uri property of ISCO nodes.
create_index(graph,"ISCO","ISCO",["uri"])

## Occupation-Occupation Relation

In [None]:
# Each row becomes a (broader uri, {}, concept uri) triple with no
# relationship properties.
occupation_relation = pd.read_csv("../data/ESCO/broaderRelationsOccPillar.csv")
data = [
    (row.broaderUri, {}, row.conceptUri)
    for row in occupation_relation.itertuples()
]

In [None]:
# Both ends of BROADER_THEN are Occupation nodes looked up by uri; the first
# element of each triple is the start key, the last the end key.
create_relationships(
    graph.auto(),
    data,
    "BROADER_THEN",
    start_node_key=("Occupation", "uri"),
    end_node_key=("Occupation", "uri"),
)

## Skills

In [None]:
# Load the ESCO skills table and turn each row into a node-property dict.
esco_skill = pd.read_csv("../data/ESCO/skills_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in esco_skill.itertuples()
]

In [None]:
# Insert the skill records in batches as Skill nodes, then index them by uri.
periodic_input(graph, data, {"Skill"})
create_index(graph,"Skill","Skill",["uri"])

In [None]:
# Full-text index over the two label properties of Skill nodes.
create_index(graph,"SkillFT","Skill",["preferred_label", "alt_label"], fulltext=True)

## Skill Group

In [None]:
# Load the ESCO skill groups table and turn each row into a node-property dict.
skill_group = pd.read_csv("../data/ESCO/skillGroups_en.csv")
data = [
    {
        "preferred_label": row.preferredLabel,
        "alt_label": row.altLabels,
        "uri": row.conceptUri,
        "description": row.description,
    }
    for row in skill_group.itertuples()
]

In [None]:
# Skill groups carry both the Skill_Group and the Skill label.
periodic_input(graph, data, {"Skill_Group", "Skill"})
# The uri index needs a name of its own: "Skill" is already used by the index
# on the Skill label, and CREATE INDEX ... IF NOT EXISTS does nothing when an
# index with that name already exists.
create_index(graph,"Skill_Group","Skill_Group",["uri"])
# Full-text index over the two label properties of Skill_Group nodes.
create_index(graph,"SkillGroupFT","Skill_Group",["preferred_label", "alt_label"], fulltext=True)

## Skill-Skill Relation

In [None]:
# Each row becomes a (broader uri, {}, concept uri) triple with no
# relationship properties.
skill_relation = pd.read_csv("../data/ESCO/broaderRelationsSkillPillar.csv")

data = [
    (row.broaderUri, {}, row.conceptUri)
    for row in skill_relation.itertuples()
]

In [None]:
# Both ends of BROADER_THEN are Skill nodes looked up by uri.
create_relationships(
    graph.auto(),
    data,
    "BROADER_THEN",
    start_node_key=("Skill", "uri"),
    end_node_key=("Skill", "uri"),
)

## Occupation-Skill Relation

In [None]:
# Each row becomes an (occupation uri, {"type": skill type}, skill uri) triple.
occ_skill_relation = pd.read_csv("../data/ESCO/occupationSkillRelations.csv")

data = [
    (row.occupationUri, {"type": row.skillType}, row.skillUri)
    for row in occ_skill_relation.itertuples()
]

In [None]:
# REQUIRE runs from an Occupation (matched on uri) to a Skill (matched on uri).
create_relationships(
    graph.auto(),
    data,
    "REQUIRE",
    start_node_key=("Occupation", "uri"),
    end_node_key=("Skill", "uri"),
)

## GO1 Courses

In [4]:
def remove_html_tags(text):
    """Replace each HTML tag and each newline in ``text`` with one space.

    A tag is the shortest ``<...>`` run that does not cross a newline.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with every match replaced by ``' '``.
    """
    tag_or_newline = re.compile('<.*?>|\n')
    return tag_or_newline.sub(' ', text)

def normalize(xs):
    """Lower-case each string and collapse its whitespace runs to single spaces.

    Args:
        xs: Iterable of strings.

    Returns:
        list[str]: One cleaned string per input, without leading or trailing
        whitespace.
    """
    cleaned = []
    for raw in xs:
        words = raw.lower().split()
        cleaned.append(" ".join(words))
    return cleaned


def remove_special_cha(text):
    """Replace each listed special character in ``text`` with one space.

    The characters are ``| ^ & + - % * / = > ( ) : " # $`` and the curly
    double quotes.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with every listed character replaced by ``' '``.
    """
    # Raw string: ``\-`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    clean = re.compile(r'[|^&+\-%*/=>():"#$“”]')
    return clean.sub(' ', text)


def list_to_str(text, col):
    """Join every list stored under ``text[col]`` into one ``", "``-separated string.

    Args:
        text: Container indexable by ``col`` (a DataFrame in this notebook).
        col: Key of the column whose elements are iterables of strings.

    Returns:
        list[str]: One joined string per element, in iteration order.
    """
    return [", ".join(items) for items in text[col]]

def language_preprocess(text):
    """Replace the language names "English" and "German" with "en" and "de".

    Every occurrence inside ``text`` is replaced; other content is untouched.

    Args:
        text: Input string.

    Returns:
        str: The rewritten string.
    """
    for language_name, code in (("English", "en"), ("German", "de")):
        text = re.sub(language_name, code, text)
    return text

In [6]:
# Read the GO1 course export. The encoding is pinned to UTF-8 so non-ASCII
# characters (such as the curly quotes that remove_special_cha strips) decode
# the same way regardless of the platform's default encoding.
with open("../data/courses.json", "r", encoding="utf-8") as f:
    courses = json.load(f)

# Clean every course dict in place.
for c in courses:
    c["tags"] = normalize(c["tags"])
    # Topics are read from attributes.topics[*].value; a course missing either
    # key gets an empty list.
    c["topics"] = normalize([y["value"] for y in c.get("attributes", {}).get("topics", [])])
    c["title"] = remove_special_cha(c["title"])
    c["description"] = remove_special_cha(remove_html_tags(c["description"]))
    c['language'] = language_preprocess(c['language'])

In [None]:
# One node-property dict per course; the numeric id is stored as a string uri.
data = [
    {
        "preferred_label": course["title"],
        "uri": str(course["id"]),
        "language": course['language'],
        "description": course["description"],
    }
    for course in courses
]

In [None]:
# Insert the course records as Course nodes, index them by uri and add a
# full-text index over the title (stored as preferred_label).
periodic_input(graph, data, {"Course"})
create_index(graph,"Course","Course",["uri"])
create_index(graph,"CourseFT","Course",["preferred_label"], fulltext=True)

## ESCO Topic

In [None]:
# NOTE(review): the only visible `Top2Vec` import (Setup cell) is commented
# out — it has to be in scope before this cell can run.
model = Top2Vec.load("Models/universal-sentence-encoder_dlearn")

# One node-property dict per entry of get_topics()[2], labelled "topic <n>".
# presumably get_topics()[2] holds the topic numbers — TODO confirm
data =  [{
    "preferred_label":x} 
    for x in ['topic '+str(x) for x in model.get_topics()[2]]
]

In [None]:
# Insert the topic records as ESCO_Topic nodes and index them by label.
periodic_input(graph, data, {"ESCO_Topic"})
create_index(graph,"ESCO_Topic","ESCO_Topic",["preferred_label"])

## Skill - ESCO Topic Relation

In [None]:
# Build (skill uri, {}, topic label) triples. The BELONG_TO relationship
# created from this list looks up its start node on Skill.uri and its end node
# on ESCO_Topic.preferred_label, so the skill uri has to come first and the
# topic label last.
# assumes the document ids returned by the model are row positions in
# esco_skill — TODO confirm against how the model was trained
data = [
    (esco_skill['conceptUri'].iloc[doc_id], {}, 'topic ' + str(topic_num))
    for topic_num, size in enumerate(model.get_topic_sizes()[0])
    for doc_id in model.search_documents_by_topic(topic_num=topic_num, num_docs=size)[2]
]

In [None]:
# BELONG_TO runs from a Skill (matched on uri) to an ESCO_Topic (matched on
# preferred_label).
create_relationships(
    graph.auto(),
    data,
    "BELONG_TO",
    start_node_key=("Skill", "uri"),
    end_node_key=("ESCO_Topic", "preferred_label"),
)

## GO1 Courses - ESCO Topic Relation

In [11]:
# Turn the list of course dicts into a DataFrame.
df = json_normalize(courses) 

# Collapse the per-course topic and tag lists into comma-separated strings.
df['topics'] = list_to_str(df, 'topics')
df['tags'] = list_to_str(df, 'tags')

# One text field per course: title, topics, tags and description joined by spaces.
df['course_data'] = df[['title', 'topics', 'tags', 'description']].agg(' '.join, axis=1)

In [None]:
# this is even better, hahaha, just 6mins
%%time
data = [t for index, text in enumerate(df['course_data'])
        for t in zip(cycle([str(df['id'].iloc[index])]), cycle([{}]) , ['topic ' + str(x) for x in model.query_topics(text, num_topics=3)[3]])]

In [None]:
# BELONG_TO runs from a Course (matched on uri) to an ESCO_Topic (matched on
# preferred_label).
create_relationships(
    graph.auto(),
    data,
    "BELONG_TO",
    start_node_key=("Course", "uri"),
    end_node_key=("ESCO_Topic", "preferred_label"),
)

## GO1 Topics

In [None]:
# Distinct raw topic values over all courses; collapse flattens the
# per-course lists into one stream.
topics = set(
    collapse(
        [topic["value"] for topic in course.get("attributes", {}).get("topics", [])]
        for course in courses
    )
)
data = [{"preferred_label": label} for label in topics]

In [None]:
# Insert the topic records as GO1_Topic nodes and index them by label.
periodic_input(graph, data, {"GO1_Topic"})
create_index(graph,"GO1_Topic","GO1_Topic",["preferred_label"])

## GO1 Topics - GO1 Topics Relation

In [None]:
# Distinct topic sequences, one tuple per course.
topics_relation = list({
    tuple(topic["value"] for topic in course.get("attributes", {}).get("topics", []))
    for course in courses
})

# Every adjacent pair inside a sequence becomes one (first, {}, second)
# triple; duplicates across courses are dropped.
data = [
    (first, {}, second)
    for first, second in {pair for path in topics_relation for pair in pairwise(path)}
]

In [None]:
# Both ends of BROADER_THEN are GO1_Topic nodes looked up by preferred_label.
create_relationships(
    graph.auto(),
    data,
    "BROADER_THEN",
    start_node_key=("GO1_Topic", "preferred_label"),
    end_node_key=("GO1_Topic", "preferred_label"),
)

## GO1 Courses - GO1 Topics Relation

In [12]:
# Raw topic values per course, in course order.
topics = [
    [topic["value"] for topic in course.get("attributes", {}).get("topics", [])]
    for course in courses
]
# One (course id as string, {}, topic) triple for every topic of every course.
data = [
    (str(course_id), {}, topic)
    for index, course_id in enumerate(df['id'])
    for topic in topics[index]
]

In [None]:
# HAS runs from a Course (matched on uri) to a GO1_Topic (matched on
# preferred_label).
create_relationships(
    graph.auto(),
    data,
    "HAS",
    start_node_key=("Course", "uri"),
    end_node_key=("GO1_Topic", "preferred_label"),
)

## GO1 Tags

In [None]:
# Distinct tags over all courses, one node-property dict per tag.
tags = {tag for course in courses for tag in course['tags']}
data = [{"preferred_label": tag} for tag in tags]

In [None]:
# Insert the tag records as Tag nodes and index them by label.
periodic_input(graph, data, {"Tag"})
create_index(graph,"Tag","Tag",["preferred_label"])

## GO1 Courses - GO1 Tags Relation

In [None]:
# Course nodes store their uri as str(id), so the start key is stringified
# the same way; a raw non-string id would never equal the stored uri.
data = [(str(x["id"]), {}, tag) for x in courses for tag in x["tags"]]

In [None]:
# TAG runs from a Course (matched on uri) to a Tag (matched on preferred_label).
create_relationships(
    graph.auto(),
    data,
    "TAG",
    start_node_key=("Course", "uri"),
    end_node_key=("Tag", "preferred_label"),
)

## Additional

In [None]:
# Display the ESCO skills DataFrame.
esco_skill

In [None]:
# NOTE(review): PhraseMatcher is only imported in the next cell, where it is
# constructed with a vocab argument — this bare call looks like leftover scratch.
PhraseMatcher()

In [None]:
# Scratch example: find a fixed list of phrases in a text with spaCy.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
# attr="LOWER": presumably matches on lower-cased token text (the sample
# sentence contains a lower-case "angela") — TODO confirm
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("German Chancellor angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
# Each match is a (match_id, start, end) tuple; start/end slice the doc.
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

In [None]:
# Display the raw match tuples returned by the matcher.
matches

In [None]:
# Display the installed spaCy version.
spacy.__version__

In [None]:
# All tag occurrences across courses (duplicates kept), and their distinct set.
tags = [y for x in courses for y in x["tags"]]
unique_tags = set(tags)

In [None]:
# Display the distinct tags.
unique_tags

In [None]:
# NOTE(review): Tag nodes and the "Tag" index are also created in the
# "GO1 Tags" section; running both cells inserts the tag nodes a second
# time — confirm this is leftover scratch.
data = [{"preferred_label":x} for x in unique_tags]
periodic_input(graph, data, {"Tag"})
create_index(graph,"Tag","Tag",["preferred_label"])

In [None]:
# Course nodes store their uri as str(id), so the start key is stringified
# the same way; a raw non-string id would never equal the stored uri.
course_tags = [(str(x["id"]), {}, tag) for x in courses for tag in x["tags"]]

In [None]:
# TAG runs from a Course (matched on uri) to a Tag (matched on preferred_label).
create_relationships(
    graph.auto(),
    course_tags,
    "TAG",
    start_node_key=("Course", "uri"),
    end_node_key=("Tag", "preferred_label"),
)

In [None]:
# Reload the small English model and append the "textrank" component.
# presumably that component is registered by the pytextrank import — TODO confirm
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

In [None]:
# Sample course description used to try out phrase extraction.
txt="""
Course OverviewArtificial Intelligence has become an important and integral part of many industries, revolutionizing sectors such as Banking, Medicine, Transportation, etc. Recently, small businesses have been leveraging AI to scale up and be more efficient. This course is your stepping stone to mastering the power of AI for your own business.This course will teach you to approach AI from a leader’s perspective using practical, data-driven methods to identify and quantify business opportunities. You will learn to use several varieties of machine learning techniques, improving the capability of your business to deliver better and faster solutions to its customers and clients.By the end of the course, you will have the skills to improve and innovate the services of any business using the power of AI.Target AudienceIf you are a manager, analyst, developer, or consultant interested in leveraging the power of AI for business, then this course is for you. No prior knowledge of AI is required.This video is part of the course  Hands-On Artificial Intelligence for Small Businesses.The aim of this video is to provide mathematical background that answers the question, “How do we know if our clustering is good?”Understand Bayes’ RuleApply Bayes’ Rule to reasoning about clustersSimplify Bayes’ Rule for the clustering case
"""
doc = nlp(txt)
# Print the text of each phrase exposed on doc._.phrases.
for phrase in doc._.phrases:
    print(phrase.text)

In [None]:
# Display the cleaned description of the course at list position 101.
courses[101]["description"]

In [None]:
# Look up the course(s) whose id equals 12498735.
[x for x in courses if x["id"]==12498735]

In [None]:
# The 50 most frequent entries of `tags`, with their counts.
Counter(tags).most_common(50)