In [1]:
!pip install google-cloud-aiplatform --upgrade --user



In [12]:
!pip install google-cloud-aiplatform
!pip install google-auth



In [1]:
# Import Vertex AI library
from google.cloud import aiplatform

# Import service account 
from google.oauth2 import service_account

In [2]:
credentials = service_account.Credentials.from_service_account_file("resumeanz-db0c67277823.json")

In [3]:
project_id = 'resumeanz'
location = 'us-central1'

In [5]:
import vertexai
# Initialise the Vertex AI SDK from the values set in the configuration cell
# (project_id / location) instead of repeating the project name as a literal,
# so the project and region are defined in exactly one place.
vertexai.init(project=project_id, location=location, credentials=credentials)

In [6]:
aiplatform.TabularDataset.list()

[]

In [2]:
!pip install --upgrade google-cloud-aiplatform



In [7]:
import vertexai
from vertexai.language_models import TextGenerationModel

In [8]:
person_prompt_tpl="""From the Resume text for a job aspirant below, extract Entities strictly as instructed below
1. First, look for the Person Entity type in the text and extract the needed information defined below:
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create new entity types that aren't mentioned below. Document must be summarized and stored inside Person entity under `description` property
    Entity Types:
    label:'Person',id:string,role:string,description:string //Person Node
2. Description property should be a crisp text summary and MUST NOT be more than 100 characters
3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictious data
4. Do NOT create duplicate entities
5. Restrict yourself to extract only Person information. No Position, Company, Education or Skill information should be focussed.
6. NEVER Impute missing values
Example Output JSON:
{"entities": [{"label":"Person","id":"person1","role":"Prompt Developer","description":"Prompt Developer with more than 30 years of LLM experience"}]}

Question: Now, extract the Person for the text below -
$ctext

Answer:
"""

In [9]:
# Prompt template for extracting Position and Company entities plus their
# AT_COMPANY relationships. `$ctext` is filled in with string.Template.
# The example output line must itself be valid JSON (every key quoted), because
# the model's answer is parsed with json.loads further down the notebook.
postion_prompt_tpl="""From the Resume text for a job aspirant below, extract Entities & relationships strictly as instructed below
1. First, look for Position & Company types in the text and extract information in comma-separated format. Position Entity denotes the Person's previous or current job. Company node is the Company where they held that position.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Position',id:string,title:string,location:string,startDate:string,endDate:string,url:string //Position Node
    label:'Company',id:string,name:string //Company Node
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. NEVER create new Relationship types that aren't mentioned below:
    Relationship definition:
    position|AT_COMPANY|company //Ensure this is a string in the generated output
3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictious data
4. Do NOT create duplicate entities.
5. No Education or Skill information should be extracted.
6. DO NOT MISS out any Position or Company related information
7. NEVER Impute missing values
 Example Output JSON:
{"entities": [{"label":"Position","id":"position1","title":"Software Engineer","location":"Singapore","startDate":"2021-01-01","endDate":"present"},{"label":"Position","id":"position2","title":"Senior Software Engineer","location":"Mars","startDate":"2020-01-01","endDate":"2020-12-31"},{"label":"Company","id":"company1","name":"Neo4j Singapore Pte Ltd"},{"label":"Company","id":"company2","name":"Neo4j Mars Inc"}],"relationships": ["position1|AT_COMPANY|company1","position2|AT_COMPANY|company2"]}

Question: Now, extract entities & relationships as mentioned above for the text below -
$ctext

Answer:
"""

In [10]:
skill_prompt_tpl="""From the Resume text below, extract Entities strictly as instructed below
1. Look for prominent Skill Entities in the text. The`id` property of each entity must be alphanumeric and must be unique among the entities. NEVER create new entity types that aren't mentioned below:
    Entity Definition:
    label:'Skill',id:string,name:string,level:string //Skill Node
2. NEVER Impute missing values
3. If you do not find any level information: assume it as `expert` if the experience in that skill is more than 5 years, `intermediate` for 2-5 years and `beginner` otherwise.
Example Output Format:
{"entities": [{"label":"Skill","id":"skill1","name":"Neo4j","level":"expert"},{"label":"Skill","id":"skill2","name":"Pytorch","level":"expert"}]}

Question: Now, extract entities as mentioned above for the text below -
$ctext

Answer:
"""

In [11]:
edu_prompt_tpl="""From the Resume text for a job aspirant below, extract Entities strictly as instructed below
1. Look for Education entity type and generate the information defined below:
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create other entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Definition:
    label:'Education',id:string,degree:string,university:string,graduationDate:string,score:string,url:string //Education Node
2. If you cannot find any information on the entities above, it is okay to return empty value. DO NOT create fictious data
3. Do NOT create duplicate entities or properties
4. Strictly extract only Education. No Skill or other Entities should be extracted
5. DO NOT MISS out any Education related entity
6. NEVER Impute missing values
Output JSON (Strict):
{"entities": [{"label":"Education","id":"education1","degree":"Bachelor of Science","graduationDate":"May 2022","score":"0.0"}]}

Question: Now, extract Education information as mentioned above for the text below -
$ctext

Answer:
"""

In [12]:
from vertexai.preview.language_models import TextGenerationModel

def run_text_model(
    project_id: str,
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    prompt: str,
    location: str = location,
    tuned_model_name: str = "",
    ) -> str:
    """Generate a text completion with a Vertex AI text generation model.

    Args:
        project_id: Project passed to ``vertexai.init``.
        model_name: Name handed to ``TextGenerationModel.from_pretrained``.
        temperature: Forwarded to ``model.predict`` as ``temperature``.
        max_decode_steps: Forwarded to ``model.predict`` as ``max_output_tokens``.
        top_p: Forwarded to ``model.predict`` as ``top_p``.
        top_k: Forwarded to ``model.predict`` as ``top_k``.
        prompt: The full prompt text to send.
        location: Region passed to ``vertexai.init``. The default is the value
            of the notebook-level ``location`` variable at the time this
            function is defined.
        tuned_model_name: When non-empty, the tuned model with this resource
            name is fetched via ``get_tuned_model`` and used for the prediction.

    Returns:
        The ``text`` attribute of the prediction response.
    """
    # Honour the caller's project and region; previously both arguments were
    # accepted but ignored in favour of a hard-coded project name.
    vertexai.init(project=project_id, location=location, credentials=credentials)
    model = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
        model = model.get_tuned_model(tuned_model_name)
    response = model.predict(
        prompt,
        temperature=temperature,
        max_output_tokens=max_decode_steps,
        top_k=top_k,
        top_p=top_p,
    )
    return response.text

In [13]:
def extract_entities_relationships(prompt, tuned_model_name):
    """Send an extraction prompt to text-bison and return the raw completion.

    The model is called through ``run_text_model`` with temperature 0, a
    1024-token output limit, top_p 0.8 and top_k 40. Any exception raised
    along the way is printed and swallowed, in which case ``None`` is returned.
    """
    try:
        return run_text_model(
            project_id,
            "text-bison@001",
            0,
            1024,
            0.8,
            40,
            prompt,
            location,
            tuned_model_name,
        )
    except Exception as err:
        print(err)
        return None

In [14]:
import re
def clean_text(text):
    """Return `text` with every run of non-ASCII characters replaced by one space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', text)

In [15]:
from string import Template
import json

sample_que = """Developer <span class="hl">Developer</span> Developer - TATA CONSULTANTCY SERVICE Batavia, OH Relevant course work† Database Systems, Database Administration, Database Security & Auditing, Computer Security,Computer Networks, Programming & Software Development, IT, Information Security Concept & Admin,† IT System Acquisition & Integration, Advanced Web Development, and Ethical Hacking: Network Security & Pen Testing. Work Experience Developer TATA CONSULTANTCY SERVICE June 2016 to Present MRM (Government of ME, RI, MS) Developer†††† Working with various technologies such as Java, JSP, JSF, DB2(SQL), LDAP, BIRT report, Jazz version control, Squirrel SQL client, Hibernate, CSS, Linux, and Windows. Work as part of a team that provide support to enterprise applications. Perform miscellaneous support activities as requested by Management. Perform in-depth research and identify sources of production issues.†† SPLUNK Developer† Supporting the Splunk Operational environment for Business Solutions Unit aiming to support overall business infrastructure. Developing Splunk Queries to generate the report, monitoring, and analyzing machine generated big data for server that has been using for onsite and offshore team. Working with Splunk' premium apps such as ITSI, creating services, KPI, and glass tables. Developing app with custom dashboard with front- end ability and advanced XML to serve Business Solution unit' needs. Had in-house app presented at Splunk's .Conf Conference (2016). Help planning, prioritizing and executing development activities. 
Developer ( front end) intern TOMORROW PICTURES INC - Atlanta, GA April 2015 to January 2016 Assist web development team with multiple front end web technologies and involved in web technologies such as Node.js, express, json, gulp.js, jade, sass, html5, css3, bootstrap, WordPress.†Testing (manually), version control (GitHub), mock up design and ideas Education MASTER OF SCIENCE IN INFORMATION TECHNOLOGY in INFOTMATION TECHNOLOGY KENNESAW STATE UNIVERSITY - Kennesaw, GA August 2012 to May 2015 MASTER OF BUSINESS ADMINISTRATION in INTERNATIONAL BUSINESS AMERICAN INTER CONTINENTAL UNIVERSITY ATLANTA November 2003 to December 2005 BACHELOR OF ARTS in PUBLIC RELATIONS THE UNIVERSITY OF THAI CHAMBER OF COMMERCE - BANGKOK, TH June 1997 to May 2001 Skills Db2 (2 years), front end (2 years), Java (2 years), Linux (2 years), Splunk (2 years), SQL (3 years) Certifications/Licenses Splunk Certified Power User V6.3 August 2016 to Present CERT-112626 Splunk Certified Power User V6.x May 2017 to Present CERT-168138 Splunk Certified User V6.x May 2017 to Present CERT -181476 Driver's License Additional Information Skills† ∑††††SQL, PL/SQL, Knowledge of Data Modeling, Experience on Oracle database/RDBMS.† ∑††††††††Database experience on Oracle, DB2, SQL Sever, MongoDB, and MySQL.† ∑††††††††Knowledge of tools including Splunk, tableau, and wireshark.† ∑††††††††Knowledge of SCRUM/AGILE and WATERFALL methodologies.† ∑††††††††Web technology included: HTML5, CSS3, XML, JSON, JavaScript, node.js, NPM, GIT, express.js, jQuery, Angular, Bootstrap, and Restful API.† ∑††††††††Working Knowledge in JAVA, J2EE, and PHP.† Operating system Experience included: Windows, Mac OS, Linux (Ubuntu, Mint, Kali)††"""
prompts = [person_prompt_tpl, postion_prompt_tpl, skill_prompt_tpl, edu_prompt_tpl]
results = {"entities": [], "relationships": []}

# One marker for both the membership test and the split. The original tested
# for 'Answer:\n' but split on 'Answer:\n ' (trailing space), which raised
# IndexError whenever the marker was not followed by a space.
_answer_marker = 'Answer:\n'

for p in prompts:
    _prompt = Template(p).substitute(ctext=clean_text(sample_que))
    _extraction = extract_entities_relationships(_prompt, '')
    if _extraction is None:
        # extract_entities_relationships prints the error and returns None on
        # failure; stop with a clear message instead of a TypeError below.
        raise RuntimeError("Model call failed; see the error printed above")
    if _answer_marker in _extraction:
        _extraction = _extraction.split(_answer_marker)[1]
    if _extraction.strip() == '':
        continue
    try:
        # Backticks are stripped in case the answer comes back fenced.
        _extraction = json.loads(_extraction.replace('`', ''))
    except json.JSONDecodeError:
        #Temp hack to ignore Skills cut off by token limitation:
        #keep everything up to the last complete object and close the JSON.
        _extraction = _extraction[:_extraction.rfind("}")+1] + ']}'
        _extraction = json.loads(_extraction)
    results["entities"].extend(_extraction["entities"])
    if "relationships" in _extraction:
        results["relationships"].extend(_extraction["relationships"])

In [16]:
# Connect the first extracted entity (taken to be the person) to every
# position, skill and education entity that follows it, by appending
# "head|RELATION|tail" triples to results["relationships"].
person_id = results["entities"][0]["id"]
_label_to_relation = (
    ('Position', 'HAS_POSITION'),
    ('Skill', 'HAS_SKILL'),
    ('Education', 'HAS_EDUCATION'),
)
for entity in results["entities"][1:]:
    for _label, _relation in _label_to_relation:
        if entity['label'] == _label:
            results["relationships"].append(f"{person_id}|{_relation}|{entity['id']}")

In [17]:
results

{'entities': [{'label': 'Person',
   'id': 'person1',
   'role': 'Developer',
   'description': 'Developer with 10+ years of experience in IT industry.'},
  {'label': 'Position',
   'id': 'position1',
   'title': 'Developer',
   'location': 'Batavia, OH',
   'startDate': '2016-06-01',
   'endDate': 'present'},
  {'label': 'Company', 'id': 'company1', 'name': 'TATA CONSULTANTCY SERVICE'},
  {'label': 'Skill', 'id': 'skill1', 'name': 'SQL', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill2', 'name': 'Java', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill3', 'name': 'Linux', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill4', 'name': 'Splunk', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill5', 'name': 'front end', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill6', 'name': 'Db2', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill7', 'name': 'HTML5', 'level': 'expert'},
  {'label': 'Skill', 'id': 'skill8', 'name': 'CSS3', 'level': 'expert'},
  {'label': 'Skill

In [18]:
import re

def get_prop_str(prop_dict, _id):
    """Build the ``ON CREATE SET`` clause for an entity's properties.

    Args:
        prop_dict: Entity dictionary; the ``label`` and ``id`` keys are skipped
            and every other key becomes a property assignment.
        _id: Cypher variable name the properties are set on.

    Returns:
        A string of the form `` ON CREATE SET var.key = "value",...``, or an
        empty string when there is no property to set (a bare
        ``ON CREATE SET`` would be invalid Cypher).
    """
    assignments = []
    for key, val in prop_dict.items():
        if key in ('label', 'id'):
            continue
        # Escape backslashes first, then double quotes, so the value stays
        # inside its double-quoted Cypher string literal. The previous
        # replace() calls swapped '"' for '"' and therefore escaped nothing.
        escaped = str(val).replace('\\', '\\\\').replace('"', '\\"')
        assignments.append(_id + "." + key + ' = "' + escaped + '"')
    if not assignments:
        return ''
    return ' ON CREATE SET ' + ','.join(assignments)

def get_cypher_compliant_var(_id):
    """Turn an arbitrary id into a short, lower-case Cypher variable name.

    Non-word characters and underscores are removed, the rest is lower-cased,
    a leading underscore is added (so the name never starts with a digit) and
    the result is cut to at most 20 characters.
    """
    alnum_only = re.sub(r'[\W_]', '', _id)
    prefixed = f"_{alnum_only.lower()}"
    return prefixed[:20]

def generate_cypher(file_name, in_json):
    """Translate extracted entities and relationships into Cypher statements.

    Args:
        file_name: Identifier of the source document; it is embedded in the
            ``id`` key of Person, Position and Education nodes.
        in_json: Iterable of dicts, each with an ``entities`` list and a
            ``relationships`` list of ``"head|RELATION|tail"`` strings.

    Returns:
        A tuple ``(entity_statements, relationship_statements)`` of Cypher
        strings: one ``MERGE`` per supported entity and one
        ``MATCH``/``MATCH``/``MERGE`` block per supported relationship.

    Entities with an unsupported label are ignored. Relationships that are
    malformed, use an unsupported type, or refer to an entity that was not
    emitted are skipped rather than aborting the whole document.
    """
    supported_labels = ('Person', 'Position', 'Education', 'Skill', 'Company')
    supported_rels = ('HAS_SKILL', 'HAS_EDUCATION', 'AT_COMPANY', 'HAS_POSITION')
    e_map = {}
    e_stmt = []
    r_stmt = []
    e_stmt_tpl = Template("($id:$label{id:'$key'})")
    r_stmt_tpl = Template("""
      MATCH $src
      MATCH $tgt
      MERGE ($src_id)-[:$rel]->($tgt_id)
    """)
    for obj in in_json:
        for j in obj['entities']:
            label = j['label']
            if label not in supported_labels:
                # Skip before touching j['name'], which such entities may lack.
                continue
            if label == 'Person':
                key = 'p' + str(file_name)
            elif label == 'Position':
                suffix = j['id'].replace('position', '_')
                key = f'j{str(file_name)}{suffix}'
            elif label == 'Education':
                suffix = j['id'].replace('education', '_')
                key = f'e{str(file_name)}{suffix}'
            else:
                # Skill / Company: keyed by normalised name rather than by file.
                key = get_cypher_compliant_var(j['name'])
            varname = get_cypher_compliant_var(j['id'])
            stmt = e_stmt_tpl.substitute(id=varname, label=label, key=key)
            e_map[varname] = stmt
            e_stmt.append('MERGE ' + stmt + get_prop_str(j, varname))

        for st in obj['relationships']:
            if not isinstance(st, str):
                continue
            rels = [part.strip() for part in st.split("|")]
            if len(rels) < 3:
                continue  # not a head|RELATION|tail triple
            rel = rels[1]
            if rel not in supported_rels:  # we ignore other relationships
                continue
            src_id = get_cypher_compliant_var(rels[0])
            tgt_id = get_cypher_compliant_var(rels[2])
            if src_id not in e_map or tgt_id not in e_map:
                # Dangling reference: nothing to MATCH, so drop it.
                continue
            stmt = r_stmt_tpl.substitute(
                src_id=src_id, tgt_id=tgt_id, src=e_map[src_id], tgt=e_map[tgt_id], rel=rel)
            r_stmt.append(stmt)

    return e_stmt, r_stmt

In [19]:
ent_cyp, rel_cyp = generate_cypher('my_cv', [results])

print(ent_cyp, rel_cyp)

['MERGE (_person1:Person{id:\'pmy_cv\'}) ON CREATE SET _person1.role = "Developer",_person1.description = "Developer with 10+ years of experience in IT industry."', 'MERGE (_position1:Position{id:\'jmy_cv_1\'}) ON CREATE SET _position1.title = "Developer",_position1.location = "Batavia, OH",_position1.startDate = "2016-06-01",_position1.endDate = "present"', 'MERGE (_company1:Company{id:\'_tataconsultantcyser\'}) ON CREATE SET _company1.name = "TATA CONSULTANTCY SERVICE"', 'MERGE (_skill1:Skill{id:\'_sql\'}) ON CREATE SET _skill1.name = "SQL",_skill1.level = "expert"', 'MERGE (_skill2:Skill{id:\'_java\'}) ON CREATE SET _skill2.name = "Java",_skill2.level = "expert"', 'MERGE (_skill3:Skill{id:\'_linux\'}) ON CREATE SET _skill3.name = "Linux",_skill3.level = "expert"', 'MERGE (_skill4:Skill{id:\'_splunk\'}) ON CREATE SET _skill4.name = "Splunk",_skill4.level = "expert"', 'MERGE (_skill5:Skill{id:\'_frontend\'}) ON CREATE SET _skill5.name = "front end",_skill5.level = "expert"', 'MERGE (_

In [2]:
pip install neo4j

Collecting neo4j
  Downloading neo4j-5.17.0.tar.gz (197 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/197.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m194.6/197.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.17.0-py3-none-any.whl size=273834 sha256=2f8eb55ba7ff84516554df34430295e1061fdec2f3b4b7f1710e395c9310a665
  Stored in directory: /root/.cache/pip/wheels/26/a1/15/63d729065b1a6a8afce3343

In [20]:
from neo4j import GraphDatabase

In [21]:
import getpass
import os

# You will need to change these variables
connectionUrl = "bolt://localhost:7687"
username = "neo4j"
# Never store the password in the notebook: take it from the NEO4J_PASSWORD
# environment variable, or prompt for it without echoing. The password that
# was previously committed in this cell should be rotated.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

In [22]:
driver = GraphDatabase.driver(connectionUrl, auth=(username, password))
driver.verify_connectivity()

In [23]:
import pandas as pd
def run_query(query, params=None):
    """Run a Cypher query on the notebook's Neo4j driver.

    Args:
        query: Cypher text to execute.
        params: Optional dict of query parameters. ``None`` (the default) is
            sent as an empty dict, so no mutable object is shared as a default
            argument between calls.

    Returns:
        A pandas DataFrame with one row per result record and the result keys
        as column names.
    """
    with driver.session() as session:
        result = session.run(query, params or {})
        # Build the frame while the session is still open.
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [24]:
# Create one uniqueness constraint on `id` per node label used by the pipeline.
# IF NOT EXISTS is part of each statement, so this cell can be re-run.
run_query('CREATE CONSTRAINT unique_person_id IF NOT EXISTS FOR (n:Person) REQUIRE (n.id) IS UNIQUE')
run_query('CREATE CONSTRAINT unique_position_id IF NOT EXISTS FOR (n:Position) REQUIRE (n.id) IS UNIQUE')
run_query('CREATE CONSTRAINT unique_skill_id IF NOT EXISTS FOR (n:Skill) REQUIRE n.id IS UNIQUE')
run_query('CREATE CONSTRAINT unique_education_id IF NOT EXISTS FOR (n:Education) REQUIRE n.id IS UNIQUE')
run_query('CREATE CONSTRAINT unique_company_id IF NOT EXISTS FOR (n:Company) REQUIRE n.id IS UNIQUE')

In [25]:
%%time
for e in ent_cyp:
    run_query(e)

CPU times: total: 46.9 ms
Wall time: 2.18 s


In [26]:
%%time
for r in rel_cyp:
    run_query(r)

CPU times: total: 15.6 ms
Wall time: 3.91 s


In [27]:
import glob
from string import Template
import json
from pathlib import Path

def run_pipeline(start=0, count=1):
    """Run the extraction and ingestion pipeline over a window of resume files.

    Args:
        start: Index of the first file to process within the sorted list of
            ``data/*.txt`` files.
        count: Maximum number of files to process from ``start`` onwards.

    Returns:
        The list of file paths that ``process_pipeline`` reported as failed.
    """
    # Sort so the window is reproducible (glob order is not guaranteed), and
    # slice by start + count: the previous [start:count] used `count` as an
    # end index, which returned too few files whenever start > 0.
    txt_files = sorted(glob.glob("data/*.txt"))[start:start + count]
    print(f"Running pipeline for {len(txt_files)} files")
    failed_files = process_pipeline(txt_files)
    print(failed_files)
    return failed_files

def process_pipeline(files):
    """Extract, convert and ingest each file, collecting the ones that fail.

    For every path the file is read, entities and relationships are extracted
    with ``run_extraction``, Cypher is produced with ``generate_cypher`` (keyed
    by the file's stem) and each statement is executed through ``run_query``.
    Any exception raised for a file is printed and the path is recorded;
    processing then moves on to the next file.

    Returns:
        The list of paths whose processing raised an exception.
    """
    failures = []
    for file_no, path in enumerate(files, start=1):
        try:
            with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
                print(f"  {path}: Reading File No. ({file_no})")
                text = handle.read().rstrip()
                print(f"    {path}: Extracting Entities & Relationships")
                extracted = run_extraction(path, text)
                print(f"    {path}: Generating Cypher")
                entity_stmts, relationship_stmts = generate_cypher(Path(path).stem, extracted)
                print(f"    {path}: Ingesting Entities")
                for statement in entity_stmts:
                    run_query(statement)
                print(f"    {path}: Ingesting Relationships")
                for statement in relationship_stmts:
                    run_query(statement)
                print(f"    {path}: Processing DONE")
        except Exception as err:
            print(f"    {path}: Processing Failed with exception {err}")
            failures.append(path)
    return failures
        
from timeit import default_timer as timer
def run_extraction(f, text):
    """Extract all entities and relationships for one resume.

    Each of the four prompt templates (person, position, skill, education) is
    filled with ``text`` and sent through ``extract_entities_relationships``;
    the JSON answers are merged, and the Person entity is then linked to every
    Position, Skill and Education entity.

    Args:
        f: File name, used only in the timing message.
        text: Resume text substituted for ``$ctext`` in each prompt.

    Returns:
        A one-element list holding a dict with ``entities`` and
        ``relationships`` lists.

    Raises:
        RuntimeError: if a model call failed (the helper returned ``None``).
        ValueError: if no Person entity was extracted, since there would be
            nothing to attach positions, skills and education to.
        json.JSONDecodeError: if an answer cannot be parsed even after the
            truncation repair.
    """
    start = timer()
    prompts = [person_prompt_tpl, postion_prompt_tpl, skill_prompt_tpl, edu_prompt_tpl]
    results = {"entities": [], "relationships": []}
    # One marker for both the test and the split; the original tested for
    # 'Answer:\n' but split on 'Answer:\n ' and could raise IndexError.
    answer_marker = 'Answer:\n'
    for p in prompts:
        _prompt = Template(p).substitute(ctext=text)
        _extraction = extract_entities_relationships(_prompt, '')
        if _extraction is None:
            # The helper prints the underlying error and returns None; fail
            # this document explicitly so the caller can record it for retry.
            raise RuntimeError("Model call failed during entity extraction")
        if answer_marker in _extraction:
            _extraction = _extraction.split(answer_marker)[1]
        if _extraction.strip() == '':
            continue
        try:
            _extraction = json.loads(_extraction)
        except json.JSONDecodeError:
            #Temp hack to ignore Skills cut off by token limitation:
            #keep everything up to the last complete object and close the JSON.
            _extraction = _extraction[:_extraction.rfind("}")+1] + ']}'
            _extraction = json.loads(_extraction)
        results["entities"].extend(_extraction["entities"])
        if "relationships" in _extraction:
            results["relationships"].extend(_extraction["relationships"])

    # Find the Person by label instead of assuming it is entities[0]: when the
    # person prompt yields nothing, the first entity is a Position or Skill.
    person = next((e for e in results["entities"] if e.get('label') == 'Person'), None)
    if person is None:
        raise ValueError("No Person entity was extracted")
    person_id = person["id"]
    for e in results["entities"]:
        if e['label'] == 'Position':
            results["relationships"].append(f"{person_id}|HAS_POSITION|{e['id']}")
        if e['label'] == 'Skill':
            results["relationships"].append(f"{person_id}|HAS_SKILL|{e['id']}")
        if e['label'] == 'Education':
            results["relationships"].append(f"{person_id}|HAS_EDUCATION|{e['id']}")
    end = timer()
    elapsed = (end-start)
    print(f"    {f}: Entity Extraction took {elapsed}secs")
    return [results]

In [30]:
failed_files = run_pipeline(0, 100)


Running pipeline for 100 files
  data\05499.txt: Reading File No. (1)
    data\05499.txt: Extracting Entities & Relationships
    data\05499.txt: Entity Extraction took 6.253810700029135secs
    data\05499.txt: Generating Cypher
    data\05499.txt: Ingesting Entities
    data\05499.txt: Ingesting Relationships
    data\05499.txt: Processing DONE
  data\05500.txt: Reading File No. (2)
    data\05500.txt: Extracting Entities & Relationships
    data\05500.txt: Entity Extraction took 5.341489000013098secs
    data\05500.txt: Generating Cypher
    data\05500.txt: Ingesting Entities
    data\05500.txt: Ingesting Relationships
    data\05500.txt: Processing DONE
  data\05501.txt: Reading File No. (3)
    data\05501.txt: Extracting Entities & Relationships
    data\05501.txt: Entity Extraction took 7.596793300006539secs
    data\05501.txt: Generating Cypher
    data\05501.txt: Ingesting Entities
    data\05501.txt: Ingesting Relationships
    data\05501.txt: Processing DONE
  data\05502.txt: 

    data\05525.txt: Entity Extraction took 9.269567699870095secs
    data\05525.txt: Generating Cypher
    data\05525.txt: Ingesting Entities
    data\05525.txt: Ingesting Relationships
    data\05525.txt: Processing DONE
  data\05526.txt: Reading File No. (28)
    data\05526.txt: Extracting Entities & Relationships
    data\05526.txt: Entity Extraction took 4.963630199898034secs
    data\05526.txt: Generating Cypher
    data\05526.txt: Ingesting Entities
    data\05526.txt: Ingesting Relationships
    data\05526.txt: Processing DONE
  data\05527.txt: Reading File No. (29)
    data\05527.txt: Extracting Entities & Relationships
    data\05527.txt: Entity Extraction took 3.2011431001592427secs
    data\05527.txt: Generating Cypher
    data\05527.txt: Ingesting Entities
    data\05527.txt: Ingesting Relationships
    data\05527.txt: Processing DONE
  data\05528.txt: Reading File No. (30)
    data\05528.txt: Extracting Entities & Relationships
    data\05528.txt: Entity Extraction took 5.

    data\05551.txt: Entity Extraction took 9.044277400011197secs
    data\05551.txt: Generating Cypher
    data\05551.txt: Ingesting Entities
    data\05551.txt: Ingesting Relationships
    data\05551.txt: Processing DONE
  data\05552.txt: Reading File No. (54)
    data\05552.txt: Extracting Entities & Relationships
    data\05552.txt: Entity Extraction took 5.577560199890286secs
    data\05552.txt: Generating Cypher
    data\05552.txt: Ingesting Entities
    data\05552.txt: Ingesting Relationships
    data\05552.txt: Processing DONE
  data\05553.txt: Reading File No. (55)
    data\05553.txt: Extracting Entities & Relationships
    data\05553.txt: Entity Extraction took 4.331262900028378secs
    data\05553.txt: Generating Cypher
    data\05553.txt: Ingesting Entities
    data\05553.txt: Ingesting Relationships
    data\05553.txt: Processing DONE
  data\05554.txt: Reading File No. (56)
    data\05554.txt: Extracting Entities & Relationships
    data\05554.txt: Entity Extraction took 4.7

    data\05577.txt: Entity Extraction took 7.329631999833509secs
    data\05577.txt: Generating Cypher
    data\05577.txt: Ingesting Entities
    data\05577.txt: Ingesting Relationships
    data\05577.txt: Processing DONE
  data\05578.txt: Reading File No. (80)
    data\05578.txt: Extracting Entities & Relationships
    data\05578.txt: Entity Extraction took 6.288914200151339secs
    data\05578.txt: Generating Cypher
    data\05578.txt: Ingesting Entities
    data\05578.txt: Ingesting Relationships
    data\05578.txt: Processing DONE
  data\05579.txt: Reading File No. (81)
    data\05579.txt: Extracting Entities & Relationships
    data\05579.txt: Entity Extraction took 12.537258799886331secs
    data\05579.txt: Generating Cypher
    data\05579.txt: Ingesting Entities
    data\05579.txt: Ingesting Relationships
    data\05579.txt: Processing DONE
  data\05580.txt: Reading File No. (82)
    data\05580.txt: Extracting Entities & Relationships
    data\05580.txt: Entity Extraction took 12

In [31]:
failed_files

['data\\05514.txt', 'data\\05562.txt']

In [32]:
local_file_path="eng-to-cypher-trng.jsonl"

In [33]:
project_id

'resumeanz'

In [34]:
credentials

<google.oauth2.service_account.Credentials at 0x1fde0a08310>

In [37]:
client = storage.Client(credential=credentials)

NameError: name 'storage' is not defined

In [38]:
from google.cloud import storage
from timeit import default_timer as timer

bucket_name = project_id + '-genai'
client = storage.Client(credentials=credentials)
# lookup_bucket returns None when the bucket does not exist, so the bucket is
# only created in that case. The previous bare `except:` also treated
# permission errors, network failures and KeyboardInterrupt as "bucket
# missing" and went on to try to create it.
bucket = client.lookup_bucket(bucket_name)
if bucket is None:
    bucket = client.bucket(bucket_name)
    bucket.storage_class = 'STANDARD'
    bucket = client.create_bucket(bucket)

upload_name = f"eng-to-cypher-trng-{timer()}.jsonl" #this ensures vertexai reloads the file
filename = 'eng-to-cypher-trng.jsonl'
blob = bucket.blob(upload_name)
blob.upload_from_filename(filename)

In [39]:
# GCS URI of the JSONL training file uploaded in the previous cell (built from bucket_name and upload_name)
training_data = 'gs://' + bucket_name + '/' + upload_name
train_steps = 100

# Initialize Vertex AI with the project and location
vertexai.init(project=project_id,credentials=credentials)

# Load a pre-trained text generation model
model = TextGenerationModel.from_pretrained("text-bison@001")

# TODO: the fine-tuning call itself is missing from this cell; training_data and train_steps are prepared above but not used here.


In [40]:
tuned_model_names = model.list_tuned_model_names()
print(tuned_model_names)

['projects/209280395857/locations/us-central1/models/1718472902539476992']


In [41]:
 tuned_model_name="projects/209280395857/locations/us-central1/models/1718472902539476992"

In [42]:
def english_to_cypher_text_bison(prompt, tuned_model_name = ''):
    """Ask text-bison to translate an English question prompt into Cypher.

    The model is called through ``run_text_model`` with temperature 0.1, a
    1024-token output limit, top_p 0.95 and top_k 40. Any exception raised
    along the way is printed and swallowed, in which case ``None`` is returned.
    """
    try:
        return run_text_model(
            project_id,
            "text-bison@001",
            0.1,
            1024,
            0.95,
            40,
            prompt,
            location,
            tuned_model_name,
        )
    except Exception as err:
        print(err)
        return None

In [44]:
samples = """
Question: How many expert java developers attend more than one universities?
Answer: MATCH (p:Person)-[:HAS_SKILL]->(s:Skill), (p)-[:HAS_EDUCATION]->(e1:Education), (p)-[:HAS_EDUCATION]->(e2:Education) WHERE toLower(s.name) CONTAINS 'java' AND toLower(s.level) CONTAINS 'expert' AND e1.university <> e2.university RETURN COUNT(DISTINCT p)

Question: Where do most candidates get educated?
Answer: MATCH (p:Person)-[:HAS_EDUCATION]->(e:Education) RETURN e.university, count(e.university) as alumni ORDER BY alumni DESC LIMIT 1

Question: How many people have worked as a Data Scientist in San Francisco?
Answer: MATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE toLower(pos.title) CONTAINS 'data scientist' AND toLower(pos.location) CONTAINS 'san francisco' RETURN COUNT(p)
"""

In [45]:
prompt = """You are an expert Neo4j Cypher translator who understands the question in english and convert to Cypher strictly based on the Neo4j Schema provided and following the instructions below:
1. Generate Cypher query compatible ONLY for Neo4j Version 5
2. Do not use EXISTS, SIZE keywords in the cypher. Use alias when using the WITH keyword
3. Use only Nodes and relationships mentioned in the schema
4. Always enclose the Cypher output inside 3 backticks
5. Always do a case-insensitive and fuzzy search for any properties related search. Eg: to search for a Company name use `toLower(c.name) contains 'neo4j'`
6. Candidate node is synonymous to Person
7. Always use aliases to refer the node in the query
8. Cypher is NOT SQL. So, do not mix and match the syntaxes
Schema:
(:Person {label: 'Person', id: string, role: string, description: string})-[:HAS_POSITION {}]->(:Position {label: 'Position', id: string, title: string, location: string, startDate: string, endDate: string, url: string})
(:Position {label: 'Position', id: string, title: string, location: string, startDate: string, endDate: string, url: string})-[:AT_COMPANY {}]->(:Company {label:'Company', id: string, name: string})
(:Person {label: 'Person',id: string, role: string, description: string})-[:HAS_SKILL {}]->(:Skill {label:'Skill', id: string,name: string,level: string})
(:Person {label: 'Person',id: string, role: string, description: string})-[:HAS_EDUCATION {}]->(:Education {label:'Education', id: string, degree: string, university: string, graduationDate: string, score: string, url: string})
Samples:
$samples
Question: $question
Answer:
"""

In [46]:
tuned_model_name

'projects/209280395857/locations/us-central1/models/1718472902539476992'

In [47]:
from string import Template
que = 'How many are knowledgable on all of - java, python, javascript, security?'
_prompt = Template(prompt).substitute(samples=samples, question=que)

cypher = english_to_cypher_text_bison(_prompt, tuned_model_name) #for text-bison use: english_to_cypher_text_bison(_prompt, entity_extraction_tuned_model)
if 'Answer:\n ' in cypher:
    cypher = cypher.split('Answer:\n ')[1]
cypher = cypher.replace('\n', ' ')
cypher

"MATCH (c:Person)-[:HAS_SKILL]->(s1:Skill), (c)-[:HAS_SKILL]->(s2:Skill), (c)-[:HAS_SKILL]->(s3:Skill), (c)-[:HAS_SKILL]->(s4:Skill) WHERE toLower(s1.name) CONTAINS 'java' AND toLower(s2.name) CONTAINS 'python' AND toLower(s3.name) CONTAINS 'javascript' AND toLower(s4.name) CONTAINS 'security' RETURN COUNT(DISTINCT c)"

In [50]:
tuned_model_name

'projects/209280395857/locations/us-central1/models/1718472902539476992'

In [62]:
model = model.get_tuned_model(tuned_model_name)

In [63]:
model

<vertexai.preview.language_models._PreviewTextGenerationModel at 0x1fdf34d2920>

In [58]:
from google.cloud import aiplatform as aip
my_model = aip.Model("projects/209280395857/locations/us-central1/models/1718472902539476992")

In [59]:
my_model

<google.cloud.aiplatform.models.Model object at 0x000001FDF3331D20> 
resource name: projects/209280395857/locations/us-central1/models/1718472902539476992

In [81]:
model=model.get_tuned_model("projects/209280395857/locations/us-central1/models/1718472902539476992")

In [82]:
model

<vertexai.preview.language_models._PreviewTextGenerationModel at 0x1fdf7cebc70>

In [80]:
tuned_model_names = model.list_tuned_model_names()
print(tuned_model_names)

['projects/209280395857/locations/us-central1/models/1718472902539476992']


In [84]:
# VertexAI is imported here because this cell comes before the cell that
# imports the LangChain names; without it a fresh Restart & Run All fails
# with NameError on VertexAI.
from langchain.llms import VertexAI

llm = VertexAI(model_name="text-bison@001", 
               tuned_model_name="projects/209280395857/locations/us-central1/models/1718472902539476992",
               max_output_tokens=1024)

In [85]:
llm

VertexAI(model_name='text-bison@001', client=<vertexai.language_models.TextGenerationModel object at 0x000001FDF7CF5DB0>, client_preview=<vertexai.preview.language_models._PreviewTextGenerationModel object at 0x000001FDF7CF60B0>, max_output_tokens=1024, tuned_model_name='projects/209280395857/locations/us-central1/models/1718472902539476992')

In [86]:
import os
from getpass import getpass

from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
from langchain.llms import VertexAI
from langchain.prompts.prompt import PromptTemplate

# Prompt used by the chain to turn an English question into Cypher.
# `{schema}` and `{question}` are the two template variables declared below.
CYPHER_GENERATION_TEMPLATE = """You are an expert Neo4j Cypher translator who understands the question in english and convert to Cypher strictly based on the Neo4j Schema provided and following the instructions below:
1. Generate Cypher query compatible ONLY for Neo4j Version 5
2. Do not use EXISTS, SIZE keywords in the cypher. Use alias when using the WITH keyword
3. Use only Nodes and relationships mentioned in the schema
4. Always enclose the Cypher output inside 3 backticks
5. Always do a case-insensitive and fuzzy search for any properties related search. Eg: to search for a Company name use `toLower(c.name) contains 'neo4j'`
6. Candidate node is synonymous to Person
7. Always use aliases to refer the node in the query
8. Cypher is NOT SQL. So, do not mix and match the syntaxes
Schema:
{schema}
Samples:
Question: How many expert java developers attend more than one universities?
Answer: MATCH (p:Person)-[:HAS_SKILL]->(s:Skill), (p)-[:HAS_EDUCATION]->(e1:Education), (p)-[:HAS_EDUCATION]->(e2:Education) WHERE toLower(s.name) CONTAINS 'java' AND toLower(s.level) CONTAINS 'expert' AND e1.university <> e2.university RETURN COUNT(DISTINCT p)
Question: Where do most candidates get educated?
Answer: MATCH (p:Person)-[:HAS_EDUCATION]->(e:Education) RETURN e.university, count(e.university) as alumni ORDER BY alumni DESC LIMIT 1
Question: How many people have worked as a Data Scientist in San Francisco?
Answer: MATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE toLower(pos.title) CONTAINS 'data scientist' AND toLower(pos.location) CONTAINS 'san francisco' RETURN COUNT(p)
Question: {question}
Answer:
"""

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

# Neo4j connection settings. The URL and username keep their previous values as defaults
# but can be overridden through the environment. The password is never stored in the
# notebook: it is read from NEO4J_PASSWORD, or prompted for interactively when unset.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Question-answering chain over the graph: uses `llm` (defined in an earlier cell) with the
# custom Cypher prompt, logs verbosely, and returns the intermediate steps alongside the result.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    return_intermediate_steps=True,
)

In [93]:
# Run the chain on one English question; the returned mapping is kept in `r`
# so its 'intermediate_steps' and 'result' entries can be printed in the next cell.
r = chain("""Which people have held a position in London with a start date in 2019""")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE toLower(pos.location) CONTAINS 'london' AND pos.startDate CONTAINS '2019' RETURN p[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


In [94]:
# Show the chain's intermediate steps and its final answer from the output held in `r`.
print(f"Intermediate steps: {r['intermediate_steps']}")
print(f"Final answer: {r['result']}")

Intermediate steps: [{'query': "MATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE toLower(pos.location) CONTAINS 'london' AND pos.startDate CONTAINS '2019' RETURN p"}, {'context': []}]
Final answer: I don't know the answer.


In [90]:
import traceback

import gradio as gr
from langchain.memory import ConversationBufferMemory

# NOTE(review): neither `memory` nor the `llm` re-bound below is passed to `chain` in this
# cell, so the chatbot keeps using the chain exactly as it was built earlier. Both names are
# kept because later cells may rely on them.
memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
llm = VertexAI(model_name='code-bison',
            max_output_tokens=2048,
            temperature=0,
            top_p=0.95,
            top_k=40)  # top_k is an integer number of candidate tokens (was the float 0.40)
agent_chain = chain


def chat_response(input_text):
    """Answer one chatbot question by running it through `agent_chain`.

    Args:
        input_text: The question typed into the Gradio text box.

    Returns:
        The chain's answer as text. For blank input a short prompt to enter a question is
        returned, and if the chain raises (for example when the generated Cypher is rejected
        by the database) a short apology naming the error type is returned instead of
        letting the exception reach the UI.
    """
    question = (input_text or "").strip()
    if not question:
        return "Please enter a question."
    try:
        return agent_chain.run(question)
    except Exception as exc:
        # Keep the full traceback in the notebook output for debugging, but give the
        # user a readable message rather than a failed request.
        traceback.print_exc()
        return f"Sorry, I could not answer that question ({type(exc).__name__})."


interface = gr.Interface(fn = chat_response, inputs = "text", outputs = "text", 
                         description = "Talent Finder Chatbot")

# WARNING(review): share=True publishes a public URL through which anyone can trigger
# LLM-generated Cypher against the connected database. Confirm this exposure is intended.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://c957c4342850b1e897.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)






[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE pos.endDate > '' RETURN COUNT(p)[0m
Full Context:
[32;1m[1;3m[{'COUNT(p)': 286}][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE pos.endDate > '' AND pos.endDate < date() + INTERVAL 5 YEAR RETURN COUNT(p)[0m


Traceback (most recent call last):
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\langchain_community\graphs\neo4j_graph.py", line 160, in query
    data = session.run(Query(text=query, timeout=self.timeout), params)
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\neo4j\_sync\work\session.py", line 313, in run
    self._auto_result._run(
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\neo4j\_sync\work\result.py", line 181, in _run
    self._attach()
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\neo4j\_sync\work\result.py", line 301, in _attach
    self._connection.fetch_message()
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\neo4j\_sync\io\_common.py", line 178, in inner
    func(*args, **kwargs)
  File "C:\Users\Pratik's Predator\AppData\Roaming\Python\Python310\site-packages\neo4j\_sync\io\_bolt.py", line 849, i



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:HAS_POSITION]->(pos:Position) WHERE pos.startDate <> '' AND pos.endDate <> '' RETURN COUNT(p)[0m
Full Context:
[32;1m[1;3m[{'COUNT(p)': 286}][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:HAS_SKILL]->(s:Skill), (p)-[:HAS_EDUCATION]->(e1:Education), (p)-[:HAS_EDUCATION]->(e2:Education) WHERE toLower(s.name) CONTAINS 'java' AND e1.university <> e2.university RETURN COUNT(DISTINCT p)[0m
Full Context:
[32;1m[1;3m[{'COUNT(DISTINCT p)': 19}][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (pos:Position) WHERE toLower(pos.location) CONTAINS 'new york city' AND pos.startDate CONTAINS '2020' RETURN pos[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated C