In [1]:
import json

import langchain
from dotenv import load_dotenv
load_dotenv()

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field, validator
from typing import List, Dict

In [2]:
# LLM configuration used by the extraction cells further down, which call
# `model(...)` with a rendered prompt string.
model_name = "gpt-3.5-turbo-instruct"
# model_name = "text-davinci-003"  # alternative model, currently disabled
# NOTE(review): temperature 0.0 is presumably chosen for repeatable output — confirm.
temperature = 0.0
model = OpenAI(model_name=model_name, temperature=temperature)

In [3]:
# Target schema for the education-extraction prompt. Every field is a required
# string, and each field's description is carried into the format instructions
# displayed at the end of this cell, so it doubles as guidance for the LLM.
# NOTE(review): documented with a comment rather than a class docstring so the
# generated schema stays exactly as shown — confirm whether pydantic would
# surface a docstring in the schema.
class Education(BaseModel):
    bachelor: str = Field(
        description="School where the bachelor's degree was obtained. Return 'None' if information cannot be found"
    )
    masters: str = Field(
        description="School where master's degree was obtained. Return 'None' if information cannot be found"
    )
    phd: str = Field(
        description="School where phD was obtained. Return 'None' if information cannot be found"
    )


# Parser bound to the Education schema; its format instructions are what the
# prompt cell injects into the template.
parser = PydanticOutputParser(pydantic_object=Education)
format_instruction = parser.get_format_instructions()

# Bare last expression so the notebook displays the instruction text.
format_instruction

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"bachelor": {"title": "Bachelor", "description": "School where the bachelor\'s degree was obtained. Return \'None\' if information cannot be found", "type": "string"}, "masters": {"title": "Masters", "description": "School where master\'s degree was obtained. Return \'None\' if information cannot be found", "type": "string"}, "phd": {"title": "Phd", "description": "School where phD was obtained. Return \'None\' if information cannot be found", "type": "string"}}, "required": ["bachelor", "masters", "phd"]}\n```'

In [4]:
# Prompt for the education question. The format instructions are bound once
# here via `partial_variables`; only `biography` is supplied per call.
prompt = PromptTemplate(
    partial_variables={"format_instructions": format_instruction},
    input_variables=["biography"],
    template="""The following text is the biography of a professor.
    {biography}
    
    What is the education background of this professor?

    {format_instructions}

    The information returned must be found in the text provided.
    """,
)

In [5]:
# Load the profile JSON for Alexei Sourin. The encoding is pinned to UTF-8 so
# decoding does not depend on the platform's locale default.
with open('./raw_data/dr_ntu_alexei_sourin.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fill the {biography} slot of the currently bound `prompt` and send the
# rendered text to the LLM.
_input = prompt.format_prompt(biography=data['biography'])
output = model(_input.to_string())

# Print the raw completion text exactly as returned by the model.
print(output)


{"bachelor": "Moscow Engineering Physics Institute, Russia (MEPhI)", "masters": "Moscow Engineering Physics Institute, Russia (MEPhI)", "phd": "Moscow Engineering Physics Institute, Russia (MEPhI)"}


In [6]:
# Load the profile JSON for Sourav Saha Bhowmick. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's locale default.
with open('./raw_data/dr_ntu_sourav_saha_bhowmick.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fill the {biography} slot of the currently bound `prompt` and send the
# rendered text to the LLM.
_input = prompt.format_prompt(biography=data['biography'])
output = model(_input.to_string())
# Print the raw completion text exactly as returned by the model.
print(output)


{
    "bachelor": "None",
    "masters": "None",
    "phd": "None"
}


In [94]:
# Target schema for the working-experience prompt: a single required mapping
# from role title to the period in which the role was held.
class WorkingExperience(BaseModel):
    position: Dict[str, str] = Field(description="The key is the title of the role. The value is the period where the role was held.")


# The parser and its format instructions must exist BEFORE the prompt is built:
# `partial_variables` captures whatever string `format_instruction` holds at
# construction time. Building the prompt first would embed the instructions
# left over from the Education cell on a fresh top-to-bottom run.
parser = PydanticOutputParser(pydantic_object=WorkingExperience)
format_instruction = parser.get_format_instructions()

# Prompt for the working-experience question; only `biography` is supplied
# per call.
prompt = PromptTemplate(
    template="""The following text is the biography of a professor.
    {biography}
    
    What is the working experience of this professor?
    {format_instructions}
    """,
    input_variables=["biography"],
    partial_variables={"format_instructions": format_instruction}
)

# Bare last expression so the notebook displays the instruction text that the
# prompt now embeds.
format_instruction

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"position": {"title": "Position", "description": "The key is the title of the role. The value is the period where the role was held.", "type": "object", "additionalProperties": {"type": "string"}}}, "required": ["position"]}\n```'

In [95]:
# Load the profile JSON for Sourav Saha Bhowmick. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's locale default.
with open('./raw_data/dr_ntu_sourav_saha_bhowmick.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fill the {biography} slot of the currently bound `prompt` and send the
# rendered text to the LLM.
_input = prompt.format_prompt(biography=data['biography'])
output = model(_input.to_string())
# Print the raw completion text exactly as returned by the model.
print(output)


{"position": {"Computer Scientist and Educator": "Present", "Research Group Lead": "Present", "Visiting Associate Professor": "2007-2013", "Senior Visiting Professor": "2013"}}


In [96]:
# Load the profile JSON for Alexei Sourin. The encoding is pinned to UTF-8 so
# decoding does not depend on the platform's locale default.
with open('./raw_data/dr_ntu_alexei_sourin.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fill the {biography} slot of the currently bound `prompt` and send the
# rendered text to the LLM.
_input = prompt.format_prompt(biography=data['biography'])
output = model(_input.to_string())

# Print the raw completion text exactly as returned by the model.
print(output)


{"position": {"Associate Professor": "1993-1999, 2000-present", "Researcher": "1983-1993", "Chair": "2012-2018", "General and Program Chair": "various years"}}
