# Using OpenAI Metadata tagger

In [1]:
import pandas as pd
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()

import os
import json

In [2]:
raw_dr_ntu_dir='./raw_dr_ntu'
raw_faculty_db = 'scse_profile'

process_faculty_db_dir ='./processed'
process_faculty_db = 'scse_profile'
process_co_author_db = 'google_scholar_co_author'

raw_google_scholar_dir = './raw_google_scholar'
raw_google_search_dir = './google_search'

process_publications_dir = './processed_google_scholar_publications'

research_interest_dir = './research_interest'

education_output_dir = './dr_ntu_education'
os.makedirs(education_output_dir, exist_ok=True)

In [3]:
faculties = pd.read_csv(os.path.join(process_faculty_db_dir, process_faculty_db+'.csv'))
google_scholar_faculties = faculties[faculties['google_scholar'].notna()]

In [4]:
schema = {
    "properties": {
        "bachelor_degree": {"type": "string", "description": "School where bachelors degree was done. Set as 'None' if information is not available"},
        "masters": {"type": "string", "description": "School where masters degree was done. Set as 'None' if information is not available"},
        "phd": {"type": "string", "description": "School where phD was done. Set as 'None' if information is not available"}
    },
    "required": ["bachelor_degree", "masters", "phd"],
}


llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

In [5]:
original_documents = []

for i, row in tqdm(faculties.iterrows()):
    name, id = row['full_name'], row['dr_ntu_id']

    dr_ntu = f"{raw_dr_ntu_dir}/{id}.json"
    
    if os.path.exists(dr_ntu):
        with open(dr_ntu, 'r') as f:
            dr_ntu_profile = json.load(f)


        doc = Document(
            page_content=dr_ntu_profile['biography'],
            metadata={'name': name, 'id': id}
        )
        original_documents.append(doc)

enhanced_documents = document_transformer.transform_documents(original_documents)

for doc in enhanced_documents:
    metadata = doc.metadata
    name = metadata['name']
    id = metadata['id']
    
    for k,v in metadata.items():
        if v=='None':
            metadata[k] = None
    
    with open(f'{education_output_dir}/{id}.json', 'w') as f:
        json.dump(metadata, f)

86it [00:00, 5007.64it/s]


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
