In [1]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid
import json
import ollama.client as client



# Chunking configuration: chunks of up to 800 units with a 100-unit overlap,
# where length is measured with the built-in `len`, and separators are
# treated as literal strings rather than regular expressions.
splitter_options = {
    "chunk_size": 800,
    "chunk_overlap": 100,
    "length_function": len,
    "is_separator_regex": False,
}
splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [2]:
from transformers import pipeline

## Token-classification (NER) pipeline; "simple" aggregation merges word
## pieces so each result carries a 'word' and an 'entity_group'.
## Alternative XLM-RoBERTa checkpoint, kept for reference:
# ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
ner = pipeline(
    "token-classification",
    model="dslim/bert-large-NER",
    aggregation_strategy="simple",
)

## Report the model size in millions of parameters.
param_count_mn = ner.model.num_parameters() / 1000000
print(f"Number of parameters -> {param_count_mn} Mn")


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Number of parameters -> 332.538889 Mn


In [4]:
def row2NamedEntities(row):
    """Run the module-level `ner` pipeline on one row and list its entities.

    `row` must support lookups for 'text' and 'chunk_id'. Each returned dict
    has the keys 'name' (the result's 'word'), 'entity' (the result's
    'entity_group') and 'chunk_id' (copied from the row).
    """
    detections = ner(row['text'])
    chunk_id = row['chunk_id']
    return [
        {'name': hit['word'], 'entity': hit['entity_group'], 'chunk_id': chunk_id}
        for hit in detections
    ]

def dfText2DfNE(dataframe):
    """Extract named entities from every row and count them per chunk.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Parsed chunks; must have a 'text' and a 'chunk_id' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'entity', 'chunk_id' and 'count', with one row per
        distinct (name, entity, chunk_id) combination. When the input has no
        rows, or no entity is found in any row, an empty frame with these
        same columns is returned instead of raising.
    """
    result_columns = ['name', 'entity', 'chunk_id', 'count']

    def _empty_result():
        # Same column layout as the populated result, with an integer count.
        return pd.DataFrame(columns=result_columns).astype({'count': 'int64'})

    ## Nothing to tag.
    if len(dataframe) == 0:
        return _empty_result()

    ## 1. Calculate named entities for each row of the dataframe
    ##    (plain row-wise DataFrame.apply).
    results = dataframe.apply(row2NamedEntities, axis=1)

    ## 2. Flatten the per-row lists into one single list of entities.
    entities_list = [entity for row_entities in results for entity in row_entities]
    if not entities_list:
        return _empty_result()

    ## 3. Treat blank (' ') values as missing and drop rows without an entity label.
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## 4. Count the number of occurrences per chunk id.
    entities_dataframe = (
        entities_dataframe
        .groupby(['name', 'entity', 'chunk_id'])
        .size()
        .reset_index(name='count')
    )

    return entities_dataframe

In [7]:
# Source PDF, relative to the notebook's working directory. The recorded run
# of this cell failed with "ValueError: File path ... is not a valid file or
# url", so make sure the file exists at this path before running.
pdf_path = "./data/GlobalPublicHealth2022.pdf"
loader = PyPDFLoader(pdf_path)
# Directory-based alternative:
# loader = PyPDFDirectoryLoader("./data/kesy1dd")

# Load the document and cut it into chunks with the configured splitter.
pages = loader.load_and_split(text_splitter=splitter)
len(pages)


ValueError: File path ./data/GlobalPublicHealth2022.pdf is not a valid file or url

In [135]:

# One record per chunk: its text, the loader-supplied metadata, and a fresh
# random hex id that identifies the chunk downstream.
rows = [
    {'text': page.page_content, **page.metadata, 'chunk_id': uuid.uuid4().hex}
    for page in pages
]

df = pd.DataFrame(rows)


In [136]:
# Run NER over every chunk in `df`; the result has one row per
# (name, entity, chunk_id) with its occurrence count.
dfne = dfText2DfNE(df)


In [137]:
# Collapse the per-chunk counts into one row per (name, entity): total count
# plus a comma-separated list of the chunk ids the entity appears in.
df_ne = (
    dfne
    .groupby(['name', 'entity'])
    .agg(count=('count', 'sum'), chunk_id=('chunk_id', ','.join))
    .reset_index()
)
# Show the 100 most frequent entities; reset_index() keeps the previous row
# positions as an 'index' column.
df_ne.sort_values('count', ascending=False).head(100).reset_index()

Unnamed: 0,index,name,entity,count,chunk_id
0,486,WHO,ORG,228,"02db3a55557341d8ba3851dedd6223ed,077b1c4c99064..."
1,390,Region,MISC,72,"07a0744df31b4bfeaa18460091a48c6f,082f7b2fd6794..."
2,384,R,MISC,57,"088ecaeee2804c10bb2a8eb885930949,1a1455b89c444..."
3,160,E,MISC,37,"045bd2ce17da4b839e1198a6603c3b34,0a342463219b4..."
4,161,E,ORG,36,"02db3a55557341d8ba3851dedd6223ed,0848d9f456f04..."
...,...,...,...,...,...
95,195,Event Management System,MISC,3,"15113bcf8e2f4018a589268bf2e67852,650b458e34044..."
96,124,China,LOC,3,7a56d66c7a9a4b249c4b5e7a9ad1bcd0
97,131,Congo,LOC,3,"045bd2ce17da4b839e1198a6603c3b34,d510aaa685194..."
98,440,South -,MISC,3,"66faf6d1a4334bf69bd0300b7653096c,fb187a00ea554..."


In [140]:
# Inspect the raw text of a single chunk (index 12) as produced by the splitter.
pages[12].page_content

'assesses the likelihood and consequences of an acute public health threat due to exposure \nto an identified hazard. RRA involves a joint assessment by the WHO Country and Regional \nOffices and headquarters. It is conducted for events with serious public health implications \nfollowing pre-defined criteria within WHO. The RRA process provides a forum for the \ntimely assessment of available data, which takes into account the contextual and hazard-\nspecific knowledge and feedback of key experts across WHO. It supports a collaborative \nexpert prioritization of immediate actions in a time-sensitive manner. The finalized RRA \nreport may be shared with key stakeholders that could contribute to the response. RRA \nreports have become well accepted documents of high practical value both within WHO'

In [184]:


def extractConcepts(prompt: str, model='mistral-openorca:latest'):
    """Ask an LLM to list the key entities in `prompt` and parse its JSON reply.

    Parameters
    ----------
    prompt : str
        The text to extract entities from; sent as the user prompt.
    model : str
        Model name passed to `client.generate` as `model_name`.

    Returns
    -------
    The decoded JSON value. The system prompt asks for a list of objects with
    the keys "entity", "importance" and "type", but the model's reply is not
    validated against that structure.

    Raises
    ------
    json.JSONDecodeError
        If the reply is not valid JSON and no parseable ``[...]`` span can be
        recovered from it.
    """
    SYS_PROMPT = (
        "Your task is to extract the key entities mentioned in the users input.\n"
        "Entities may include - event, concept, person, place, object, document, organisation, artifact, misc, etc.\n"
        "Format your output as a list of json with the following structure.\n"
        "[{\n"
        "   \"entity\": The Entity string\n"
        "   \"importance\": How important is the entity given the context on a scale of 1 to 5, 5 being the highest.\n"
        "   \"type\": Type of entity\n"
        "}, { }]"
    )
    # client.generate returns a (response, context) pair; only the response text is used.
    response, _context = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        # Models sometimes wrap the list in prose or a markdown fence; retry on
        # the outermost bracketed span before giving up.
        start, end = response.find('['), response.rfind(']')
        if start == -1 or end <= start:
            raise
        return json.loads(response[start:end + 1])


In [185]:
# Extract entities from one chunk (index 22) with the default model; the
# parsed JSON reply is kept in `res`.
res = extractConcepts(prompt = pages[22].page_content)

 [
   {
      "entity": "acute public health events",
      "importance": 4,
      "type": "event"
   },
   {
      "entity": "infectious diseases",
      "importance": 3,
      "type": "concept"
   },
   {
      "entity": "disasters",
      "importance": 2,
      "type": "concept"
   },
   {
      "entity": "EMS",
      "importance": 3,
      "type": "document"
   },
   {
      "entity": "RRA reports",
      "importance": 2,
      "type": "document"
   },
   {
      "entity": "EIS bulletins",
      "importance": 2,
      "type": "document"
   },
   {
      "entity": "DON reports",
      "importance": 2,
      "type": "document"
   },
   {
      "entity": "WHO Regions",
      "importance": 3,
      "type": "organization"
   },
   {
      "entity": "IHR (2005) framework",
      "importance": 4,
      "type": "concept"
   },
   {
      "entity": "WHO",
      "importance": 4,
      "type": "organization"
   }
]

In [168]:
# NOTE(review): this cell's execution count (168) is lower than that of the
# extractConcepts cells (184/185), and the displayed dicts have no "type" key,
# so the saved output looks stale. Re-run to refresh it.
res

[{'entity': 'infectious diseases', 'importance': 4},
 {'entity': 'disasters', 'importance': 3},
 {'entity': 'EMS', 'importance': 3},
 {'entity': 'RRA reports', 'importance': 2},
 {'entity': 'EIS bulletins', 'importance': 1.5},
 {'entity': 'DON reports', 'importance': 1},
 {'entity': 'WHO Regions', 'importance': 2},
 {'entity': 'IHR (2005) framework', 'importance': 3}]