In [None]:
!pip install spacy
!pip install -qU pypdf2
!pip install langchain
!pip install langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive/; the dataset paths used below live under
# /content/drive/MyDrive. `google.colab` is only importable inside a Colab runtime.
drive.mount("/content/drive/") # load dataset drive

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import re
import os

In [None]:
# Project folders on the mounted Google Drive.
# The sub-folders are derived from `base_path` so that moving the project only
# requires changing a single line.
base_path = "/content/drive/MyDrive/Tubes_NLP"
dataset_path = f"{base_path}/Datasets"
test_dataset_path = f"{base_path}/Test Datasets"

In [None]:
file_path = "/content/drive/MyDrive/Tubes_NLP/Datasets/hydrogen-peroxide-30-solution-food-grade-kosher-safety-data-sheet-67053155865ad.pdf"
loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

# NER Method

## Rule Based

In [None]:
def rule_based_ner(text):
    """
    Extract entities from ``text`` with a small set of regular expressions.

    Three labels are produced:

    * ``CHEMICAL_NAME`` - the run of word characters, hyphens and whitespace
      that follows "product name", "name" or "trade name".
    * ``COMPANY_NAME``  - the run of word characters, hyphens, commas and
      whitespace that follows "company".
    * ``CAS_NO``        - a number shaped like ``<2-7 digits>-<2 digits>-<1 digit>``.

    Matching is case-insensitive but is performed on the ORIGINAL text, so the
    returned entity text keeps its casing and ``start``/``end`` are valid
    character offsets into ``text`` itself (``text[start:end] == entity["text"]``).

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with the keys ``"text"``, ``"label"``, ``"start"`` and
        ``"end"``, grouped by label in the order listed above and, within a
        label, in order of appearance.
    """
    patterns = {
        "CHEMICAL_NAME": r"(?:product name|name|trade name)\s*([\w\-\s]+)",
        "COMPANY_NAME": r"(?:company)\s*([\w\-\,\s]+)",
        "CAS_NO": r"(\d{2,7}-\d{2}-\d)",
    }

    entities = []
    for label, pattern in patterns.items():
        # re.IGNORECASE replaces the former text.lower(): lower-casing threw away
        # the original casing of the entity and can change the string length for
        # some Unicode characters, which shifts the reported offsets.
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            captured = match.group(1)
            value = captured.strip()
            if not value:
                # The capture classes accept whitespace, so e.g. "name : x"
                # yields a whitespace-only capture; that is not an entity.
                continue
            # Move the offsets inwards so they describe the trimmed value.
            start = match.start(1) + (len(captured) - len(captured.lstrip()))
            entities.append({
                "text": value,
                "label": label,
                "start": start,
                "end": start + len(value)
            })
    return entities

In [None]:

def extract_text_from_first_page(pdf_path):
    """
    Extract text from the first page of a PDF file using LangChain's PyPDFLoader.

    Only the first ``Document`` yielded by the loader is read. The previous
    ``load_and_split()`` call parsed the whole file and ran a text splitter over
    it, so its first element was the first *chunk* rather than the first page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of the first page, or ``""`` when the loader yields no page.
    """
    loader = PyPDFLoader(pdf_path)
    page_iter = iter(loader.lazy_load())
    try:
        first_page = next(page_iter, None)
    finally:
        # Stop the lazy loader right away instead of leaving it suspended
        # after the first page.
        close = getattr(page_iter, "close", None)
        if close is not None:
            close()
    if first_page is None:
        return ""
    return first_page.page_content
def process_folder(folder_path, output_excel):
    """
    Process all PDF files in a folder, perform NER, and save results to an Excel file.

    For every PDF directly inside ``folder_path`` the text returned by
    ``extract_text_from_first_page`` is passed to ``rule_based_ner`` and one
    row per entity is written.

    Args:
        folder_path: Directory to scan (not recursive).
        output_excel: Path of the Excel file to write.

    The sheet always has the columns 'File Name', 'Entity', 'Label', 'Start'
    and 'End', even when no entity was found.
    """
    columns = ["File Name", "Entity", "Label", "Start", "End"]
    data = []

    # os.listdir returns names in arbitrary order; sorting keeps the output
    # stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive check so that e.g. "SHEET.PDF" is not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, file_name)
        text = extract_text_from_first_page(pdf_path)
        if not text:
            continue

        for entity in rule_based_ner(text):
            data.append({
                "File Name": file_name,
                "Entity": entity["text"],
                "Label": entity["label"],
                "Start": entity["start"],
                "End": entity["end"]
            })

    # Save results to an Excel file; explicit columns keep the header row
    # present when `data` is empty.
    df = pd.DataFrame(data, columns=columns)
    df.to_excel(output_excel, index=False)

# Workbook that receives the rule-based results; it is written into the
# project folder on Drive.
output_excel = "output_entities_rule.xlsx"
rule_output_path = os.path.join(base_path, output_excel)

# Run the extraction over the test PDFs, then report where the results went.
process_folder(test_dataset_path, rule_output_path)
print(f"Results saved to {rule_output_path}")


Results saved to /content/drive/MyDrive/Tubes_NLP/output_entities_rule.xlsx


## Machine Learning Based

In [None]:
# prompt: with spaCy, create and train an NER model for finding cas_no, chemical_name, and formula. We will use the company NER that they provided

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

# Assuming you have a DataFrame 'df' with 'File Name', 'Entity', 'Label', 'Start', 'End' columns
# Replace this with your actual data loading process
# Example using the output_entities_rule.xlsx file created by your previous code:
df = pd.read_excel(os.path.join(base_path, "output_entities_rule.xlsx"))


for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    text = extract_text_from_first_page(os.path.join(test_dataset_path, row['File Name']))
    doc = nlp.make_doc(text)
    ents = []
    try:
      start = row["Start"]
      end = row["End"]
      span = doc.char_span(start, end, label=row['Label'], alignment_mode="expand")
      if span is None:
        print("Skipping entity:", row['Entity'], "in file:", row['File Name'])
        continue
      ents.append(span)
      doc.ents = ents
      db.add(doc)
    except Exception as e:
      print("Error processing entity:", row['Entity'], "in file:", row['File Name'], "Error:", e)
      continue

db.to_disk("./train.spacy") # save the docbin object

!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

100%|██████████| 20/20 [00:13<00:00,  1.45it/s]


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00 