In [None]:

response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "patent_inventors",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "inventors": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "name": {
                                            "type": ["string", "null"],
                                            "description": "The name of the inventor. For example, 'Acle', 'Hempnall', 'Tibenham', 'Blakeney'. If you don't know, return null."
                                        },
                                        "sex":{
                                            "type": ["boolean", "null"],
                                            "description": "The inferred sex of the inventor. 0 for male, 1 for female. Try to guess from the first name, for example, 'Bob Dy' is 0, 'Marlyn' is 1"
                                        },
                                        "occupation": {
                                            "type": ["string"],
                                            "description": "Summarise the text and then infer the occupation of the inventor (Historical International Standard Classification of Occupations (HISCO) coding scheme). For example, 'Industrial machinery or tools engineers', 'Engineering technicians nec', 'Ships officers nfs', 'Newsvendors'."
                                        },
                                        "industry": {
                                            "type": ["string"],
                                            "description": "The industry of the inventor's occupation belongs to (according to Industry 1950 basis, US). For example, 'Aircraft and parts', 'Retail florists', 'Postal service', 'Lady/Man of leisure', 'Common or General laborer'."
                                        },
                                            
                                        "address": {
                                            "type": "object",
                                            "properties": {
                                                "street": {
                                                    "type": ["string", "null"],
                                                    "description": "Street address of the inventor. More detailed than city. For example, Kennett Square, Elm Ave, Maple Drive, Oak Lane. If you don't know, just leave blank"
                                                },
                                                "city": {
                                                    "type": ["string", "null"],
                                                    "description": "City of the inventor. For example: ['Attleboro, MA', ' Cincinnati, OH', 'Long Island City, NY']"
                                                },
                                                "county": {
                                                    "type": ["string", "null"],
                                                    "description": "County of the inventor. More detailed than state, but less than city"
                                                },
                                                "state": {
                                                        "type": ["string", "null"],
                                                        "enum": [
                                                            "Alabama", "Illinois", "District of Columbia", "Maine", "Michigan",
                                                            "Iowa", "Missouri", "New York", "Ohio", "Pennsylvania", "Florida",
                                                            "Texas", "Wisconsin", "Kansas", "Maryland", "Georgia", "Indiana",
                                                            "Utah", "Massachusetts", "Kentucky", "Vermont", "North Carolina",
                                                            "Arizona", "Arkansas", "Montana", "Nebraska", "Minnesota",
                                                            "Virginia", "Oregon", "Nevada", "New Hampshire", "Idaho",
                                                            "Rhode Island", "California", "New Jersey", "Mississippi",
                                                            "South Carolina", "Louisiana", "Colorado", "Washington",
                                                            "West Virginia", "Tennessee", "Connecticut", "New Mexico",
                                                            "South Dakota", "North Dakota", "Delaware"
                                                        ],
                                                        "description": "State of the inventor."
                                                },
                                                "region": {
                                                    "type": ["string", "null"],
                                                    "description": "Census region and division of the inventor. For example: ['New England Division, 'Middle Atlantic Division', 'East North Central Division', 'West North Central Division', 'South Atlantic Division', 'East South Central Division', 'West South Central Division', 'Mountain Division',  'Pacific Divisio']."
                                                },
                                                "foreign": {
                                                    "type": ["boolean", "null"],
                                                    "description": "True indicates froeign to the US, otherwise False."
                                                },
                                                "full_address": {
                                                    "type": ["string", "null"],
                                                    "description": "The full address string."
                                                },
                                            },
                                            "required": ["street", "county", "city", "state", "region", "foreign", "full_address"],
                                            "additionalProperties": False
                                        }
                                    },
                                    "required": ["name", "occupation", "industry", "address"],
                                    "additionalProperties": False
                                }
                            }
                        },
                        "required": ["inventors"],
                        "additionalProperties": False
                    }
                }
            }
str(response_format)

In [None]:

patent_content = 'Application filed March 20, 1889. Serial No. 303,951. (No model.) To all whom, lb may concern: Be it known that we, George Bensel and Otto Theodore Maier, both of New Orleans, in the parish of Orleans and State of Louis5 iana, have invented a new and Improved Fastening for Doors and Shutters, of which the following is a full, clear, and exact description. The object of the invention is to provide a io new and improved fastening for doors and shutters which permits of opening the doors or shutters from the outside in case of lire. The invention consists of a pin orpins supporting the lock and p'

sys_mess = f'''
You are an expert in information extraction. You have been asked to extract all information related to the name, occupation, and address for inventors in a patent document. The text is the OCR result of historical patents in the US.

Please adhere to the following instructions:
1. Return a JSON object for each inventor following the response format specified. Ensure each inventor has their properties assigned correctly.
2. Try to populate every property for 'address' (e.g., street, city, state). If any fields are missing, leave them as blank strings (""). Do not guess or misassign values.
3. Summarize the text and infer 'occupation' and 'industry' fields using the Historical International Standard Classification of Occupations (HISCO) coding scheme.

Please extract the following details for each inventor in JSON format. If any information is missing, leave it blank. 
Here is the schema for JSON structure:
{response_format}
'''

user_input = f'''
Now, based on the following patent description, extract the relevant details:
{patent_content}
'''

message = sys_mess + user_input
message

In [None]:
patent_content

In [None]:
message


## Llama3_1
Gated, need huggingface logging in

Network error, trying caching to local

### Downloading

In [None]:
# !pip install huggingface-hub
# !HUGGINGFACE_TOKEN="your_token" huggingface-cli login
# !huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct --cache-dir ../model/Llama-3.1-8B-Instruct

### Pred

In [None]:
from huggingface_hub import login
import os

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv)
os.environ["HUGGINGFACE_API_KEY"] = os.getenv("HUGGINGFACE_API_KEY")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
# Log in programmatically with your token

login(token=HUGGINGFACE_API_KEY)

In [None]:
# !pip install transformers --upgrade
%%timeit
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-2-7b-hf"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    trust_remote_code=True,  # Add this line

)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])


In [9]:
import transformers
import torch

# Replace with your local model path
model_id = "../model/Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    trust_remote_code=True,
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"])


OSError: ../model/Llama-3.1-8B-Instruct does not appear to have a file named config.json. Checkout 'https://huggingface.co/../model/Llama-3.1-8B-Instruct/tree/None' for available files.

In [None]:
import transformers
import torch
import multiprocessing as mp
import os

# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "../model/Llama-3.1-8B-Instruct/models--meta-llama--Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


def generate_text(prompt, sys_mess, user_input):
    messages = [
    {"role": "system", "content": sys_mess},
    {"role": "user", "content": user_input},
    ]

    outputs = pipeline(
    messages,
    max_new_tokens=2048,
    )
    return outputs[0]["generated_text"][-1]

cpu_counts= os.cpu_count()
# Function to parallelize model inference across multiple prompts
def parallel_inference(prompts, num_workers=64):
    with mp.Pool(processes=num_workers) as pool:
        results = pool.star_map(generate_text, prompts, sys_mess, user_input)
    return results
    
# messages = [
#     {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#     {"role": "user", "content": "Who are you?"},
# ]

## GPTNeo

In [None]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")


## T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids

sequence_ids = model.generate(input_ids)
sequences = tokenizer.batch_decode(sequence_ids)
sequences

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

# task_prefix = "translate English to German: "
# # use different length sentences to test batching
# sentences = ["The house is wonderful.", "I like to work in NYC."]
# inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

# output_sequences = model.generate(
#     input_ids=inputs["input_ids"],
#     attention_mask=inputs["attention_mask"],
#     do_sample=False,  # disable sampling to test if batching affects output
# )

# print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

In [None]:
task_prefix = sys_mess
# use different length sentences to test batching
sentence = user_input

inputs = tokenizer(task_prefix + sentence, return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
    max_length=2048
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))