In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import gc

meta_df = pd.read_parquet("/kaggle/input/uspto-explainable-ai/patent_metadata.parquet")
sub_df = pd.read_csv("/kaggle/input/uspto-explainable-ai/sample_submission.csv").drop("query", axis=1)
meta_df = meta_df[meta_df["publication_number"].isin(sub_df["publication_number"])]

gc.collect()
meta_df.shape

(10, 5)

In [2]:
patent_df = pd.read_parquet("/kaggle/input/uspto-all-patents-after-1975/all_patents.parquet")
patent_df.shape

(9458171, 3)

In [3]:
meta_df = meta_df.merge(patent_df, on="publication_number").reset_index(drop=True)
meta_df["title"] = meta_df["title"].fillna("")
meta_df["abstract"] = meta_df["abstract"].fillna("")

del patent_df
gc.collect()

0

### Install dependencies for LLM inference

In [4]:
!pip install -q -U accelerate --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/

In [5]:
import whoosh_utils
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'
tokenizer = AutoTokenizer.from_pretrained(model_name) 

bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        #quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

Processing /kaggle/input/whoosh-wheel-2-7-4/Whoosh-2.7.4-py2.py3-none-any.whl
Installing collected packages: Whoosh
Successfully installed Whoosh-2.7.4


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [7]:
def gen_keyword(row):
    response_start = "Here are the keywords comma separated: "

    prompt = f'''
    
    Title: {row.title}
    
    Abstract: {row.abstract}
    
    Publication Number: {row.publication_number}
    
    CPC Codes: {row.cpc_codes}
    
    -------
    Based on the provided patent details, identify several keywords that can be used to search for similar patents in the database. The keywords should be specific, relevant, and comprehensive enough to cover the main aspects of the patent. Consider the technical field, main inventions, methods, apparatus, and any unique features mentioned in the title, abstract, and CPC codes.

    Your task is to generate a set of keywords that a patent professional could use to find patents closely related to the given patent. These keywords will help in constructing Boolean queries for patent search. Ensure the keywords are comma-separated.
    '''

    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response_start}
    ]

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
    model_inputs = model_inputs.to("cuda")
    model_inputs = model_inputs[:, :-2]
    
    generated_ids = model.generate(model_inputs, max_new_tokens=10, 
                                   pad_token_id=tokenizer.eos_token_id,
                                   do_sample=False, begin_suppress_tokens=[13, 28740])

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    answer = decoded[0].split(response_start.strip())[-1].strip()
    
    try:
        answer = str(eval(answer))
    except:
        pass
    
    return prompt, answer

answers = []

for i, row in tqdm(meta_df.iterrows(), total=meta_df.shape[0]):
    prompt, answer = gen_keyword(row)
    answers.append(answer)

  0%|          | 0/6 [00:00<?, ?it/s]2024-06-22 17:06:06.781450: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-22 17:06:06.781568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-22 17:06:06.883663: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|██████████| 6/6 [00:17<00:00,  2.84s/it]


In [8]:
import re
import whoosh

NUMBER_REGEX = re.compile(r'^(\d+|\d{1,3}(,\d{3})*)(\.\d+)?$')

class NumberFilter(whoosh.analysis.Filter):
    def __call__(self, tokens):
        for t in tokens:
            if not NUMBER_REGEX.match(t.text):
                yield t

BRS_STOPWORDS = ['an', 'are', 'by', 'for', 'if', 'into', 'is', 'no', 'not', 'of', 'on', 'such',
        'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will']

custom_analyzer = whoosh.analysis.StandardAnalyzer(stoplist=BRS_STOPWORDS) | NumberFilter()

In [9]:
queries = []
backup_query = "ti:device"

for i, row in tqdm(meta_df.iterrows()):
    cpc = row["cpc_codes"]
    pn = row["publication_number"]
    
    try:
        tokens = [token.text for token in custom_analyzer(answers[i])]
        tokens = list(set(tokens))
        
        if len(tokens) > 0:
            cpc  = "(" + " OR ".join(cpc[:15]) + ")"
            keywords = "(" + " OR ".join(tokens) + ")"
            query = f"detd:{keywords} AND cpc:{cpc}"
        else:
            # prevent scoring to hang when there is no valid keyword
            query = f"cpc:{cpc[0]}"

        n_tokens = whoosh_utils.count_query_tokens(query)
        if n_tokens >= 50:
            query = backup_query
    except:
        query = backup_query
    
    queries.append(query)

6it [00:00, 2727.41it/s]


In [10]:
meta_df["query"] = queries

meta_df.to_csv("submission.csv", index=False, columns=["publication_number", "query"])