### CLEANED CONTENT FILE CREATION

In [1]:
import re
import json
import os
import random
from collections import defaultdict

In [2]:
def extract_claim_info(claims):
    """Parse a claims XML string into a ``{"c-en-NNNN": text}`` mapping.

    Two normalisations are applied to the raw string before matching:
    ids written as ``c-en-01-NNNN`` are rewritten to ``c-en-NNNN``, and an
    empty ``num=""`` attribute is filled from the four-digit id beside it.

    Args:
        claims: String containing ``<claim id="c-en-..." num="...">`` elements.

    Returns:
        Dict keyed by ``c-en-`` plus the zero-padded (4 digits) claim number,
        in order of first appearance. Each value is the text gathered from the
        claim's ``<claim-text>`` elements with all remaining tags removed.
        When a claim number occurs more than once, the last occurrence wins.
    """
    claims = re.sub(r'c-en-01-(\d{4})', 'c-en-\\1', claims)
    claims = re.sub(r'(id="c-en-(\d{4})") num=""', r'\1 num="\2"', claims)
    # The optional "<clai" prefix means an opening tag whose first characters
    # are missing is still recognised.
    claim_regex = r'(?:<clai)?m id="c-en-(\d+)" num="(\d+)">(.*?)</claim>'

    claim_info = {}
    for _claim_id, claim_num, claim_content in re.findall(claim_regex, claims, re.DOTALL):
        claim_text = _join_claim_text(claim_content)
        # Drop whatever markup is left (formatting tags, claim references, ...).
        claim_text = re.sub(r'<[^>]+>', '', claim_text).strip()
        claim_info[f"c-en-{int(claim_num):04d}"] = claim_text
    return claim_info


def _join_claim_text(claim_content):
    """Return the text held by the ``<claim-text>`` elements of one claim body.

    Text between an opening ``<claim-text>`` and the next closing tag forms one
    piece; a nested opening tag inside a piece becomes a single space. Text
    that follows a nested closing tag while an outer ``<claim-text>`` is still
    open is kept as its own piece (a plain non-greedy regex match silently
    loses it). Text outside every ``<claim-text>`` is ignored. Pieces are
    joined with single spaces.
    """
    pieces = []
    current = None  # parts of the piece being collected, or None between pieces
    depth = 0       # number of <claim-text> elements currently open
    for token in re.split(r'(</?claim-text>)', claim_content):
        if token == '<claim-text>':
            depth += 1
            if current is None:
                current = []
            else:
                current.append(' ')
        elif token == '</claim-text>':
            depth = max(depth - 1, 0)
            if current is not None:
                pieces.append(''.join(current))
                current = None
        elif current is not None:
            current.append(token)
        elif depth > 0 and token.strip():
            # Trailing text of an enclosing <claim-text>.
            pieces.append(token.strip())
    return ' '.join(pieces)

In [3]:
def extract_paragraphs(description):
    """Map each ``<p id="pN">`` element of *description* to its plain text.

    Args:
        description: String containing ``<p id="p...">...</p>`` elements.

    Returns:
        Dict keyed by ``p`` plus the zero-padded (4 digits) paragraph number.
        Each value is the paragraph body with ``<figref>`` wrappers unwrapped,
        every other tag removed and surrounding whitespace stripped.
    """
    figref_wrapper = re.compile(r'<figref idref="[^"]+">(.*?)</figref>')
    any_tag = re.compile(r'<[^>]+>')

    paragraphs = {}
    for found in re.finditer(r'<p id="p(\d+)"[^>]*>(.*?)</p>', description, re.DOTALL):
        number, body = found.groups()
        # Keep the figure reference's own text, then strip any remaining markup.
        body = figref_wrapper.sub(r'\1', body)
        paragraphs[f"p{int(number):04d}"] = any_tag.sub('', body).strip()
    return paragraphs

In [4]:
def content_cleaner(title, description, abstract=None, claims=None):
    """Assemble the cleaned content dictionary for one patent.

    Args:
        title: Title string; newline characters are removed.
        description: Description markup passed to ``extract_paragraphs``.
        abstract: Optional abstract markup, stored under ``"pa01"``.
        claims: Optional claims markup passed to ``extract_claim_info``.

    Returns:
        Dict holding ``"title"``, then ``"pa01"`` (if an abstract was given),
        then the claim entries (if claims were given), then the paragraph
        entries.
    """
    cleaned = {"title": title.replace("\n", "")}

    if abstract:
        untagged = re.sub(r'<[^>]+>', '', abstract)
        # The first two characters of the stripped abstract are discarded.
        # NOTE(review): presumably a fixed prefix in the source data - confirm.
        cleaned["pa01"] = untagged.strip()[2:]

    if claims:
        cleaned.update(extract_claim_info(claims))

    cleaned.update(extract_paragraphs(description))
    return cleaned
    

In [11]:
def process_data(json_citing, q7_info, type_doc=None, cited=True, uncited_limit=8000):
    """Build cleaned-content records for the patents in *json_citing*.

    With ``cited=True`` a patent is kept only if its id
    (``application_number + application_category``) equals ``element[type_doc]``
    for some *q7_info* element whose ``Category_Cited`` is ``'X'`` or ``'A'``
    and whose ``Content_Cited`` is truthy; each id is kept at most once. If
    *type_doc* is not given, nothing can match and the result is empty.

    With ``cited=False`` every patent is kept (duplicates included) until
    *uncited_limit* records have been collected.

    Args:
        json_citing: Iterable of patent dicts with ``application_number``,
            ``application_category``, ``application_date``, ``title`` and
            ``description`` keys, and optionally ``abstract`` / ``claims``.
        q7_info: Iterable of dicts with ``Category_Cited``, ``Content_Cited``
            and the *type_doc* key. Only read when ``cited`` and ``type_doc``
            are both truthy.
        type_doc: Key of the *q7_info* elements that holds the id to match.
        cited: Whether to filter against *q7_info* (see above).
        uncited_limit: Maximum number of records in ``cited=False`` mode;
            ``None`` disables the cap.

    Returns:
        List of dicts with ``Application_Number``, ``Application_Date``,
        ``Application_Category`` and ``Content`` keys. The record count is
        also printed.
    """
    # Collect the eligible ids once so each patent costs one set lookup
    # rather than a full scan of q7_info.
    eligible_ids = set()
    if cited and type_doc:
        eligible_ids = {
            element[type_doc]
            for element in q7_info
            if element['Category_Cited'] in ('X', 'A') and element["Content_Cited"]
        }

    data = []
    seen_set = set()
    for patent in json_citing:
        application = patent["application_number"] + patent["application_category"]
        if cited and (application not in eligible_ids or application in seen_set):
            continue
        seen_set.add(application)
        data.append({
            "Application_Number": patent["application_number"],
            "Application_Date": patent["application_date"],
            "Application_Category": patent["application_category"],
            "Content": content_cleaner(
                patent["title"],
                patent["description"],
                abstract=patent.get("abstract"),
                claims=patent.get("claims"),
            ),
        })
        if not cited and uncited_limit is not None and len(data) >= uncited_limit:
            break
    print(f"Total number of patents processed: {len(data)}")
    return data

def open_json_file(path_to_json):
    """Load a JSON file and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        path_to_json: Path of the JSON file to read.

    Returns:
        The deserialised JSON content.
    """
    with open(path_to_json, 'r', encoding='utf-8') as file:
        return json.load(file)

def data_write(data, directory, path):
    """Serialise *data* as indented JSON to ``directory/path``.

    Args:
        data: JSON-serialisable object to write.
        directory: Target directory; created (with parents) if missing. An
            empty string means the current working directory.
        path: File name inside *directory*.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if directory:
        os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, path)
    with open(file_path, 'w', encoding='utf-8') as file:
        # indent=4 keeps the output human-readable.
        json.dump(data, file, indent=4)

def create_json(path, data1, data2=None):
    """Write one of the two cleaned-content dataset files.

    When *data2* is supplied, ``data1 + data2`` is written to the
    cited/uncited dataset file; otherwise *data1* alone is written to the
    citing dataset file.

    Args:
        path: Accepted for interface compatibility but not used; both output
            directories are fixed inside this function.
        data1: List of records (written first).
        data2: Optional list of records appended after *data1*. An empty list
            still selects the merged cited/uncited output, so *data1* is never
            written to the citing dataset file by accident.
    """
    if data2 is not None:
        merged_data = data1 + data2
        directory = '/bigstorage/DATASETS_JSON/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k'
        data_write(merged_data, directory, 'CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json')
    else:
        directory = '/bigstorage/DATASETS_JSON/Content_JSONs/Citing_2020_Cleaned_Content_12k'
        data_write(data1, directory, 'CLEANED_CONTENT_DATASET_citing_patents_2020.json')

def main():
    """Load the base datasets, clean them and write both content JSON files."""
    citing_out_dir = "/bigstorage/DATASETS_JSON/Content_JSONs/Citing_2020_Cleaned_Content_12k"
    cited_out_dir = "/bigstorage/DATASETS_JSON/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k"

    # Load every input first: citing, cited, uncited, then the Q7 records.
    citing_raw = open_json_file('/bigstorage/DATASETS_JSON/Base_JSONS/CONTENT_DATASET_citing_patents_2020.json')
    cited_raw = open_json_file('/bigstorage/DATASETS_JSON/Base_JSONS/CONTENT_DATASET_cited_patents_by_2020.json')
    uncited_raw = open_json_file('/bigstorage/DATASETS_JSON/Base_JSONS/CONTENT_DATASET_uncited_patents.json')
    q7_records = open_json_file('/bigstorage/DATASETS_JSON/Q7Info/Q7.json')

    # Citing and cited patents are filtered against Q7; uncited ones are not.
    citing_clean = process_data(citing_raw, q7_records, type_doc="ID_Citing")
    cited_clean = process_data(cited_raw, q7_records, type_doc="ID_Cited")
    uncited_clean = process_data(uncited_raw, q7_records, cited=False)

    create_json(citing_out_dir, citing_clean)
    create_json(cited_out_dir, cited_clean, uncited_clean)


if __name__ == "__main__":
    main()

Total number of patents processed: 12195


### CITATIONS TRAIN-TEST FILES CREATION 

In [5]:
def process_data_for_citations(json_citing, cat_type):
    """Select the citation records of one category and reduce them to tuples.

    A record is kept when its ``Category_Cited`` equals *cat_type* and both
    ``Claims_Text`` and ``Content_Cited`` are present and non-empty.

    Args:
        json_citing: Iterable of citation record dicts.
        cat_type: Category value to select.

    Returns:
        ``(rows, count)`` where each row is ``(ID_Citing, claim keys,
        ID_Cited, cited-content keys, Category_Cited)`` and ``count`` is the
        number of rows. Two summary lines are printed as well.
    """
    rows = [
        (
            record["ID_Citing"],
            list(record.get("Claims_Text").keys()),
            record["ID_Cited"],
            list(record.get("Content_Cited").keys()),
            record["Category_Cited"],
        )
        for record in json_citing
        if record["Category_Cited"] == cat_type
        and record.get("Claims_Text")
        and record.get("Content_Cited")
    ]
    selected = len(rows)
    # Both summary lines report the same figure: only selected records are counted.
    print(f"Total number of patents with type {cat_type}: {selected}")
    print(f"Total number of patents processed: {selected}")
    return rows, selected

In [6]:
def open_json_file(path_to_json):
    """Load a JSON file and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        path_to_json: Path of the JSON file to read.

    Returns:
        The deserialised JSON content.
    """
    with open(path_to_json, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the Q7 citation records and split them by citation category.
# path_to_new_json is defined here but not used in this cell.
path_to_new_json = "/bigstorage/DATASETS_JSON/Citing_2020_Cleaned_Content_12k/"
path_to_q7 = '/bigstorage/DATASETS_JSON/Q7Info/Q7.json'
json_citing = open_json_file(path_to_q7)

# Each result is the (rows, count) tuple returned by process_data_for_citations;
# "X" is processed before "A".
data_X, data_A = (
    process_data_for_citations(json_citing, category) for category in ("X", "A")
)

# Show the number of selected records per category, one per line.
print(data_X[1], data_A[1], sep="\n")
#create_json(data, path_to_json)

Total number of patents with type X: 3671
Total number of patents processed: 3671
Total number of patents with type A: 6189
Total number of patents processed: 6189
3671
6189


In [15]:
# Pool the rows of both categories (element 0 of each (rows, count) tuple).
mock_data = data_A[0] + data_X[0]
# NOTE: the shuffle is unseeded, so the resulting split differs between runs.
random.shuffle(mock_data)

# Group rows by citing id (row[0]) so that all rows of one citing document
# end up in the same split.
docs_by_id_citing = defaultdict(list)
for doc in mock_data:
    docs_by_id_citing[doc[0]].append(doc)

# Smallest groups first: they are the ones used to fill the test set.
sorted_groups = sorted(docs_by_id_citing.items(), key=lambda x: len(x[1]))

list_1 = []  # Train
list_2 = []  # Test

type_A_count_list_1 = 0
type_X_count_list_1 = 0

type_A_count_list_2 = 0
type_X_count_list_2 = 0

# Fill the test set with whole groups while it stays at or below 1000 rows;
# every group that would push it past 1000 goes to the train set.
for id_citing, docs in sorted_groups:
    group_type_A = sum(1 for row in docs if row[4] == "A")
    group_type_X = len(docs) - group_type_A  # everything that is not "A"
    if len(list_2) + len(docs) <= 1000:
        list_2.extend(docs)
        type_A_count_list_2 += group_type_A
        type_X_count_list_2 += group_type_X
    else:
        list_1.extend(docs)
        type_A_count_list_1 += group_type_A
        type_X_count_list_1 += group_type_X

# Guard the ratios: with 1000 rows or fewer in total the train set is empty.
train_total = len(list_1)

distribution_info = {
    "Total items": len(mock_data),
    "List 1 total items": len(list_1),
    "List 2 total items": len(list_2),
    "List 1 type A items": type_A_count_list_1,
    "List 1 type X items": type_X_count_list_1,
    "List 2 type A items": type_A_count_list_2,
    "List 2 type X items": type_X_count_list_2,
    "List 1 type A ratio": type_A_count_list_1 / train_total if train_total else 0.0,
    "List 1 type X ratio": type_X_count_list_1 / train_total if train_total else 0.0
}
distribution_info

6831
1000


{'Total items': 9860,
 'List 1 total items': 8860,
 'List 2 total items': 1000,
 'List 1 type A items': 5578,
 'List 1 type X items': 3282,
 'List 2 type A items': 611,
 'List 2 type X items': 389,
 'List 1 type A ratio': 0.6295711060948082,
 'List 1 type X ratio': 0.37042889390519185}

In [19]:
def create_json_citations(data, path, filename='Citation_Train.json'):
    """Write *data* as indented JSON into the citation dataset directory.

    Args:
        data: JSON-serialisable object to write (tuples are emitted as JSON
            arrays).
        path: Accepted for interface compatibility but not used; the output
            directory is fixed inside this function.
        filename: Name of the file to create inside the output directory.
            Defaults to the training-split file name.
    """
    directory = "/bigstorage/DATASETS_JSON/Citation_JSONs/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    with open(file_path, 'w', encoding='utf-8') as file:
        # indent=4 keeps the output human-readable.
        json.dump(data, file, indent=4)

# Write the train split (list_1). create_json_citations does not read its
# second argument; it writes to the directory hard-coded in its body.
path_to_citation_json = "/bigstorage/DATASETS_JSON/Citation_JSONs"
create_json_citations(list_1, path_to_citation_json)