## TaskFlow API


## AIMS_Airflow

### nih_grant_etl

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.timezone import datetime

default_args = {
    'owner': 'rishabh',
    'start_date': datetime(2020, 11, 13)
}

dag = DAG(
    dag_id='nih_grant_etl',
    default_args=default_args,
    schedule_interval="00 09 * * 5",
    
)


nih_dowload = BashOperator(
    task_id='nih_download',
    bash_command='cd anaconda3/bin/python download_nih_grant.py',
    dag=dag
)



nih_extraction = BashOperator(
    task_id='nih_data',
    bash_command='cd anaconda3/bin/python grant_script.py',
    dag=dag
)


nih_dowload >> nih_extraction

### publication_etl

In [2]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.timezone import datetime


default_args = {
    'owner': 'VIkas',
    'start_date':datetime(2020, 11, 5)
}

dag = DAG(
    dag_id='publication_etl',
    default_args=default_args,
    schedule_interval='00 20 * * 4'
    
)

publication_download =BashOperator(
    task_id='publication_download',
    bash_command='cd anaconda3/bin/python download_pubmed_file.py',
    dag=dag
)



publication_extraction = BashOperator(
    task_id='publication_data',
    bash_command='cd anaconda3/bin/python xml_to_solr.py',
    dag=dag
)


publication_download >> publication_extraction


### patent_etl

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.timezone import datetime

default_args = {
    'owner': 'VIkas',
    'start_date': datetime(2020, 11, 13)
}

dag = DAG(
    dag_id='patent_etl',
    default_args=default_args,
    schedule_interval='40 16 * * 5',
    
)



patent_extraction = BashOperator(
    task_id='patent_data',
    bash_command='cd anaconda3/bin/python uspto_patent_download.py',
    dag=dag
)
        

patent_extraction

### Affiliation_dag

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.timezone import datetime

default_args = {
    'owner': 'Vikas',
    'start_date': datetime(2021, 3 , 22)
}

dag = DAG(
    dag_id='affiliation_dag',
    default_args=default_args,
    schedule_interval='00 14 * * 5',

)

patent_extraction = BashOperator(
    task_id='affiliation_data',
    bash_command='cd /anaconda3/bin/python affiliation_main.py',
    dag=dag
)

### aims_stats_daily_count


In [5]:
import requests
import json
import mysql.connector
from datetime import datetime
from datetime import datetime as dt
import xml.etree.ElementTree as ET
import os
import datetime
import mysql.connector
import dateutil.parser as dp

from airflow.utils.dates import days_ago
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.timezone import datetime as adt



header = {"Authorization": "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="}

args = {
    'owner': "Vikas",
    'start_date': adt(2020, 11, 29)
}

dag = DAG(
    dag_id='aims_stats_daily_count',
    default_args=args,
    schedule_interval='00 23 * * 0',
    tags=['AIMS']
)

temp_today_date = datetime.datetime.now().date()

def modification_date(filename):
    t = os.path.getctime(filename)
    return datetime.datetime.fromtimestamp(t)


def get_last_seven_date():
    today_date = datetime.datetime.now()
    #return today_date
    ndays = datetime.timedelta(days = 7)
    diff_day = today_date - ndays
    test_date = diff_day.date()
    return test_date

def get_last_15_date():
    today_date = datetime.datetime.now()
    #return today_date
    ndays = datetime.timedelta(days = 15)
    diff_day = today_date - ndays
    test_date = diff_day.date()
    return test_date

def get_publication_count():
    pub_total_count = 0
    publication_path = './publication/SOLR_xml/'
    for filename in os.listdir(publication_path):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(publication_path, filename)
        #print(filename)
        #tree = ET.parse(fullname)

        file_modified_date = modification_date(fullname).date()
        if get_last_15_date() <= file_modified_date <= temp_today_date:
            print(file_modified_date)
            try:
                tree = ET.parse(fullname)
                pmid_count = tree.findall(".//field[@name='updated_on']")
                for val in pmid_count:
                    convrt_date = dp.parse(val.text).date()
                    if get_last_seven_date() <= convrt_date <= temp_today_date:
                        pub_total_count+=1
            except ET.ParseError:
                print(str(filename)+" solr patent xml file is still updating")
                pub_total_count += 0
                pass

    return pub_total_count

def get_grant_count():
    grant_total_count = 0
    grant_path = '/mnt4/aims/grant/SOLR_xml'
    for filename in os.listdir(grant_path):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(grant_path, filename)
        #print(filename)
        #tree = ET.parse(fullname)

        file_modified_date = modification_date(fullname).date()
        if get_last_15_date() <= file_modified_date <= temp_today_date:
            print(file_modified_date)
            try:
                tree = ET.parse(fullname)
                grant_count = tree.findall(".//field[@name='updated_on']")
                for val in grant_count:
                    convrt_date = dp.parse(val.text).date()
                    if get_last_seven_date() <= convrt_date <= temp_today_date:
                        grant_total_count+=1
            except ET.ParseError:
                print(str(filename)+" solr grant xml file is still updating")
                grant_total_count += 0
                pass
    return grant_total_count

def get_patent_count():
    patent_total_count = 0
    patent_path = '/mnt4/aims/patent/SOLR_xml'
    for filename in os.listdir(patent_path):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(patent_path, filename)
        #print(filename)
        #tree = ET.parse(fullname)

        file_modified_date = modification_date(fullname).date()
        #print("Modified Date is ", file_modified_date.date())

        if get_last_15_date() <= file_modified_date <= temp_today_date:
            print(file_modified_date)
            try:
                tree = ET.parse(fullname)
                patent_count = tree.findall(".//field[@name='updated_on']")
                for val in patent_count:
                    convrt_date = dp.parse(val.text).date()
                    if get_last_seven_date() <= convrt_date <= temp_today_date:
                        patent_total_count+=1
                #patent_total_count+=patent_count
            except ET.ParseError:
                print(str(filename)+" solr patent xml file is still updating")
                patent_total_count += 0
                pass
    return patent_total_count


def get_clinic_count():
    clinic_total_count = 0
    clinic_path = '/mnt4/aims/clinical_trial/SOLR_xml'
    for filename in os.listdir(clinic_path):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(clinic_path, filename)
        #print(filename)
        #tree = ET.parse(fullname)

        file_modified_date = modification_date(fullname).date()
        #print("Modified Date is ", file_modified_date.date())

        if get_last_15_date() <= file_modified_date <= temp_today_date:
            print(file_modified_date)
            try:
                tree = ET.parse(fullname)
                clinic_count = tree.findall(".//field[@name='updated_on']")
                for val in clinic_count:
                    convrt_date = dp.parse(val.text).date()
                    if get_last_seven_date() <= convrt_date <= temp_today_date:
                        clinic_total_count+=1
            except ET.ParseError:
                print(str(filename)+" solr clinical trial xml file is still updating")
                clinic_total_count += 0
                pass
    return clinic_total_count



def aims_stat_logic_daily():
    pubmed_url = "http://10.121.12.3:8983/solr/pubmed_v1/select?fq=updated_on%3A%5BNOW%2FDAY-7DAY%20TO%20NOW%2FDAY%2B1DAY%5D&q=*%3A*"
    grants_url = "http://10.121.12.3:8983/solr/grant_v1/select?fq=updated_on%3A%5BNOW%2FDAY-7DAY%20TO%20NOW%2FDAY%2B1DAY%5D&q=*%3A*"
    patent_url = "http://10.121.12.3:8983/solr/patent_v1/select?fq=updated_on%3A%5BNOW%2FDAY-7DAY%20TO%20NOW%2FDAY%2B1DAY%5D&q=*%3A*"
    ct_url = "http://10.121.12.3:8983/solr/clinical_trial_v1/select?fq=updated_on%3A%5BNOW%2FDAY-7DAY%20TO%20NOW%2FDAY%2B1DAY%5D&q=*%3A*"
    pub_resp = requests.get(pubmed_url, headers=header)
    pub_content = json.loads(pub_resp.content)

    grants_resp = requests.get(grants_url, headers=header)
    grants_content = json.loads(grants_resp.content)

    patent_resp = requests.get(patent_url, headers=header)
    patent_content = json.loads(patent_resp.content)

    CT_resp = requests.get(ct_url, headers=header)
    CT_content = json.loads(CT_resp.content)

    count = {}

    pub_num_count = pub_content['response']['numFound']
    print(pub_num_count)

    grant_num_count = grants_content['response']['numFound']
    print(grant_num_count)

    patent_num_count = patent_content['response']['numFound']
    print(patent_num_count)

    ct_num_count = CT_content['response']['numFound']
    print(ct_num_count)

    file_pub_num_count = get_publication_count()
    file_grant_num_count = get_grant_count()
    file_patent_num_count = get_patent_count()
    file_ct_num_count = get_clinic_count()
    today = dt.now()

    # establishing the connection
    conn = mysql.connector.connect(
        user='aims', password='9nCBwbQ9!', host='10.121.12.3', database='aims')

    # Creating a cursor object using the cursor() method
    cursor = conn.cursor()

    # Preparing SQL query to INSERT a record into the database.
    sql_query = """INSERT INTO data_collection_track(
       Date, Publications, Patent, Grants, Clinical_trials, Publications_file, Patent_file, Grants_file, Clinical_trials_file)
       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    data_record = (today, pub_num_count, patent_num_count, grant_num_count, ct_num_count, file_pub_num_count, file_patent_num_count, file_grant_num_count,file_ct_num_count)

    try:
        # Executing the SQL command
        cursor.execute(sql_query, data_record)

        # Commit your changes in the database
        conn.commit()

    except Exception as e:
        print(e)
        # Rolling back in case of error
        conn.rollback()

    # Closing the connection
    conn.close()



stasts_task_aims_daily_count = PythonOperator(
    task_id="aims_stats_daily",
    provide_context=False,
    python_callable=aims_stat_logic_daily,
    dag=dag
)

# web_hook_task_year = PythonOperator(
#     task_id="aims_webhook",
#     provide_context=False,
#     python_callable=webhook,
#     dag=dag
# )

# stasts_task_aims_daily_count

### clinical_trial_author_weekly


In [None]:
import sys
sys.path.append('/mnt4/aims/airflow/dags/author_lib')
from name_match import AuthorMatch as au
# from author_lib.name_match import AuthorMatch as au
import json
import requests
import time
import uuid
import requests
import json
import pandas as pd
from xml.sax.saxutils import escape
import ast
import datetime
import traceback

from airflow.operators.python_operator import PythonOperator

from airflow.models import DAG
from airflow.utils.timezone import datetime as adt
from airflow.utils.dates import days_ago



header = {"Authorization": "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="}

args = {
    'owner': "Vikas",
    'start_date': days_ago(7)
}

dag = DAG(
    dag_id='clinical_trial_author_weekly',
    default_args=args,
    schedule_interval='00 23 * * 0',
    tags=['AIMS']
)



start = time.time()

def get_all_clinic_authors():
    clinic_url = 'http://10.121.12.3:8983/solr/clinical_trial_v1/select?fq=updated_on%3A%5BNOW-7DAY%2FDAY%20TO%20NOW%5D&q=central_name%3A*%20AND%20primary_sponsors%3A*&rows=10&start=0'
    response = requests.get(clinic_url, headers={'Authorization': "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="""})
    json_data = json.loads(response.text)
    new_count = json_data['response']['numFound']
    print(new_count)
    if new_count==0:
        return
    new_clinic_url = 'http://10.121.12.3:8983/solr/clinical_trial_v1/select?fq=updated_on%3A%5BNOW-7DAY%2FDAY%20TO%20NOW%5D&q=central_name%3A*%20AND%20primary_sponsors%3A*&rows='+str(20)+'&start=0'
    response_2 = requests.get(new_clinic_url, headers={'Authorization': "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="""})
    json_data_2 = json.loads(response_2.text)
    title_item_2 = json_data_2['response']['docs']

    result_list = []
    cnt = 0
    for each_item in title_item_2:
        try:
            cnt += 1
            print(cnt)
            auth_name_tag = each_item['central_name'][0]
            proj_title = each_item['brief_title'][0]
            abst_text = each_item['description'][0]
            org_text = each_item['primary_sponsors'][0]
            tag_text_list = each_item['tags'][0:5]
            tag_txt = ''
            for tg in tag_text_list:
                tag_txt = tag_txt + tg + ","

            if '$' in auth_name_tag:
                auth_name_tag_2 = auth_name_tag.split('$')
                for each_name in auth_name_tag_2:
                    temp_clinic_dict = {}
                    clean_prefix = au.refine_name(each_name.strip())
                    if not clean_prefix:
                        continue

                    check_existing_name = 'http://10.121.12.3:8983/solr/author_v1/select?q=author_name%3A%22'+clean_prefix.title()+'%22'
                    response_3 = requests.get(check_existing_name, headers={'Authorization': "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="""})
                    json_data_3 = json.loads(response_3.text)
                    new_count_3 = json_data_3['response']['numFound']
                    if new_count_3>=1:
                        old_auth = 'http://10.121.12.3:8983/solr/author_v1/select?q=author_name%3A%22'+clean_prefix.title()+'%22&wt=json&indent=true'
                        response_4 = requests.get(old_auth,
                                                  headers={'Authorization': "Basic c29scnJlYWQ6TWVyY2sxMjM0IQ=="""})
                        json_data_4 = json.loads(response_4.text)
                        all_tag = json_data_4['response']['docs'][0]
                        temp_clinic_dict['id'] = all_tag['id']
                        temp_clinic_dict['original_name'] = clean_prefix.title()
                        try:
                            if all_tag['clinic_id']:
                                all_tag['clinic_id'].append(each_item['id'])
                                temp_clinic_dict['clinic_id'] = all_tag['clinic_id']
                        except:
                            temp_clinic_dict['clinic_id'] = each_item['id']
                        try:
                            if all_tag['grant_id']:
                                temp_clinic_dict['grant_id'] = all_tag['grant_id']
                        except:
                            pass
                        try:
                            if all_tag['patent_id']:
                                temp_clinic_dict['patent_id'] = all_tag['patent_id']
                        except:
                            pass
                        try:
                            if all_tag['PMID']:
                                temp_clinic_dict['PMID'] = all_tag['PMID']
                        except:
                            pass
                        all_tag['title'].append(proj_title)
                        temp_clinic_dict['Title'] = all_tag['title']
                        all_tag['abstract'].append(abst_text)
                        temp_clinic_dict['Abstract'] = all_tag['abstract']
                        all_tag['tag'].append(tag_txt)
                        temp_clinic_dict['tag'] = all_tag['tag']
                        temp_clinic_dict['affiliation'] = all_tag['affiliation'][0]
                    else:
                        temp_clinic_dict['original_name'] = clean_prefix.title()
                        temp_clinic_dict['id'] = 'nan'
                        temp_clinic_dict['clinic_id'] = each_item['id']
                        temp_clinic_dict['Title'] = proj_title
                        temp_clinic_dict['Abstract'] = abst_text
                        temp_clinic_dict['affiliation'] = org_text
                        temp_clinic_dict['tag'] = tag_txt[:-1]
                    if temp_clinic_dict != {}:
                        result_list.append(temp_clinic_dict)
        except Exception:
            traceback.print_exc()
    print("total results count ", len(result_list))
    df2 = pd.DataFrame(result_list)
    df = df2.drop_duplicates(subset=['original_name', ], keep=False)
    empty_str = ''
    add_start = '<?xml version="1.0" encoding="UTF-8"?>\n<add>\n'
    add_end = '</add>\n'
    start_doc = '<doc>\n'
    end_doc = '</doc>\n'
    f_close = '</field>\n'

    empty_str += add_start
    for i, row in df.iterrows():
        if str(row['clinic_id']) == 'nan':
            continue
        empty_str += start_doc
        if str(row['id']) == 'nan':
            empty_str += '<field name="id">' + escape(str(uuid.uuid4())) + f_close
        else:
            empty_str += '<field name="id">' + escape(str(row['id'])) + f_close
        empty_str += '<field name="author_name">' + escape(str(row['original_name'])) + f_close

        clinic_count = 0
        if type(row['clinic_id'])==list:
            for val in row['clinic_id']:
                clinic_count+=1
                if str(val) != 'nan':
                    empty_str += '<field name="clinic_id">' + escape(str(val)) + f_close
        else:
            empty_str += '<field name="clinic_id">' + escape(str(row['clinic_id'])) + f_close
            clinic_count+=1

        patent_count = 0
        try:
            if type(row['patent_id']) == list:
                for val in row['patent_id']:
                    patent_count+=1
                    if str(val) != 'nan':
                        empty_str += '<field name="patent_id">' + escape(str(val)) + f_close
        except:
            pass

        grant_count = 0
        try:
            if type(row['grant_id']) == list:
                for val in row['grant_id']:
                    grant_count += 1
                    if str(val) != 'nan':
                        empty_str += '<field name="grant_id">' + escape(str(val)) + f_close
        except:
            pass

        PMID_count = 0
        try:
            if type(row['PMID']) == list:
                for val in row['PMID']:
                    PMID_count += 1
                    if str(val) != 'nan':
                        empty_str += '<field name="PMID">' + escape(str(val)) + f_close
        except:
            pass


        if type(row['Title']) == list:
            for val in row['Title']:
                if str(val) != 'nan':
                    empty_str += '<field name="title">' + escape(str(val)) + f_close
        else:
            empty_str += '<field name="title">' + escape(str(row['Title'])) + f_close

        if type(row['Abstract']) == list:
            for val in row['Abstract']:
                if str(val) != 'nan':
                    empty_str += '<field name="abstract">' + escape(str(val)) + f_close
        else:
            empty_str += '<field name="abstract">' + escape(str(row['Abstract'])) + f_close

        if type(row['tag']) == list:
            for val in row['tag']:
                if str(val).lstrip() != 'nan':
                    empty_str += '<field name="tag">' + escape(str(val)) + f_close
        else:
            empty_str += '<field name="tag">' + escape(str(row['tag'])) + f_close

        check_type_affi_ori = row['affiliation']
        empty_str += '<field name="affiliation">' + escape(str(check_type_affi_ori)) + f_close
        empty_str += '<field name="total_clinical_trail">' + str(clinic_count) + f_close
        if patent_count!=0:
            empty_str += '<field name="total_patent">' + str(patent_count) + f_close
        if grant_count!=0:
            empty_str += '<field name="total_grant">' + str(grant_count) + f_close
        if PMID_count!=0:
            empty_str += '<field name="total_publication">' + str(PMID_count) + f_close
        empty_str += '<field name="total_document_count">' + str(clinic_count+patent_count+grant_count+PMID_count) + f_close

        empty_str += end_doc
    empty_str += add_end
    time_now  = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
    file_name = '/mnt4/aims/workflow/people_dag_xml_files/'+str(time_now)+"_clinic_author.xml"
    with open(file_name, 'a', encoding='utf-8') as xmlFile:
        xmlFile.write((empty_str))

author_module_regular_update = PythonOperator(
    task_id="clinical_author_weekly_updater",
    provide_context=False,
    python_callable=get_all_clinic_authors,
    dag=dag
)


author_module_regular_update

### Startup

In [None]:
"response":{"numFound":1392425,"start":0,"docs":[
      {
        "id":"7c5659e8-2450-5190-9b71-38d62d80c632",
        "Company_Name":["ephemera"],
        "Company_Name_copy_txt":["ephemera"],
        "Short_Description":["Your true self, unleashed."],
        "Description":["Ephemera is a mobile app that allows you to send anonymous geolocalized messages to everyone around you! All the messages are automatically deleted after a few minutes.ephemerapp.com"],
        "Website":["http://ephemerapp.com"],
        "Recipient_Country":["Portugal"],
        "state":["Área Metropolitana de Lisboa"],
        "city":["Lisbon"],
        "Publisher":["Angel"],
        "Company_Size":10,
        "Market":["Social Media"],
        "Tags":["mobile app",
          "everyone",
          "message",
          "true self",
          "few minutes.ephemerapp.com",
          "ephemera"],
        "_version_":1688072483867983872
      },
      {
        "id":"586d2d27-8f88-58f5-8194-36a6f74f1a4f",
        "Company_Name":["Epic Business Consulting"],
        "Company_Name_copy_txt":["Epic Business Consulting"],
        "Short_Description":["Analytics For People Who Hate Analytics"],
        "Description":["Epic gives small to medium sized businesses all the benefits of advanced site and usability analytics with none of the headache. When a company signs up for our service they get a full analytics audit, ensuring that everything is being tracked correctly on their website. We also discuss key performance indicators the business tracks and might suggest a few of our own. Going forward, the company will get bi-weekly status reports in PDF format. The reports will only include KPI's and relevant data, along with targeted alerts and suggestions. An example would be, \"65% of Facebook traffic converts into paying customers. You should definitely expand your Facebook marketing efforts.\"We continually update the customer on important metrics, and even suggest and implement A/B testing for better conversion. Clients receive full service, stress free analytics without ever seeing a spreadsheet."],
        "Website":["http://www.epicbusinessconsulting.com"],
        "Recipient_Country":["United States"],
        "state":["Illinois"],
        "Publisher":["Angel"],
        "Company_Size":10,
        "Market":["Small and Medium Businesses",
          "E-Commerce",
          "Consulting",
          "Business Development"],
        "Team_Member":["{'Epic Business ': \"We deliver Epic Business Consulting. What's that you want to hire us? Good Choice. You'll be #excited in no time!\"}"],
        "emerging_term":["stress",
          "headache",
          "update"],
        "Tags":["service",
          "suggestion",
          "advanced site",
          "usability analytics",
          "company sign",
          "people",
          "targeted alert",
          "client",
          "headache",
          "relevant data",
          "facebook marketing effort",
          "full analytics audit",
          "sized business",
          "customer",
          "bi-weekly status report",
          "facebook traffic convert",
          "company",
          "example",
          "pdf format",
          "full service",
          "stress free analytics",
          "better conversion",
          "analytics",
          "everything",
          "website",
          "important metric",
          "key performance indicator",
          "business track",
          "spreadsheet",
          "report",
          "benefit",
          "implement a/b"],
        "category_path":["Engineering/Human-machine_interaction/Human-computer_interaction/Usability"],
        "category_path_copy_txt":["Engineering/Human-machine_interaction/Human-computer_interaction/Usability"],
        "category":["Engineering"],
        "category_copy_txt":["Engineering"],
        "sub_category":["Human-machine_interaction",
          "Usability",
          "Human-computer_interaction"],
        "_version_":1688072483869032448
      },

## patent_etl

In [2]:
import io
import codecs 

In [22]:
# with io.open(filename) as patent_xml:
#     patent_data=patent_xml.read()
# print(patent_data)

In [None]:
#######
'''from lxml import etree

context = etree.iterparse(filename, events=('end',), tag='nodes')

for event, element in context:
	<Do the stuff here you want to do with the element. This element has all the information about the content of 'node' tag 
    and its child elements because in the context above I have ordered it to capture only 'end' events for me. 
    So it captures the event when the parser hits the end of the node tag i.e </node> tag or <node/> if it has no content inside it.

	element.clear()
	#This line tells that you won't be accessing any child elements of the element now. So the parser can just throw them off.


	#Now clearing the parent elements of the 'element'
	while elem.getprevious() is not None:
    		del elem.getparent()[0]
	# 'not None' is used here because if the element you are parsing is root itself, then it will raise an exception because there is no parent for it, so you might have to handle that exception too in that case.'''

In [11]:
import lxml 
from lxml import etree
import xml.etree.cElementTree as ET
filename='../Airflow_modules/Patent/uspto_datasets/ipa210520.xml'
import re
# !pip install xmlformatter
# !pip install patentdata
# !pip install scispacy

In [12]:
with open(filename) as f:
    patent_xml = f.read()
    f.close()

In [13]:
text=re.compile("<\?xml version=\"1\.0\" encoding\=\"UTF\-8\"\?>")
file=text.split(patent_xml)

In [14]:
while '' in file:
    file.remove('')
print('No of patents',len(file))

No of patents 8835


In [15]:
#patent_id=US20210144906A1-20210520.XML
patent_text='<us-patent-application lang="EN" dtd-version="v4.4 2014-04-03" file="US20210144899A1-20210520.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20210506" date-publ="20210520">'
patent_id=re.compile('file\=\"([U][S]\d{11}\w\d)\-\d{8}\.XML\"')
print('patent id',patent_id.findall(patent_text))

# inv_title='Agricultural Disc and Process for Manufacturing an Agricultural Disc for Use in Agricultural Work'
inv_title='<invention-title id="d2e79">Agricultural Disc and Process for Manufacturing an Agricultural Disc for Use in Agricultural Work</invention-title>'
invention_title=re.compile("<invention-title id=\w>")
print('invention_id',invention_title.findall(inv_title))


patent id ['US20210144899A1']
invention_id []
