In [304]:
import os
import pymysql
import datetime
import pandas as pd
import lib
from sqlalchemy import create_engine
from functools import reduce
from sqlalchemy import create_engine
import urllib.parse
from lib.configuration import get_config, get_connection_string,get_current_config
import json

In [305]:
# Set PACKAGE_HOME for this kernel before the configuration is built below.
# NOTE(review): this is a machine-specific absolute path; anyone else running the notebook must change it.
# Presumably read by the lib.configuration helpers — confirm.
os.environ['PACKAGE_HOME']="/Users/gchickering/offline_Github/PatentsView/PatentsView-DB"

In [306]:
# Build the 'granted_patent' configuration for the 2021-11-04 execution date.
config = get_current_config(
    'granted_patent',
    execution_date=datetime.date(2021, 11, 4),
)


    generating config with parameters: 
    type: granted_patent
    schedule: weekly
    execution date: 2021-11-04


In [307]:
# Hand-built connection settings for the elastic pgpub snapshot.
# NOTE(review): the password is blank here and connection_string is reassigned
# from the config in the next cell, so this URL may not be the one actually used.
database = 'elastic_production_pgpub_20231231'
host = 'patentsview-ingest-production.cckzcdkkfzqo.us-east-1.rds.amazonaws.com'
user = 'pipeline_user'
password = ''
port = '3306'

# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot be misread as URL delimiters.
connection_string = 'mysql+pymysql://{0}:{1}@{2}:{3}/{4}?charset=utf8mb4'.format(
    urllib.parse.quote_plus(user),
    urllib.parse.quote_plus(password),
    host,
    port,
    database,
)

In [308]:
# Replaces the hand-built connection_string from the previous cell with the one
# derived from `config`, requested with database='PROD_DB'.
connection_string=get_connection_string(config, database='PROD_DB')

In [309]:
# SQLAlchemy engine for the connection string resolved above.
engine=create_engine(connection_string)

In [310]:
# Load every information_schema.COLUMNS row belonging to the pgpub elastic schema.
column_metadata_query = """
select *
from information_schema.COLUMNS
where TABLE_SCHEMA = 'elastic_production_pgpub_20231231';
"""
columns = pd.read_sql_query(sql=column_metadata_query, con=engine)

In [311]:
columns['TABLE_NAME'].unique() # distinct table names (returned as an array, not a list)

array(['rel_app_text', 'publication_cpc_at_issue', 'publication_ipcr',
       'publication_gov_contract', 'publication_uspc_at_issue',
       'publication', 'publication_wipo', 'foreign_priority',
       'publication_cpc_current', 'granted_pregrant_crosswalk', 'ipcr',
       'publication_pct_data', 'us_parties', 'publication_assignee',
       'publication_inventor', 'publication_gov_interest_organizations',
       'publication_us_related_documents'], dtype=object)

In [312]:
# Distinct DATA_TYPE values in the schema; generate_column_setting matches its
# type lists against these strings.
columns["DATA_TYPE"].unique()

array(['varchar', 'bigint', 'mediumtext', 'int', 'date', 'smallint',
       'text', 'tinyint'], dtype=object)

In [313]:
# Column names eligible for the 'category' flag; generate_column_setting checks
# membership in this list before applying its per-table exceptions.
category_list = [
    'applicant_type', 'category', 'classification_data_source',
    'classification_level', 'classification_status',
    'deceased', 'designation', 'doc_type', 'doctype',
    'field_id', 'group', 'group_id',
    'ipc_class', 'kind', 'relkind', 'role',
    'section', 'section_id', 'state', 'status',
    'subclass', 'subgroup', 'subgroup_id', 'symbol_position',
    'type',
]

In [314]:
def generate_column_setting(column):
    """Build the config entry for one information_schema.COLUMNS row.

    Args:
        column: Row-like object exposing ``DATA_TYPE``, ``IS_NULLABLE``,
            ``COLUMN_NAME`` and ``TABLE_NAME`` attributes (for example a
            DataFrame row).

    Returns:
        dict: ``{COLUMN_NAME: {'data_type', 'null_allowed', 'category',
        'location_field'}}``. ``data_type`` is ``'date'``, ``'int'`` or
        ``'varchar'``; the other three values are booleans.

    Reads the module-level ``category_list``.
    """
    # All numeric widths collapse to 'int'. 'tinyint' (present in this schema)
    # and 'mediumint' are included so they no longer fall through to 'varchar'.
    if column.DATA_TYPE in ('date', 'timestamp'):
        data_type = 'date'
    elif column.DATA_TYPE in ('tinyint', 'smallint', 'mediumint', 'int',
                              'bigint', 'float', 'decimal'):
        data_type = 'int'
    else:
        data_type = 'varchar'

    null_allowed = column.IS_NULLABLE != 'NO'
    location_field = column.COLUMN_NAME in ('country', 'country_transformed')

    # A column is categorical when its name is in category_list, unless one of
    # the per-table exceptions applies.
    # NOTE(review): every category_list column of 'foreign_priority' is left
    # uncategorised, not only 'kind' (the original nesting had no else branch).
    # That behaviour is preserved here — confirm it is intended.
    category = (
        column.COLUMN_NAME in category_list
        and column.TABLE_NAME not in ('foreign_priority', 'further_cpc', 'main_cpc')
        and not (column.COLUMN_NAME == 'section_id' and column.TABLE_NAME != 'cpc')
    )

    return {
        column.COLUMN_NAME: {
            'data_type': data_type,
            'null_allowed': null_allowed,
            'category': category,
            'location_field': location_field
        },
    }


In [315]:
def generate_table_setting(table_group):
    """Merge the per-column settings of one table into a single config entry.

    Args:
        table_group: DataFrame holding the information_schema.COLUMNS rows of
            one table (one row per column).

    Returns:
        dict: ``{"fields": {column_name: settings, ...}}``. ``fields`` is an
        empty dict when ``table_group`` has no rows.
    """
    fields = {}
    # Accumulate into one dict instead of apply + reduce: reduce without an
    # initial value raises on an empty group and rebuilds the dict every step.
    for _, column in table_group.iterrows():
        fields.update(generate_column_setting(column))
    return {"fields": fields}

In [316]:
# One {"fields": ...} dict per table, collected into a Series indexed by TABLE_NAME.
x=columns.groupby('TABLE_NAME').apply(func=generate_table_setting)

In [317]:
# Preview the per-table settings Series.
x

TABLE_NAME
foreign_priority                          {'fields': {'document_number': {'data_type': '...
granted_pregrant_crosswalk                {'fields': {'patent_id': {'data_type': 'varcha...
ipcr                                      {'fields': {'ipcr_id': {'data_type': 'int', 'n...
publication                               {'fields': {'id': {'data_type': 'varchar', 'nu...
publication_assignee                      {'fields': {'assignee_id': {'data_type': 'varc...
publication_cpc_at_issue                  {'fields': {'document_number': {'data_type': '...
publication_cpc_current                   {'fields': {'document_number': {'data_type': '...
publication_gov_contract                  {'fields': {'document_number': {'data_type': '...
publication_gov_interest_organizations    {'fields': {'document_number': {'data_type': '...
publication_inventor                      {'fields': {'inventor_id': {'data_type': 'varc...
publication_ipcr                          {'fields': {'document_numbe

In [318]:
# my_list

In [319]:
# Gather each table's settings dict, in the order of x's index.
my_list = [x[table_name] for table_name in x.keys()]

# Invert the list of dicts into a dict of lists: every distinct top-level key
# maps to the values held under that key across all tables.
super_dict = {
    key: [settings[key] for settings in my_list if key in settings]
    for key in {key for settings in my_list for key in settings}
}

In [320]:
# Preview the key -> list-of-values mapping built above.
super_dict

{'fields': [{'document_number': {'data_type': 'int',
    'null_allowed': False,
    'category': False,
    'location_field': False},
   'foreign_doc_number': {'data_type': 'varchar',
    'null_allowed': True,
    'category': False,
    'location_field': False},
   'date': {'data_type': 'date',
    'null_allowed': True,
    'category': False,
    'location_field': False},
   'country': {'data_type': 'varchar',
    'null_allowed': True,
    'category': False,
    'location_field': True},
   'kind': {'data_type': 'varchar',
    'null_allowed': True,
    'category': False,
    'location_field': False}},
  {'patent_id': {'data_type': 'varchar',
    'null_allowed': True,
    'category': False,
    'location_field': False},
   'document_number': {'data_type': 'int',
    'null_allowed': True,
    'category': False,
    'location_field': False},
   'application_number': {'data_type': 'varchar',
    'null_allowed': True,
    'category': False,
    'location_field': False},
   'current_pgpub_id_f

In [321]:
# Tag every dict held in super_dict with the tester that should run against it.
# These dicts are the same objects stored in x (no copy was made when my_list and
# super_dict were built), so this loop mutates x in place: the "TestScripts" key
# ends up inside each table's "fields" mapping, next to the column names.
# NOTE(review): confirm "TestScripts" is meant to live inside "fields" rather than
# beside it at table level.
for i in super_dict.keys():
    for j in super_dict[i]:
            j["TestScripts"] = ["ElasticDBTester"]

In [323]:
# Serialise the per-table settings Series to an indented JSON string.
x2 = x.to_json(indent = 2)

In [324]:
# Preview the JSON text that will be written to disk.
x2

'{\n  "foreign_priority":{\n    "fields":{\n      "document_number":{\n        "data_type":"int",\n        "null_allowed":false,\n        "category":false,\n        "location_field":false\n      },\n      "foreign_doc_number":{\n        "data_type":"varchar",\n        "null_allowed":true,\n        "category":false,\n        "location_field":false\n      },\n      "date":{\n        "data_type":"date",\n        "null_allowed":true,\n        "category":false,\n        "location_field":false\n      },\n      "country":{\n        "data_type":"varchar",\n        "null_allowed":true,\n        "category":false,\n        "location_field":true\n      },\n      "kind":{\n        "data_type":"varchar",\n        "null_allowed":true,\n        "category":false,\n        "location_field":false\n      },\n      "TestScripts":[\n        "ElasticQBTester"\n      ]\n    }\n  },\n  "granted_pregrant_crosswalk":{\n    "fields":{\n      "patent_id":{\n        "data_type":"varchar",\n        "null_allowed":tr

In [346]:
# Define the file path where you want to save the JSON file
# (relative to the notebook's current working directory).
file_path = '../resources/table_config_elasticsearch_pgpub.json'

# Write JSON data to a new file. The encoding is pinned so the result does not
# depend on the platform's default locale encoding.
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(x2)

print(f"JSON data has been saved to {file_path}")

JSON data has been saved to ../resources/table_config_elasticsearch_pgpub2.json


In [326]:
# Show the working directory; the relative file_path used above is resolved against it.
pwd()

'/Users/gchickering/offline_Github/PatentsView/PatentsView-DB/QA'