In [36]:
!pwd

import py_stringmatching
from math import floor

from py_stringmatching import JaroWinkler
jw = JaroWinkler()


# Function to calculate the
# Jaro Similarity of two strings

    
# name matching using locality-sensitive hashing (simhash)
# these are mostly idempotent

# Similarity-measure instances from py_stringmatching.
# find_groups.dmetr currently scores with `ja` (Jaccard); the other measures
# show up in its commented-out alternatives.
# NOTE(review): this `jw` repeats the JaroWinkler() instance already created
# right after the JaroWinkler import above.
jw = JaroWinkler()
ja = py_stringmatching.similarity_measure.jaccard.Jaccard()
co = py_stringmatching.similarity_measure.cosine.Cosine()
from py_stringmatching import SoftTfIdf
me = py_stringmatching.similarity_measure.monge_elkan.MongeElkan()
from py_stringmatching import Levenshtein
leven = Levenshtein()
from py_stringmatching import TfIdf









from itertools import chain, repeat
from collections import defaultdict
from math import ceil

import numpy as np
import pandas as pd
import networkx as nx
from editdistance import eval as levenshtein

from standardize import standardize_weak, standardize_strong
from tables import read_csv
from simhash import shingle, Cluster

# Firm name sources, keyed by source tag.
# Each value is (table, id_col, name_col): generate_names() reads
# `{output}/{table}.csv` and loads the id_col and name_col columns from it.
# The __main__ driver picks a subset of these tags to process.
colmap = {
    'apply': ('apply_apply', 'appnum', 'appname'),
    'grant': ('grant_grant', 'patnum', 'owner'),
    'assignor': ('assign_use', 'assignid', 'assignor'),
    'assignee': ('assign_use', 'assignid', 'assignee'),
    'compustat': ('compustat', 'compid', 'name'),
}

# find all unique names
def generate_names(output, columns):
    """Build the table of distinct standardized names and per-source id links.

    For every ``tag -> (table, id_col, name_col)`` entry in ``columns`` this
    reads ``{output}/{table}.csv``, drops rows with missing values and adds a
    ``name`` column holding ``standardize_weak`` of the raw name.

    The names of all sources are pooled, de-duplicated, stripped of empty
    strings and numbered from 0; the result is written to
    ``{output}/name.csv`` with columns ``id, name``.  Each source is then
    left-joined against that table on ``name`` and its ``(id_col, id)``
    columns are written to ``{output}/{tag}_match.csv``.
    """
    print('generating names')

    # Load every source once and attach the weakly standardized name.
    frames = {}
    for tag, (table, id_col, name_col) in columns.items():
        frame = read_csv(f'{output}/{table}.csv', usecols=[id_col, name_col]).dropna()
        frame['name'] = frame[name_col].apply(standardize_weak)
        frames[tag] = frame

    # Pool, de-duplicate, discard empty names, then renumber from 0.
    pooled = pd.concat([frame['name'] for frame in frames.values()], axis=0)
    distinct = pooled.drop_duplicates()
    nonempty = distinct[distinct.str.len()>0].reset_index(drop=True)
    names = nonempty.rename('name').rename_axis('id').reset_index()
    names.to_csv(f'{output}/name.csv', index=False)

    # Left join: source rows whose name is not in the table keep a missing id.
    for tag, (_table, id_col, _name_col) in columns.items():
        linked = pd.merge(frames[tag], names, how='left', on='name')
        linked[[id_col, 'id']].to_csv(f'{output}/{tag}_match.csv', index=False)

    print(f'found {len(names)} names')

# k = 8, thresh = 4 works well
def filter_pairs(output, nshingle=2, k=8, thresh=4):
    """Find candidate name pairs with the simhash ``Cluster`` and save them.

    Reads ``{output}/name.csv`` (columns ``id``, ``name``).  Each name is fed
    to a ``Cluster(k=k, thresh=thresh)`` as its ``nshingle``-shingles followed
    by its whitespace-separated words, labelled with the name id.  Every pair
    in ``Cluster.unions`` is written, together with both names, to
    ``{output}/pair.csv`` (columns ``id1, id2, name1, name2``).
    """
    print('filtering pairs')

    cluster = Cluster(k=k, thresh=thresh)
    id_to_name = {}

    names = read_csv(f'{output}/name.csv', usecols=['id', 'name'])
    for row, name_id, name in names.itertuples():
        tokens = name.split()
        grams = list(shingle(name, nshingle))

        # Weights fall linearly from 1.0 to 0.0 across the shingles, and
        # independently from 1.0 to 0.0 across the words.
        gram_weights = list(np.linspace(1.0, 0.0, len(grams)))
        token_weights = list(np.linspace(1.0, 0.0, len(tokens)))

        cluster.add(grams + tokens, weights=gram_weights + token_weights, label=name_id)
        id_to_name[name_id] = name

        # Progress report every 100,000 rows.
        if row > 0 and row % 100_000 == 0:
            print(f'{row}: {len(cluster.unions)}')

    records = [
        (left, right, id_to_name[left], id_to_name[right])
        for left, right in cluster.unions
    ]
    pairs = pd.DataFrame(records, columns=['id1', 'id2', 'name1', 'name2'])
    pairs.to_csv(f'{output}/pair.csv', index=False)

    print('Found %i pairs' % len(pairs))

# compute distances on owners in same cluster
def find_groups(output, thresh=0.80):
    """Cluster candidate name pairs into firm groups by Jaccard similarity.

    Reads ``{output}/pair0.csv`` (columns ``id1``, ``id2``, ``name1``,
    ``name2``), applies ``standardize_strong`` to each name (cached per id)
    and keeps every pair whose token-level Jaccard similarity is strictly
    greater than ``thresh``.  The kept pairs become edges of an undirected
    graph and each connected component is one group.  Components are numbered
    from 0 in order of decreasing size and written to
    ``{output}/match_JC_pair0.csv`` as ``firm_num, id`` rows.

    NOTE(review): filter_pairs() in this file writes ``pair.csv`` while this
    function reads ``pair0.csv`` -- confirm which candidate file is intended.
    """
    print('finding matches')

    def dmetr(name1, name2):
        """Return the Jaccard similarity of the two names' whitespace tokens.

        Yields 0.0 when both names are empty strings or when the measure
        reports -1.
        """
        if max(len(name1), len(name2)) == 0:
            return 0.0
        tokens1 = name1.split()
        tokens2 = name2.split()
        score = ja.get_sim_score(tokens1, tokens2)
        # Alternative measures that were tried in place of Jaccard:
        #score = co.get_sim_score(tokens1, tokens2)
        #score = SoftTfIdf([tokens1, tokens2]).get_raw_score(tokens1, tokens2)
        #score = jw.get_sim_score(name1, name2)
        #score = me.get_raw_score(tokens1, tokens2)
        #score = leven.get_sim_score(name1, name2)
        #score = TfIdf([tokens1, tokens2]).get_sim_score(tokens1, tokens2)
        return score if score != -1 else 0.0

    close = []
    name_std = {}

    pairs = read_csv(f'{output}/pair0.csv', usecols=['id1', 'id2', 'name1', 'name2'])
    for i, id1, id2, name1, name2 in pairs.itertuples():
        # Standardize each name only once per id; ids recur across pairs.
        if id1 not in name_std:
            name_std[id1] = standardize_strong(name1)
        if id2 not in name_std:
            name_std[id2] = standardize_strong(name2)

        n1std = name_std[id1]
        n2std = name_std[id2]

        # Disabled variant: compare only the tokens the two names do NOT share.
        # Split each name on whitespace and turn the tokens into a set.
        #n1set = set(n1std.split())
        #n2set = set(n2std.split())
        # Take the set difference in each direction to drop the shared tokens.
        #n1only = list(n1set - n2set)
        #n2only = list(n2set - n1set)
        # Join the remaining tokens back into strings.
        #n1std = " ".join(n1only)
        #n2std = " ".join(n2only)

        if dmetr(n1std, n2std) > thresh:
            close.append((id1, id2))

        # Progress report every 100,000 pairs.
        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(close)}')

    G = nx.Graph()
    G.add_edges_from(close)
    # Largest component first, so firm_num 0 is the biggest group.
    comps = sorted(nx.connected_components(G), key=len, reverse=True)

    rows = [(fid, member) for fid, ids in enumerate(comps) for member in ids]
    match = pd.DataFrame(rows, columns=['firm_num', 'id'])
    match.to_csv(f'{output}/match_JC_pair0.csv', index=False)

    print(f'found {len(comps)} groups')


# must be less than 1000000 components
def merge_firms(output, columns, base=1000000):
    """Give every name a firm number and propagate it to each source table.

    Names listed in ``{output}/match_JC_pair0.csv`` keep the ``firm_num`` of
    their group; every other name in ``{output}/name.csv`` gets
    ``id + base``.  The mapping is written to ``{output}/firm_JC_pair0.csv``
    (columns ``firm_num, id``).  For each tag in ``columns`` the file
    ``{output}/{tag}_match.csv`` is then joined with that mapping on ``id``
    and its ``(id_col, firm_num)`` columns are written to
    ``{output}/{tag}_firm_JC_pair0.csv``.
    """
    print('merging firms')

    all_names = read_csv(f'{output}/name.csv')
    grouped = read_csv(f'{output}/match_JC_pair0.csv')

    # Left join keeps ungrouped names; they get a number offset by `base`.
    firms = pd.merge(all_names, grouped, how='left', on='id')
    singleton_num = firms['id']+base
    firms['firm_num'] = firms['firm_num'].fillna(singleton_num)
    firms[['firm_num', 'id']].to_csv(f'{output}/firm_JC_pair0.csv', index=False)

    for tag, (_table, id_col, _name_col) in columns.items():
        links = read_csv(f'{output}/{tag}_match.csv')
        with_firm = pd.merge(links, firms, on='id')
        with_firm[[id_col, 'firm_num']].to_csv(f'{output}/{tag}_firm_JC_pair0.csv', index=False)

if __name__ == "__main__":
    import argparse

    import easydict

    # Hard-coded stand-in for the parsed command-line arguments; the real
    # argparse setup is kept below, disabled.
    args = easydict.EasyDict({
        "sources": "",
        "output": "tables",
    })

    # parse input arguments
    #parser = argparse.ArgumentParser(description='Create firm name clusters.')
    #parser.add_argument('sources', nargs='*', type=str, help='data sources to use')
    #parser.add_argument('--output', type=str, default='tables', help='directory to operate on')
    #args = parser.parse_args()

    # Use the default source tags unless some were supplied.
    sources0 = ['apply', 'grant', 'assignee', 'assignor']
    if len(args.sources) > 0:
        sources = args.sources
    else:
        sources = sources0
    columns = {tag: colmap[tag] for tag in sources}

    # go through steps (the first two are currently switched off)
    #generate_names(args.output, columns)
    #filter_pairs(args.output)
    find_groups(args.output)
    merge_firms(args.output, columns)
    print("Done!")

/mnt/user2/2021_patent
finding matches
100000: 532
200000: 1168
found 1340 groups
merging firms
Done!


In [53]:
me = py_stringmatching.similarity_measure.monge_elkan.MongeElkan()
me.get_raw_score(['a', 'b', 'a'], ['a'])

0.6666666666666666

In [39]:
#SoftTfIdf = py_stringmatching.similarity_measure.soft_tfidf.SoftTfIdf()
from py_stringmatching import SoftTfIdf

In [5]:
from py_stringmatching import Levenshtein
leven = Levenshtein()
leven.get_sim_score("cook", "book")

0.75

In [3]:
name1 = "the university of male"
name2 = "of the a university of madrid holdings"
#x.split()
#jw.get_sim_score(x, y)
#standardize_strong(y)

In [7]:
def distinguish_function(x, y):
    """Score how similar two names are once their shared words are removed.

    Both strings are split on whitespace.  Words that occur in both are
    discarded, the remaining words of each side are joined with single
    spaces, and the Jaro-Winkler similarity (module-level ``jw``) of the two
    leftover strings is returned.

    The original computed this score but never returned it, so callers
    always received ``None``.  It also joined the leftovers in set-iteration
    order, which is not stable between interpreter runs for strings; the
    leftovers now keep the order of first appearance in the input, with
    duplicates removed.
    """
    words_x = x.split()
    words_y = y.split()
    seen_x = set(words_x)
    seen_y = set(words_y)

    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    only_x = [word for word in dict.fromkeys(words_x) if word not in seen_y]
    only_y = [word for word in dict.fromkeys(words_y) if word not in seen_x]

    return jw.get_sim_score(" ".join(only_x), " ".join(only_y))
from py_stringmatching import TfIdf
set_name1 = name1.split()
set_name2 = name2.split()

tf = TfIdf([set_name1, set_name2])
tf.get_sim_score(set_name1, set_name2)

TypeError: get_sim_score() missing 2 required positional arguments: 'bag1' and 'bag2'

In [38]:
tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c'])
!pwd

/mnt/user2/2021_patent


In [87]:
geo = pd.read_csv("/mnt/user2/R/geocode.csv", dtype={"State Code (FIPS)" : str, "County Code (FIPS)" : str})

In [91]:
geo["city_name"] = city_list

In [77]:
a = list(geo["Area Name (including legal/statistical area description)"])
#a1 = a[0].split()

#a1[0:(len(a1)-1)]

# Drop the last whitespace-separated word of every area name and re-join the
# rest with single spaces (a one-word name becomes the empty string).
city_list = [" ".join(area.split()[:-1]) for area in a]
city_list

['Autauga',
 'Baldwin',
 'Barbour',
 'Bibb',
 'Blount',
 'Bullock',
 'Butler',
 'Calhoun',
 'Chambers',
 'Cherokee',
 'Chilton',
 'Choctaw',
 'Clarke',
 'Clay',
 'Cleburne',
 'Coffee',
 'Colbert',
 'Conecuh',
 'Coosa',
 'Covington',
 'Crenshaw',
 'Cullman',
 'Dale',
 'Dallas',
 'DeKalb',
 'Elmore',
 'Escambia',
 'Etowah',
 'Fayette',
 'Franklin',
 'Geneva',
 'Greene',
 'Hale',
 'Henry',
 'Houston',
 'Jackson',
 'Jefferson',
 'Lamar',
 'Lauderdale',
 'Lawrence',
 'Lee',
 'Limestone',
 'Lowndes',
 'Macon',
 'Madison',
 'Marengo',
 'Marion',
 'Marshall',
 'Mobile',
 'Monroe',
 'Montgomery',
 'Morgan',
 'Perry',
 'Pickens',
 'Pike',
 'Randolph',
 'Russell',
 'St. Clair',
 'Shelby',
 'Sumter',
 'Talladega',
 'Tallapoosa',
 'Tuscaloosa',
 'Walker',
 'Washington',
 'Wilcox',
 'Winston',
 'Aleutians East',
 'Aleutians West Census',
 'Anchorage',
 'Bethel Census',
 'Bristol Bay',
 'Chugach Census',
 'Copper River Census',
 'Denali',
 'Dillingham Census',
 'Fairbanks North Star',
 'Haines',
 'Ho

In [78]:
        # Split each name on whitespace and turn the tokens into a set.
        #n1std = set(name_std[id1].split())
        #n2std = set(name_std[id2].split())
        
        # Take the set difference in each direction to remove the shared tokens, then convert back to lists.
        #n1std1 = list(n1std - n2std)
        #n2std2 = list(n2std - n1std)
        
        # Join the remaining (non-shared) tokens back into strings.
        #n1std = " ".join(n1std1)
        #n2std = " ".join(n2std2)
len(city_list)

24283

In [79]:
len(a)

24283

In [92]:
geo.to_csv("/mnt/user2/R/geo.csv")

In [94]:
inventor = pd.read_csv("/mnt/user2/R/all_inventors.csv") #dtype={"State Code (FIPS)" : str, "County Code (FIPS)" : str})

In [99]:
us_inventor = inventor[inventor["inventor_country_code"] == "US"]

In [101]:
us_inventor.to_csv("/mnt/user2/R/us_inventor.csv")

In [106]:
unmatch = pd.read_csv("/mnt/user2/R/unmatch.csv")
fips = pd.read_csv("/mnt/user2/R/FIPS.csv")
fips_making = pd.read_csv("/mnt/user2/R/fips_making.csv")

In [104]:
def matching_machine(a, b):
    """Return the city and region columns of ``unmatch`` for region code ``a``.

    The original body indexed the frame with a tuple key (one element
    misspelled ``invertor_city_name``), used the plain Python comparison
    ``"inventor_region_code" == a`` as the row selector, and returned nothing.
    This version builds a boolean mask on the ``inventor_region_code`` column
    and returns the matching rows.

    Parameters:
        a: region code to select, compared with ``==`` against the
            ``inventor_region_code`` column.
        b: currently unused; kept so existing calls stay valid.
            NOTE(review): its intended role is not visible here -- confirm.

    Returns:
        The rows of the module-level ``unmatch`` frame whose region code
        equals ``a``, restricted to ``inventor_city_name`` and
        ``inventor_region_code``.
    """
    in_region = unmatch["inventor_region_code"] == a
    return unmatch.loc[in_region, ["inventor_city_name", "inventor_region_code"]]

Unnamed: 0,application_number,inventor_name_first,inventor_name_middle,inventor_name_last,inventor_rank,inventor_city_name,inventor_region_code,FIPS.x,State Code (FIPS),FIPS.y
0,10015986,Kristina,J.,Hennessy,1,parkville,MO,,,
1,10015986,Karen,K.,Brown,2,parkville,MO,,,
2,10048355,David,P,Chastain,1,action,MA,,,
3,10089942,Robert,,Chesnut,5,cardiff-by-the-sea,CA,,,
4,10207755,Robert,L.,Studer,2,lake alice rose,WA,,,
...,...,...,...,...,...,...,...,...,...,...
867778,90019010,DELPHI,,TECHNOLOGIES INC (PATENT OWNER),2,annadale,VA,,,
867779,90019010,BMW OF NORTH AMERICA,,LLC (3RD PTY. REQ.),3,"woodcliff lake,",NJ,,,
867780,90014650,ARJUNA NATURAL,,PRIVATE LIMITED (PATENT OWNER),2,alwaye,IN,,,
867781,17274116,Murali,,NARASIMHA,1,lake osweg,OR,,,


In [107]:
leven.get_sim_score("aa", "abc")

0.33333333333333337