In [1]:
import pandas as pd
import numpy as np


In [2]:
pd.set_option('display.max_colwidth', -1)
names =  pd.read_csv('sec-data/sec__edgar_company_info.csv')
print('The shape: %d x %d' % names.shape)
names.head()

The shape: 663000 x 3


Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [3]:
names

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189
5,6,& S MEDIA GROUP LLC,1447162
6,7,&TV COMMUNICATIONS INC.,1479357
7,8,"'MKTG, INC.'",886475
8,9,'OHANA LABS INC.,1703629
9,10,(OURCROWD INVESTMENT IN MST) L.P.,1599496


In [4]:
import re

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in "McDonalds":')
ngrams('Vertias')

All 3-grams in "McDonalds":


['Ver', 'ert', 'rti', 'tia', 'ias']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = names['Company Name']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [6]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [7]:
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 7743.734587192535


In [8]:
matches

<663000x663000 sparse matrix of type '<class 'numpy.float64'>'
	with 1372363 stored elements in Compressed Sparse Row format>

In [9]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similarity': similairity})
        

In [10]:
matches_df = get_matches_df(matches, company_names, top=100000)
matches_df_copy = matches_df
matches_df = matches_df[matches_df['similarity'] < 0.999] # Remove all exact matches
matches_df = matches_df[matches_df['similarity']>0.9]
f=matches_df.sample(10)
f

Unnamed: 0,left_side,right_side,similarity
52498,ALERUS SECURITIES /BD,PARS SECURITIES INC /BD,0.995359
22472,ADVENT PARTNERS GPE VII-A CAYMAN LIMITED PARTNERSHIP,ADVENT PARTNERS GPE VIII-A LIMITED PARTNERSHIP,0.903676
94110,ARROWHEAD FINANCIAL GROUP INC /BD,CENTURY FINANCIAL GROUP INC /BD,0.985612
47542,AGRICAPITAL SECURITIES INC /BD,FUND SECURITIES INC /BD,0.993694
75774,ANDERSON KELLY J.,ANDERSON KELLY,0.911791
46337,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VL 107,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VL 100,0.946809
42088,"AEI 2006 VENTURE INVESTMENTS IV, LLC",AEI 2006 VENTURE INVESTMENTS II LLC,0.919949
89719,"ARES CLO MANAGEMENT XXXI, L.P.","ARES CLO MANAGEMENT XXI, L.P.",0.948581
18690,ADOLPH COORS CO,COORS ADOLPH CO,0.909914
90607,ARES XXXII CLO LTD.,"ARES XXXIII CLO, LTD.",0.980905


In [11]:
# import spacy

# nlp = spacy.load('en')  # make sure to use larger model!
# tokens = nlp(u'dog cat banana')

# for token1 in f['left_side']:
#     for token2 in f['right_side']:
#         print(token1, token2, token1.similarity(token2))

In [12]:
matches_df_filt = matches_df_copy[matches_df_copy['similarity']<0.85]
matches_df_filt=matches_df_filt[matches_df_filt['similarity']>0.7]
matches_df_filt=matches_df_filt.reset_index(drop=True)

In [13]:
import distance, jellyfish
distance.nlevenshtein(matches_df_filt['left_side'][0], matches_df_filt['right_side'][0])
jellyfish.jaro_distance(matches_df_filt['left_side'][0], matches_df_filt['right_side'][0])

0.8141945773524721

In [14]:
import jellyfish
def calculate_lev(row):
    return distance.nlevenshtein(row['left_side'], row['right_side'])
def calculate_jaro(row):
    return jellyfish.jaro_distance(row['left_side'], row['right_side'])

matches_df_filt['levenshtein'] = matches_df_filt.apply(calculate_lev, axis=1)
matches_df_filt['jaro'] = matches_df_filt.apply(calculate_jaro, axis=1)

In [15]:
matches_df_filt

Unnamed: 0,left_side,right_side,similarity,levenshtein,jaro
0,& S MEDIA GROUP LLC,HH & S MEDIA GROUP LLC,0.845132,0.136364,0.814195
1,02 MEDTECH INC,O2 MEDTECH INC,0.840850,0.071429,0.952381
2,"0210, LLC",90210 LLC,0.845242,0.222222,0.925926
3,03 ENTERTAINMENT GROUP INC,REAL ENTERTAINMENT GROUP INC,0.806923,0.142857,0.829518
4,1 JOINT VENTURE,PETERS 1 JOINT VENTURE,0.814375,0.318182,0.738384
5,1 JOINT VENTURE,CORAL 1 JOINT VENTURE,0.805682,0.285714,0.838095
6,1 LANE TECHNOLOGIES CORP,ANE TECHNOLOGIES INC,0.818192,0.291667,0.735185
7,1-800 IDEAS COM INC,800 IDEAS INC,0.812487,0.315789,0.869096
8,"10 OUTDOOR ADVERTISING, INC.",ADAMS OUTDOOR ADVERTISING INC,0.823358,0.241379,0.825465
9,"11 RONIIN, LLC","RONIIN, LLC",0.802134,0.214286,0.837662


In [16]:
import re

string="124134 asda sdfsdfsg43534,:<>&+"

result=re.sub(r'[^a-zA-Z]',"",string)
result

'asdasdfsdfsg'

In [17]:
# def string_removal(row):
#     return re.sub(r'[^a-zA-Z]',"",row['leftside']),re.sub(r'[^a-zA-Z]',"",row['rightside'])

# matches_df_filt['left_clean'], matches_df_filt['right_clean']=matches_df_filt.apply(string_removal, axis=1)
