In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
import time
from nltk.metrics.distance import edit_distance
from nltk.metrics.distance import jaccard_distance


STrain = pd.read_csv('STrain.csv', sep = '|')
G = pd.read_csv('G.csv', sep = '|')
merge = STrain.merge(G, how = 'left', on = 'company_id', suffixes = ['STrain','R'])

# Remove punctuations and lowercase

def remove_punctuations(X):
    translator = X.maketrans('', '', string.punctuation)
    return X.translate(translator)

# remove lowercase for comparing initials, remove space directly
def remove_lowercase(X):
    translator = X.maketrans('', '', string.ascii_lowercase)
    return X.translate(translator).replace(' ','')

def remove_whitespace(X):
    # remove the space and make lowercase here
    return X.replace(' ', '')

def make_lower(X):
    # convert to lower case
    return X.lower()

# remove all the punctuations
G['name_no_punc'] = G['name'].apply(remove_punctuations)
# after removing the punctuations, creating another one with only capital letters
G['name_capital'] =G['name_no_punc'].apply(remove_lowercase)
G['name_no_punc'] = G['name_no_punc'].apply(make_lower)
G['name_no_punc_no_space'] = G['name_no_punc'].apply(remove_whitespace)

# Same transformation for STrain
STrain['name_no_punc'] = STrain['name'].apply(remove_punctuations)
STrain['name_capital'] =STrain['name_no_punc'].apply(remove_lowercase)
STrain['name_no_punc'] = STrain['name_no_punc'].apply(make_lower)
STrain['name_no_punc_no_space'] = STrain['name_no_punc'].apply(remove_whitespace)

print('There are total', STrain[STrain['company_id'] == -1].shape[0], 'unmatched companies.')

There are total 30348 unmatched companies.


In [20]:
import time
from nltk.metrics.distance import edit_distance
from nltk.metrics.distance import jaccard_distance
def find_the_word(X, Matrix):
    '''Find the word in bag of words of G
         using the column "name_no_punc" '''
    
    t = time.time()
    # split the name into separate words
    words = X.split()
    length = len(words)
    # Find the word in the bag of words of G
    found = False
    j = 0
    while (found == False) & (j < length):
        found = words[j] in Matrix.columns 
        j = j + 1
        
    if found:
        sub_matrix = Matrix[Matrix[words[j-1]]>=1]
        return sub_matrix.index # return index
    else:
        return None
    
def find_the_word_whole(X,Matrix):
    to_compare_dict = {}
    '''Input the whole training company (name_no_punc), and output the index to dictionary'''
    for i in X.index:
        words = X.loc[i].split()
        length = len(words)
        # Find the word in the bag of words of G
        found = False
        j = 0
        while (found == False) & (j < length):
            found = words[j] in Matrix.columns 
            j = j + 1
        
        if found:
            to_compare_dict[i] = Matrix[Matrix[words[j-1]]>=1]
           
        else:
            to_compare_dict[i] = None
                
        return to_compare_dict
    
    
    
def compare_jacaard(to_compared,X,jd_threshold):
    '''use the column "name_no_punc_no_space" '''
    best_matching_index = -1
    for k in to_compared.index:
        jd = jaccard_distance(set(to_compared.loc[k]), set(X))
        if jd <= jd_threshold:
            best_matching_index = k
            break
    
    return best_matching_index

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_df=0.03)
matrix = vectorizer.fit_transform(G['name_no_punc']).todense() 
#sum_words = matrix.sum(axis=0) 
features = vectorizer.get_feature_names()
#word_count = pd.DataFrame(sum_words,index = features, column = ['count'])
#words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
#words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#print( vectorizer.vocabulary_ )

In [2]:
# Try TF-IDF and compute cosine similarity for Train and G
t = time.time()
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=1, analyzer='char', ngram_range = (3,3))
tf_idf_G = tfidf.fit_transform(G['name_no_punc'])
timespent = time.time() - t
print('TD-IDF of G takes {} seconds'.format(timespent))

TD-IDF of G takes 11.448935985565186 seconds


In [3]:
# Try TF-IDF and compute cosine similarity for Train and G (space removed)
t = time.time()
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_nospace = TfidfVectorizer(min_df=1, analyzer='char', ngram_range = (2,4))
tf_idf_G = tfidf_nospace.fit_transform(G['name_no_punc_no_space'])
timespent = time.time() - t
print('TD-IDF of G takes {} seconds'.format(timespent))

TD-IDF of G takes 31.79740595817566 seconds


In [3]:
# Transform train with tfidf
t = time.time()
tf_idf_STrain = tfidf.transform(STrain['name_no_punc'])
timespent = time.time() - t
print('TD-IDF of STrain takes {} seconds'.format(timespent))

TD-IDF of STrain takes 2.4644408226013184 seconds


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
t = time.time()
similarity_score = cosine_similarity(tf_idf_G, tf_idf_STrain, dense_output=False)
timespent = time.time() - t
print('cosine similarity takes {} seconds'.format(timespent))

In [34]:
from  scipy.sparse import find
from sklearn.metrics.pairwise import cosine_similarity

def matching(Sample_slice,tf_idf_G, threshold):
    '''Sample_slice is a sparse matrix'''
    i,v, j =find(Sample_slice)
    # V are the nonzero columns
    x,y,z = find(tf_idf_G[:,v])
    sub_G = tf_idf_G[:,v]
    slice_G = sub_G[x,:]
    similarity_score = cosine_similarity(slice_G, Sample_slice[:,v], dense_output=True)
    max_index= similarity_score.argmax()
    if max_index >= threshold:
        return x[max_index]
    else:
        return -1

In [48]:
tf_idf_G[0,[1,2]]

<1x2 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [4]:
grams = tfidf.get_feature_names()

In [36]:
t = time.time()
Sample = tf_idf_STrain[0:100]
predict = []
for i in range(Sample.shape[0]):
    pred = matching(Sample[i,:],tf_idf_G, 0.75)
    predict.append(pred)
    
timespent = time.time() - t
print('sample matching takes {} seconds'.format(timespent))

sample matching takes 15.51825499534607 seconds


In [64]:
company_id[predict]


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


 342766    250537.0
 84431     356624.0
 422974    420749.0
 432988    523467.0
 75219     231108.0
 391805    111934.0
 188971    355140.0
 273215    205639.0
 372128    485612.0
 330810    589052.0
 329018    210358.0
 335331      9529.0
 206289    453809.0
 300020     85684.0
-1              NaN
 10345     253824.0
 56044     545661.0
 215261    524676.0
 14081      95247.0
-1              NaN
 314718    271364.0
 215941    151846.0
 312791    197835.0
 397251    293343.0
 161147    164015.0
 229674    607912.0
 268070    168375.0
 261931    177092.0
 275565    340707.0
 298655    477352.0
             ...   
 26226     524563.0
 376762    105633.0
 385       461136.0
 80081     173621.0
 157175    125258.0
 239768    177010.0
 220547    594766.0
 227734    154285.0
 133048    600713.0
 23593     629407.0
 265943    169190.0
 114017    206579.0
 111223    164752.0
 285267    629724.0
 425638    421753.0
 226572    293623.0
 92614     612614.0
 428387    329427.0
 170422    571672.0


In [60]:
Sample = STrain.sample(n = 100)
Sample.head()

Unnamed: 0,train_index,name,company_id,name_no_punc,name_capital,name_no_punc_no_space
16613,16613,MP Prope. LLC,-1,mp prope llc,MPPLLC,mppropellc
10499,10499,Humanagroup Holding AB,320888,humanagroup holding ab,HHAB,humanagroupholdingab
4300,4300,"Ianc Partners Funds, Artisan. - Ianc Developin...",475596,ianc partners funds artisan ianc developing w...,IPFAIDWF,iancpartnersfundsartisaniancdevelopingworldfund
3671,3671,BBM SA,354740,bbm sa,BBMSA,bbmsa
31073,31073,Captal Impera Spółka Akcyjna,-1,captal impera spółka akcyjna,CISółA,captalimperaspółkaakcyjna


In [55]:
ans = company_id[[1,2,1]]

In [56]:
ANS = pd.DataFrame([],columns=['company_id'],index= [4,5,6])

In [57]:
ANS['company_id'] = company_id

In [66]:
G.reset_index()

Unnamed: 0,index,company_id,name,name_no_punc,name_capital,name_no_punc_no_space
0,0,634022,PRIMCOM SA,primcom sa,PRIMCOMSA,primcomsa
1,1,324497,The David Isaacs Fund,the david isaacs fund,TDIF,thedavidisaacsfund
2,2,280848,Bramor Enterprises Limited,bramor enterprises limited,BEL,bramorenterpriseslimited
3,3,432662,NAVEXIM S.A.,navexim sa,NAVEXIMSA,naveximsa
4,4,524224,Magal Group SA,magal group sa,MGSA,magalgroupsa
5,5,513585,Marly SPF S.A.,marly spf sa,MSPFSA,marlyspfsa
6,6,354496,I.T APPARELS LIMITED,it apparels limited,ITAPPARELSLIMITED,itapparelslimited
7,7,381944,VX 30.141 ApS,vx 30141 aps,VX30141AS,vx30141aps
8,8,526057,Rydex ETF Trust - Guggenheim S&P 500 Equal Wei...,rydex etf trust guggenheim sp 500 equal weigh...,RETFTGSP500EWETF,rydexetftrustguggenheimsp500equalweightetf
9,9,34381,Rydex Series Funds - Retailing Fund,rydex series funds retailing fund,RSFRF,rydexseriesfundsretailingfund


In [19]:
from sklearn.metrics.pairwise import cosine_similarity
t = time.time()
similarity_score = cosine_similarity(slice_G, tf_idf_STrain[0,v], dense_output=True)
timespent = time.time() - t
print('cosine similarity takes {} seconds'.format(timespent))

cosine similarity takes 0.022331953048706055 seconds


In [27]:
max_index= similarity_score.argmax()
max_index == 0.7

False

In [29]:
# use max_index to locate G_index
G_index = x[max_index]
print(G_index)

342766


In [24]:
similarity_score[0]

array([0.26080726])

In [43]:
test = 'A/B/C'.split('/')[-1]
l = test.split('/')
l[-1]

'C'

In [9]:
tf_idf_STrain_pd = pd.DataFrame(tf_idf_STrain.todense(), columns = grams, index = STrain.index)

In [12]:
tf_idf_G_pd = pd.DataFrame(tf_idf_G.todense(), columns = grams, index = G.index)

In [16]:
row = tf_idf_STrain_pd.iloc[0,:]

In [143]:
A = {'financial','service'}
B = {'service','financial'}
C = {'financial','time','series'}
D = 'financialservice'
E = 'servicefinancial'
F = 'financialtimeseries'
d = set(D)
e = set(E)
f = set(F)
y = edit_distance(D,F)/len(D)
x = jaccard_distance(d,f)
print(x)
print(y)

print(z)

0.25
0.4375
{'a', 'e', 's', 'i', 'c', 'r', 'f', 'n', 'v', 'l'}


In [14]:
# convert the results of bag of words in to dataframe

Matrix = pd.DataFrame(matrix, columns = features, index = G.index)

In [15]:
vectorizer.stop_words_ 

{'bv', 'fund', 'gmbh', 'inc', 'limited', 'llc', 'ltd', 'srl', 'trust'}

In [16]:
stop_words = list(vectorizer.stop_words_ )

    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Columns: 238928 entries, 00 to žūb
dtypes: int64(238928)
memory usage: 801.5 GB


In [None]:
# Break down the summation, in order to find the most frequent words
# 10000 at a time
import numpy as np
iteration = int(np.ceil(Matrix.shape[0] / 10000))

total_word_count = np.zeros(Matrix.shape[1])
for i in np.arange(0,iteration):
    lower_index = int(i*10000)
    upper_index = (i+1)*10000
    print(lower_index, upper_index)
    sub_matrix = Matrix.iloc[lower_index:upper_index]
    sub_word_count = sub_matrix.sum(axis = 0)
    total_word_count = total_word_count + sub_word_count
    
# There should be some residual
res_word_count = Matrix.iloc[iteration*10000:-1].sum(axis=0)
total_word_count = total_word_count + res_word_count

0 10000
10000 20000
20000 30000
30000 40000
40000 50000
50000 60000
60000 70000
70000 80000
80000 90000
90000 100000


In [1]:
450123/10000

45.0123

In [15]:
# Test matching_procedure

Sample = STrain.sample(n=100,random_state=10)



In [1]:
index= find_the_word(Sample.at[63420,'name_no_punc'], Matrix)
print(index)
print(G['name_no_punc_no_space'].loc[index])
print(G.at[216462,'name_no_punc_no_space'])

NameError: name 'find_the_word' is not defined

In [None]:
Sample_predict = pd.DataFrame([],index = Sample.index,columns = ['predict'])
Sample_predict['predict'] = -1

for i in Sample.index:
    company = Sample.at[i,'name_no_punc']
    compare_index = find_the_word(company, Matrix)
    print(i)
    if compare_index is not None:
        print(i)
        to_compared = G['name_no_punc_no_space'].loc[compare_index]
    
        company_train = Sample.at[i,'name_no_punc_no_space']
        predict = compare_jacaard(to_compared,company_train,0.1)
        Sample_predict.at[i,'predict'] = predict
        

33226
33226
64804
64804
39763
39763
51270
51270
9698
9698
5948
5948
27955
27955
55001
55001
50875
50875
47755
47755
29430
29430
32953
32953
6808
6808
32927
32927
64298
64298
96492
96492
31836
31836
97276
97276
67716
67716
2144
2144
22232
22232
75323
75323
63124
63124
11476
11476
77635
77635
76295
76295
55644
55644
18506
18506
91698
91698
65435
90450
90450
38019
38019
63469
63469
44824
44824
87863
87863
32118
32118
66512
66512
64395
64395
63420
63420


In [1]:
Sample

NameError: name 'Sample' is not defined

In [1]:
outcome, Round2 = matching_procedure(G,Sample,Matrix,0.1,4)

NameError: name 'Round2' is not defined

In [52]:
merge.loc[Sample.index]

Unnamed: 0,train_index,nameSTrain,company_id,nameR
33226,33226,HXRUK III LIMITED,539375,HXRUK III LIMITED
64804,64804,SWARCO Traffic Funds Austria GmbH,84806,SWARCO Traffic Austria GmbH
39763,39763,Ostersetzer & Co. eGse. m.b.H.,-1,
51270,51270,FACADEfip ATLANTIQUE,210136,FIP FACADE ATLANTIQUE
9698,9698,Major IP,-1,
5948,5948,BMC DIFFUSION,353477,BMC DIFFUSION
27955,27955,Adept Ltd Investment Management Public Limited...,-1,
55001,55001,Silm. Oyj Global,16000,Silmäasema Oyj
50875,50875,Ferrari S.p.A.,-1,
47755,47755,Asso ciation de Commercialisation Holding de P...,454000,Association de Commercialisation de Pneus et L...


In [23]:
Sample.head()

Unnamed: 0,train_index,name,company_id,name_no_punc,name_capital,name_no_punc_no_space,predict,loss
20353,20353,BB PRINCIPI DI FEUDO SOCIETA' AGRICOLA A RESPO...,393379,BB PRINCIPI DI FEUDO SOCIETA AGRICOLA A RESPON...,BBPRINCIPIDIFEUDOSOCIETAAGRICOLAARESPONSABILIT...,bbprincipidifeudosocietaagricolaaresponsabilit...,-1,1
93040,93040,bMOLINOX - SRL,189514,bMOLINOX SRL,MOLINOXSRL,bmolinoxsrl,-1,1
51054,51054,GbmH Holding SCB Inte. & Co. KG,85240,GbmH Holding SCB Inte Co KG,GHHSCBICKG,gbmhholdingscbintecokg,-1,1
82879,82879,Matias Fastighets Company 2 AB,445985,Matias Fastighets Company 2 AB,MFC2AB,matiasfastighetscompany2ab,-1,1
82312,82312,Macro - Multi advisers Opportunities,95736,Macro Multi advisers Opportunities,MMO,macromultiadvisersopportunities,11191,5


In [149]:
 # Count of new document
new_doc = ['FIDC IIIllc'.lower()]
new_count  = vectorizer.transform(new_doc).todense()

In [None]:
import numpy as np
np.matmul(Matrix,np.transpose(new_count))

In [125]:
vectorizer.vocabulary_['financial']

77950

In [60]:
len(vectorizer.get_feature_names())

238937

In [62]:
find_the_word = 'financial'
vectorizer.vocabulary_ [find_the_word]

G.loc[264227]

In [63]:
location = vectorizer.vocabulary_ [find_the_word]
features[location]

'financial'

In [71]:
# all the company in ground truth, and all the vocabularies
import numpy as np
np.argmax(predict)
predict[3]

432988

In [68]:
if find_the_word in features:
    Index = Matrix[Matrix[find_the_word]>=1]
else:
    print('no such word')
    
    

In [96]:
import time

In [97]:
t = time.time()
Index = Matrix[Matrix[find_the_word]>=1].index
Financial = G.loc[Index]
elapsed = time.time() - t
print('time spent for computing this is', elapsed)

time spent for computing this is 2.596104860305786


In [79]:
G['company_id'][264277]

448627

In [80]:
G.loc[G['company_id']==264277]

Unnamed: 0,company_id,name,name_no_punc,name_capital,name_no_punc_no_space
406184,264277,Blanchard Aid Propco LLC,blanchard aid propco llc,BAPLLC,blanchardaidpropcollc


In [98]:
t = time.time()
Index = Matrix[Matrix[find_the_word]>=1]
Financial = G.loc[Index.index]
elapsed = time.time() - t
print('time spent for computing this is', elapsed)

time spent for computing this is 1.97940993309021


In [72]:
Financial

Unnamed: 0,company_id,name,name_no_punc
18,260264,Temasek Financial (II) Private Limited,Temasek Financial II Private Limited
139,135139,Nidaros Financial S.A.,Nidaros Financial SA
150,179135,VANARES FINANCIAL LTD,VANARES FINANCIAL LTD
266,469229,"Chatham Financial Europe, Ltd",Chatham Financial Europe Ltd
443,529778,Lykke Financial Capital IVS,Lykke Financial Capital IVS
2295,621087,"Resource Financial Services, Inc.",Resource Financial Services Inc
3945,245201,REWARD FINANCIAL SERVICES LTD,REWARD FINANCIAL SERVICES LTD
4088,496479,AXIOM FINANCIAL SERVICES LTD,AXIOM FINANCIAL SERVICES LTD
4107,527195,ISLAND FINANCIAL SOLUTIONS LIMITED,ISLAND FINANCIAL SOLUTIONS LIMITED
4239,75144,FINANCIAL ADVICE CENTRE LIMITED,FINANCIAL ADVICE CENTRE LIMITED


In [46]:
# How long does it take to apply bag of words to the whole G
import timeit
matrix_G = vectorizer.fit_transform(G['name']).todense()

In [45]:
G.shape

(450256, 2)

In [44]:
# join and remove lowercase
import string
def join_and_remove_low(X):
    y = ''.join(X)
    
    z = y.translate(None,string.ascii_lowercase)
    zz = z.translate(None,',.-()"')

result = Sample_G['name'].apply(join_and_remove_low)                  

TypeError: translate() takes exactly one argument (2 given)

In [45]:
print(string.ascii_lowercase)

abcdefghijklmnopqrstuvwxyz


In [51]:
import string
ss = 'AbCdEfG'
t = ss.maketrans(string.ascii_lowercase, None )

TypeError: maketrans() argument 2 must be str, not None

In [None]:
#### Ideas
# remove sapce and punctuations
# remove lower cases
# compare with levenshein distance
