# Fuzzy Match, Preprocessing

In [24]:
#pip install fuzzywuzzy

In [1]:
import pandas as pd
import openpyxl

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

from tqdm import tqdm



In [2]:
dbpedia = pd.read_csv("../Data/DBPedia_Companies_Preprocessed.csv")
sbti = pd.read_excel("../Data/companies-taking-action.xlsx")
forbes=pd.read_csv("../Data/Forbes_2000_top_company_CLNQ11.csv")

In [3]:
#Create Unique IDs for forbes and sbti dataset
sbti.index = ["SBTI_" + str(i) for i in range(1, len(sbti)+1)]
sbti.index.name = "UID"
sbti = sbti.reset_index()

forbes.index = ["Forbes_" + str(i) for i in range(1, len(forbes)+1)]
forbes.index.name = "UID"
forbes = forbes.reset_index()

In [4]:
# use same column name for company name

dbpedia["name"] = dbpedia["label"]
sbti["name"] = sbti["Company Name"]
forbes["name"] = forbes["Organization Name"]


## Fuzzy match dataframes by company name

In [110]:
def fuzzymatch_dfs_on_name(df1, df2, min_similarity_score):

    def get_df_name(df):
        name =[x for x in globals() if globals()[x] is df][0]
        return name

    df1_name = get_df_name(df1)
    df2_name = get_df_name(df2)
    match_name = df1_name + "_" + df2_name

    fuzzy_match_results = []
    match_id = 1

    df1_names = df1["name"].tolist()
    df2_names = df2["name"].tolist()

    for name1 in tqdm(df1_names):
        
        match, score = process.extractOne(name1, df2_names, scorer=fuzz.token_sort_ratio)
        
        if score >= min_similarity_score:
            match_info = {
                "Match ID": match_name + "_" + str(match_id),
                df1_name + "_uid": df1[df1['name'] == name1]['UID'].values[0],
                df2_name + "_uid": df2[df2['name'] == match]['UID'].values[0],
                df1_name + "_company": name1,
                df2_name + "_company": match,
                "Similarity Score": score
            }

        

            fuzzy_match_results.append(match_info)
            match_id += 1

    fuzzy_match_df = pd.DataFrame(fuzzy_match_results)

    return fuzzy_match_df



run fuzzy matching with each dataset combination:

In [113]:
# dbpedia & forbes

fuzzy_match_dbpedia_forbes = fuzzymatch_dfs_on_name(dbpedia2, forbes2, 10)
print(fuzzy_match_dbpedia_forbes.head())

#save
fuzzy_match_dbpedia_forbes.to_excel("../Python/Identiy Resolution/dbp_forbes_fuzzy_match_results.xlsx", index=False)


['Berkshire Hathaway', 'ICBC', 'Saudi Arabian Oil Company (Saudi Aramco)', 'JPMorgan Chase', 'China Construction Bank']


100%|██████████| 5/5 [00:00<00:00, 731.22it/s]

BioNTech
ICBC
Royal Ceramics
JPMorgan Chase
Vodafone Kabel Deutschland
Saudi Arabian Oil Company (Saudi Aramco)
Union Pacific Corporation
Saudi Arabian Oil Company (Saudi Aramco)
Vallibel One
ICBC
             Match ID                             dbpedia2_uid  \
0  dbpedia2_forbes2_1  0    DBPedia_0
Name: UID, dtype: object   
1  dbpedia2_forbes2_2  1    DBPedia_1
Name: UID, dtype: object   
2  dbpedia2_forbes2_3  2    DBPedia_2
Name: UID, dtype: object   
3  dbpedia2_forbes2_4  3    DBPedia_3
Name: UID, dtype: object   
4  dbpedia2_forbes2_5  4    DBPedia_4
Name: UID, dtype: object   

                      forbes2_uid            dbpedia2_company  \
0  1    2
Name: UID, dtype: int64                    BioNTech   
1  3    4
Name: UID, dtype: int64              Royal Ceramics   
2  2    3
Name: UID, dtype: int64  Vodafone Kabel Deutschland   
3  2    3
Name: UID, dtype: int64   Union Pacific Corporation   
4  1    2
Name: UID, dtype: int64                Vallibel One   

               




In [60]:
# dbpedia & sbti
fuzzy_match_dbpedia_sbti = fuzzymatch_dfs_on_name(dbpedia, sbti, 10)

print(fuzzy_match_dbpedia_sbti.head())

#save
#fuzzy_match_dbpedia_sbti.to_excel("../Python/Identiy Resolution/dbp_sbti_fuzzy_match_results.xlsx", index=False)

Unnamed: 0,UID,company,label,industries,keyPeople,founders,hqLocationCountry,revenue,assets,netIncome,foundingYear,type
0,DBPedia_0,http://dbpedia.org/resource/BioNTech,BioNTech,Biotechnology,"Ryan Richardson, Sean Marett, Sierk Poetting, ...","Helmut Jeggle, Michael Motschmann, Thomas Strü...",Germany,17761766400,14378570000.0,344803800.0,2008.0,Societas Europaea
1,DBPedia_1,http://dbpedia.org/resource/Royal_Ceramics,Royal Ceramics,Capital good,"A. M. Weerasinghe, Aravinda Perera, H. Amarase...",,Sri Lanka,57545,86241.0,13690.0,1990.0,Public company
2,DBPedia_2,http://dbpedia.org/resource/Vodafone_Kabel_Deu...,Vodafone Kabel Deutschland,Telecommunication,Manuel Cubero,,Germany,1719790080,2685880000.0,232124700.0,1980.0,Subsidiary
3,DBPedia_3,http://dbpedia.org/resource/Union_Pacific_Corp...,Union Pacific Corporation,,Lance M. Fritz,,United States,21800000000,63500000000.0,6500000000.0,1969.0,Public company
4,DBPedia_4,http://dbpedia.org/resource/Vallibel_One,Vallibel One,Conglomerate (company),"Dinusha Bhaskaran, H. Amarasekara",Dhammika Perera,Sri Lanka,96859,285210.0,20580.0,2010.0,Public company


In [7]:
# forbes & sbti
fuzzy_match_forbes_sbti = fuzzymatch_dfs_on_name(forbes, sbti, 10)

#print(fuzzy_match_forbes_sbti.head())

#save
fuzzy_match_forbes_sbti.to_excel("../Python/Identiy Resolution/forbes_sbti_fuzzy_match_min10_results.xlsx", index=False)

100%|██████████| 1999/1999 [07:47<00:00,  4.28it/s]

        Match ID                              forbes_uid  \
0  forbes_sbti_1  0    Forbes_1
Name: UID, dtype: object   
1  forbes_sbti_2  1    Forbes_2
Name: UID, dtype: object   
2  forbes_sbti_3  2    Forbes_3
Name: UID, dtype: object   
3  forbes_sbti_4  3    Forbes_4
Name: UID, dtype: object   
4  forbes_sbti_5  4    Forbes_5
Name: UID, dtype: object   

                                     sbti_uid  \
0  5149    SBTI_5150
Name: UID, dtype: object   
1  1264    SBTI_1265
Name: UID, dtype: object   
2  4597    SBTI_4598
Name: UID, dtype: object   
3  2879    SBTI_2880
Name: UID, dtype: object   
4    778    SBTI_779
Name: UID, dtype: object   

                             forbes_company           sbti_company  \
0                        Berkshire Hathaway          Sunway Berhad   
1                                      ICBC                    CRB   
2  Saudi Arabian Oil Company (Saudi Aramco)    S-BIC COMPANY, LTD.   
3                            JPMorgan Chase            Jordanes 




In [111]:
# test
sbti_test = sbti.head(10)
forbes_test = forbes.head(10)
fuzzy_match_forbes_sbti = fuzzymatch_dfs_on_name(forbes_test, sbti_test, 10)

#print(fuzzy_match_forbes_sbti.head())

#save
#fuzzy_match_forbes_sbti.to_excel("../Python/Identiy Resolution/forbes_sbti_fuzzy_match_min10_results.xlsx", index=False)

100%|██████████| 10/10 [00:00<00:00, 501.16it/s]


In [112]:
# extract uids from obejct!!
fuzzy_match_forbes_sbti.head()

Unnamed: 0,Match ID,forbes_test_uid,sbti_test_uid,forbes_test_company,sbti_test_company,Similarity Score
0,forbes_test_sbti_test_1,Forbes_1,SBTI_7,Berkshire Hathaway,3B-Fibreglass,32
1,forbes_test_sbti_test_2,Forbes_2,SBTI_8,ICBC,3i Group plc,25
2,forbes_test_sbti_test_3,Forbes_3,SBTI_1,Saudi Arabian Oil Company (Saudi Aramco),(ACIP) Alexandria Company for Industrial Packages,42
3,forbes_test_sbti_test_4,Forbes_4,SBTI_1,JPMorgan Chase,(ACIP) Alexandria Company for Industrial Packages,26
4,forbes_test_sbti_test_5,Forbes_5,SBTI_9,China Construction Bank,3P Innovation Ltd,40


In [16]:
fuzzy_match_forbes_sbti["forbes_uid2"] = fuzzy_match_forbes_sbti["forbes_uid"]

In [81]:
#index_to_extract = 0  # Change this index to the desired position
#fuzzy_match_forbes_sbti['forbes_uid2'] = fuzzy_match_forbes_sbti['forbes_uid'].apply(lambda x: x[index_to_extract] if len(x) > index_to_extract else None)

def extract_value(row, colname):
    index_to_extract = row.name  # Row index as the index to extract
    inner_list = row[colname]
    value = inner_list[index_to_extract]
    return value

#fuzzy_match_forbes_sbti['forbes_uid'] = fuzzy_match_forbes_sbti.apply(lambda x: extract_value(x, colname= "forbes_uid"), axis=1)
fuzzy_match_forbes_sbti['sbti_uid2'] = fuzzy_match_forbes_sbti.apply(lambda x: extract_value(x, colname= "sbti_uid"), axis=1)


KeyError: 0

In [102]:
t = fuzzy_match_forbes_sbti["sbti_uid"][0].index

KeyError: 'sbti_uid'