In [None]:
import warnings
warnings.filterwarnings("ignore")
import data_prep_ditto_util as prep_util
import data_prep_consts as prep_consts
%load_ext autoreload
%autoreload 2

Read in prepared Data

In [91]:
# Load the prepared COYPU and SCM datasets (in that order) from the CSV paths
# configured in prep_consts.
coypu, scm = (
    prep_util.read_csv_file(csv_path)
    for csv_path in (prep_consts.COYPU_PATH_PREP, prep_consts.SCM_PATH_PREP)
)

Generate country-based datasets

In [92]:
# One frame per country code for each source. reset_index(drop = True)
# renumbers the rows from 0, which the index-based loops further down rely on.
scm_germany, scm_china, scm_usa = (
    scm[scm.Country == country_code].reset_index(drop = True)
    for country_code in ('DE', 'CN', 'US')
)

coypu_germany, coypu_china, coypu_usa = (
    coypu[coypu.Country == country_code].reset_index(drop = True)
    for country_code in ('DE', 'CN', 'US')
)

In [93]:
# Replace missing parsed company names with the empty string in every
# country frame, so the name comparison further down always receives a string.
for country_frame in (
    scm_germany, scm_china, scm_usa,
    coypu_germany, coypu_china, coypu_usa,
):
    country_frame['Parsed_Company'] = country_frame['Parsed_Company'].fillna('')

Define COL VAL column mappings for COYPU and SCM

In [94]:
# Column -> COL VAL tag mapping for the COYPU data.
col_val_builder_coy = (
    prep_util.ColValueBuilder()
    .add_column('Parsed_Company', 'COMPANY')
    .add_column('Company_Category', 'CO_CATEGORY')
    .add_column('long', 'LONG')
    .add_column('lat', 'LAT')
)

# Column -> COL VAL tag mapping for the SCM data (same four columns and tags).
col_val_builder_scm = (
    prep_util.ColValueBuilder()
    .add_column('Parsed_Company', 'COMPANY')
    .add_column('Company_Category', 'CO_CATEGORY')
    .add_column('long', 'LONG')
    .add_column('lat', 'LAT')
)

Preparing Ditto Blocking File - Generating COL VAL Pairs

In [98]:
# Running total of positive pairs written by generate_positive_country_pairs
# across all calls in this kernel session.
counter_positive = 0

def generate_positive_country_pairs(country_scm, country_coypu, filename, threshold = None):
  """Write one label-1 COL VAL line for every matching SCM/COYPU row pair.

  Every row of ``country_scm`` is compared against every row of
  ``country_coypu`` via ``prep_util.compare_company_names``. For each pair
  where that call returns a truthy value, a pair string is built with
  ``prep_util.form_colval_pairs`` (label ``1``), printed, and handed to
  ``prep_util.save_txt_to_file`` together with a trailing newline.

  Args:
    country_scm: table-like object with 'Country', 'Parsed_Company',
      'Company_Category', 'long' and 'lat' columns, addressable as
      ``table[column][i]`` for ``i`` in ``0..len-1``.
    country_coypu: same layout as ``country_scm``.
    filename: path passed through to ``prep_util.save_txt_to_file``.
    threshold: value forwarded to ``compare_company_names``. ``None`` (the
      default) means ``prep_consts.COMPARE_COMPANY_THRESHOLD``, looked up at
      call time so a reloaded ``prep_consts`` is picked up without re-running
      this cell.

  Side effects:
    Increments the module-level ``counter_positive`` once per written pair.
  """
  global counter_positive

  if threshold is None:
    threshold = prep_consts.COMPARE_COMPANY_THRESHOLD

  scm_names = country_scm['Parsed_Company']
  coypu_names = country_coypu['Parsed_Company']

  # Iterate over ALL rows: the previous `len(...) - 1` bound silently skipped
  # the last row of both tables.
  for i in range(len(country_scm['Country'])):
    for j in range(len(country_coypu['Country'])):

      if not prep_util.compare_company_names(scm_names[i], coypu_names[j], threshold):
        continue

      counter_positive += 1
      matched_pair = prep_util.form_colval_pairs([
                                                  ['COMPANY', scm_names[i]],
                                                  ['CO_CATEGORY', country_scm['Company_Category'][i]],
                                                  ['LONG', country_scm['long'][i]],
                                                  ['LAT', country_scm['lat'][i]]
                                                  ],
                                                 [
                                                  ['COMPANY', coypu_names[j]],
                                                  ['CO_CATEGORY', country_coypu['Company_Category'][j]],
                                                  ['LONG', country_coypu['long'][j]],
                                                  ['LAT', country_coypu['lat'][j]]
                                                  ],
                                                  1)
      print(matched_pair)
      print('reached index i: ', i, 'reached index j: ', j)

      prep_util.save_txt_to_file(filename, matched_pair + '\n')

In [None]:
# Build the positive pairs country by country; each call writes to its own
# blocker file.
print('Starting DE country search')
generate_positive_country_pairs(
    country_scm = scm_germany,
    country_coypu = coypu_germany,
    filename = './data/blocker_data_de.txt',
)

# The CN run overrides the default comparison threshold with 0.15.
print('Starting CN country search')
generate_positive_country_pairs(
    country_scm = scm_china,
    country_coypu = coypu_china,
    filename = './data/blocker_data_cn.txt',
    threshold = 0.15,
)

print('Starting US country search')
generate_positive_country_pairs(
    country_scm = scm_usa,
    country_coypu = coypu_usa,
    filename = './data/blocker_data_us.txt',
)

In [67]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    counting is done (the bare ``open(...).readlines()`` form left it open).
    """
    with open(path, encoding = 'utf-8') as handle:
        return sum(1 for _ in handle)

# Number of lines currently stored in each per-country blocker file.
block_us = _count_lines("./data/blocker_data_us.txt")
block_de = _count_lines("./data/blocker_data_de.txt")
block_cn = _count_lines("./data/blocker_data_cn.txt")

In [101]:
import random

def generate_negative_samples(scm_country, coypu_country, filename, sample_count, threshold = None):
  """Write ``sample_count`` randomly drawn label-0 COL VAL lines.

  Repeatedly picks one random row from ``scm_country`` and one from
  ``coypu_country``. When ``prep_util.compare_company_names`` returns a value
  equal to ``0`` for the two parsed company names, the pair is formatted with
  ``prep_util.form_colval_pairs`` (label ``0``), printed and collected. Once
  ``sample_count`` pairs are collected they are passed, newline-terminated,
  to ``prep_util.save_txt_to_file`` in a single call.

  Args:
    scm_country: table-like object with 'Country', 'Parsed_Company',
      'Company_Category', 'long' and 'lat' columns, addressable as
      ``table[column][i]`` for ``i`` in ``0..len-1``.
    coypu_country: same layout as ``scm_country``.
    filename: path passed through to ``prep_util.save_txt_to_file``.
    sample_count: number of negative lines to produce.
    threshold: value forwarded to ``compare_company_names``. ``None`` (the
      default) means ``prep_consts.COMPARE_COMPANY_THRESHOLD``; pass the same
      threshold that was used for the positive pairs if it differed.

  Notes:
    * The same index pair may be drawn more than once; duplicates are not
      filtered.
    * The loop only ends once enough non-matching pairs were found, so it
      does not terminate if no pair ever compares equal to 0.
  """
  if threshold is None:
    threshold = prep_consts.COMPARE_COMPANY_THRESHOLD

  # random.randint is inclusive on both ends, hence the "- 1".
  scm_last_idx = len(scm_country['Country']) - 1
  coy_last_idx = len(coypu_country['Country']) - 1

  negative_lines = []

  # The list length is the only counter: the previous version incremented its
  # counter twice per accepted pair and therefore stopped at roughly half of
  # sample_count.
  while len(negative_lines) < sample_count:

    rndm_scm_idx = random.randint(0, scm_last_idx)
    rndm_coy_idx = random.randint(0, coy_last_idx)

    comp_value = prep_util.compare_company_names(scm_country['Parsed_Company'][rndm_scm_idx], coypu_country['Parsed_Company'][rndm_coy_idx], threshold)

    if comp_value == 0:
      matched_pair = prep_util.form_colval_pairs([
                                                  ['COMPANY', scm_country['Parsed_Company'][rndm_scm_idx]],
                                                  ['CO_CATEGORY', scm_country['Company_Category'][rndm_scm_idx]],
                                                  ['LONG', scm_country['long'][rndm_scm_idx]],
                                                  ['LAT', scm_country['lat'][rndm_scm_idx]]
                                                  ],
                                                  [
                                                  ['COMPANY', coypu_country['Parsed_Company'][rndm_coy_idx]],
                                                  ['CO_CATEGORY', coypu_country['Company_Category'][rndm_coy_idx]],
                                                  ['LONG', coypu_country['long'][rndm_coy_idx]],
                                                  ['LAT', coypu_country['lat'][rndm_coy_idx]]
                                                  ],
                                                  0)

      print(matched_pair)
      negative_lines.append(matched_pair + '\n')

  prep_util.save_txt_to_file(filename , ''.join(negative_lines))

In [None]:
# For every country, request as many negative samples as the line count that
# was read from the same blocker file (block_de / block_cn / block_us).
generate_negative_samples(
    scm_country = scm_germany,
    coypu_country = coypu_germany,
    filename = './data/blocker_data_de.txt',
    sample_count = block_de,
)
generate_negative_samples(
    scm_country = scm_china,
    coypu_country = coypu_china,
    filename = './data/blocker_data_cn.txt',
    sample_count = block_cn,
)
generate_negative_samples(
    scm_country = scm_usa,
    coypu_country = coypu_usa,
    filename = './data/blocker_data_us.txt',
    sample_count = block_us,
)

Input data from the fuzzy matcher

In [81]:
# Results file produced by the fuzzy matcher, relative to this notebook.
fuzzy_results_path = "../fuzzy_matcher/fuzzy_results.csv"
fuzzy_results = prep_util.read_csv_file(fuzzy_results_path)

In [82]:
# Generate country-based datasets: one frame per SCM country code, each
# re-indexed from 0.
fuzzy_results_de, fuzzy_results_us, fuzzy_results_cn = (
    fuzzy_results[fuzzy_results.Country_SCM == country_code].reset_index(drop = True)
    for country_code in ('DE', 'US', 'CN')
)

In [83]:
# Columns kept for the COL VAL generation: the match score plus the SCM and
# COYPU attributes of each candidate pair.
FUZZY_COLUMNS = ['best_match_score',
                 'Company_SCM', 'Location_SCM', 'Company_Category_SCM', 'long_SCM', 'lat_SCM',
                 'Company_COYPU', 'Location_COYPU', 'Company_Category_COYPU', 'long_COYPU', 'lat_COYPU'
                ]

# Apply the same projection to all three country frames. The CN frame was
# previously never narrowed because the US selection had been pasted twice.
fuzzy_results_de = fuzzy_results_de[FUZZY_COLUMNS]
fuzzy_results_us = fuzzy_results_us[FUZZY_COLUMNS]
fuzzy_results_cn = fuzzy_results_cn[FUZZY_COLUMNS]

In [86]:
# Rows with best_match_score strictly above this value are written as label-1
# pairs. NOTE(review): the origin of -0.487 is not documented here - confirm.
THRESHOLD_FUZZY = -0.487
def create_col_val_fuzzy_positive(df, filename):
    """Write a label-1 COL VAL line for every row scoring above THRESHOLD_FUZZY.

    For each row whose ``best_match_score`` is strictly greater than the
    module-level ``THRESHOLD_FUZZY`` (read at call time), the SCM and COYPU
    attributes are formatted with ``prep_util.form_colval_pairs`` (label
    ``1``), printed, and handed newline-terminated to
    ``prep_util.save_txt_to_file``.

    Args:
        df: table-like object with 'best_match_score' and the
            'Company_*', 'Company_Category_*', 'long_*', 'lat_*' columns for
            both the ``SCM`` and ``COYPU`` suffix, addressable as
            ``df[column][i]`` for ``i`` in ``0..len-1``.
        filename: path passed through to ``prep_util.save_txt_to_file``.
    """
    scores = df['best_match_score']

    # Visit every row: the previous `len(...) - 1` bound dropped the last one.
    for i in range(len(scores)):
        if not scores[i] > THRESHOLD_FUZZY:
            continue

        matched_pair = prep_util.form_colval_pairs([
                                                    ['COMPANY', df['Company_SCM'][i]],
                                                    ['CO_CATEGORY', df['Company_Category_SCM'][i]],
                                                    ['LONG', df['long_SCM'][i]],
                                                    ['LAT', df['lat_SCM'][i]]
                                                    ],
                                                    [
                                                    ['COMPANY', df['Company_COYPU'][i]],
                                                    ['CO_CATEGORY', df['Company_Category_COYPU'][i]],
                                                    ['LONG', df['long_COYPU'][i]],
                                                    ['LAT', df['lat_COYPU'][i]]
                                                    ],
                                                    1)
        print(matched_pair)
        prep_util.save_txt_to_file(filename, matched_pair + '\n')

In [None]:
# Count, per country, the rows whose score lies strictly above THRESHOLD_FUZZY.
positive_counter_de, positive_counter_us, positive_counter_cn = (
    len(country_results[country_results.best_match_score > THRESHOLD_FUZZY])
    for country_results in (fuzzy_results_de, fuzzy_results_us, fuzzy_results_cn)
)

In [None]:
# Collect, per country, the rows whose score lies strictly below
# THRESHOLD_FUZZY, re-indexed from 0. Rows scoring exactly THRESHOLD_FUZZY
# fall into neither the positive nor the negative set.
negative_fuzzy_de, negative_fuzzy_us, negative_fuzzy_cn = (
    country_results[country_results.best_match_score < THRESHOLD_FUZZY].reset_index(drop = True)
    for country_results in (fuzzy_results_de, fuzzy_results_us, fuzzy_results_cn)
)

In [None]:
def create_col_val_fuzzy_negative(df, filename, positive_counter):
    """Write label-0 COL VAL lines for randomly chosen, distinct rows of ``df``.

    Up to ``positive_counter`` distinct row indices are drawn at random. Each
    selected row is formatted with ``prep_util.form_colval_pairs`` (label
    ``0``), printed, and handed newline-terminated to
    ``prep_util.save_txt_to_file``.

    Every selected row is labelled 0 regardless of its score, so ``df`` should
    contain only rows that are meant to be negatives.

    Args:
        df: table-like object with the 'Company_*', 'Company_Category_*',
            'long_*' and 'lat_*' columns for both the ``SCM`` and ``COYPU``
            suffix, addressable as ``df[column][i]`` for ``i`` in
            ``0..len-1``.
        filename: path passed through to ``prep_util.save_txt_to_file``.
        positive_counter: number of rows requested. If ``df`` has fewer rows,
            all rows are written once and a notice is printed; the previous
            rejection-sampling loop never terminated in that situation.
    """
    import random  # local import keeps the block self-contained

    row_count = len(df['Company_SCM'])
    sample_size = max(0, min(positive_counter, row_count))

    if sample_size < positive_counter:
        print('Requested', positive_counter, 'negative samples but only',
              row_count, 'rows are available; writing', sample_size)

    # random.sample draws without replacement, so no index is used twice.
    for random_idx in random.sample(range(row_count), sample_size):
        matched_pair = prep_util.form_colval_pairs([
                                                    ['COMPANY', df['Company_SCM'][random_idx]],
                                                    ['CO_CATEGORY', df['Company_Category_SCM'][random_idx]],
                                                    ['LONG', df['long_SCM'][random_idx]],
                                                    ['LAT', df['lat_SCM'][random_idx]]
                                                    ],
                                                    [
                                                    ['COMPANY', df['Company_COYPU'][random_idx]],
                                                    ['CO_CATEGORY', df['Company_Category_COYPU'][random_idx]],
                                                    ['LONG', df['long_COYPU'][random_idx]],
                                                    ['LAT', df['lat_COYPU'][random_idx]]
                                                    ],
                                                    0)
        print(matched_pair)
        prep_util.save_txt_to_file(filename, matched_pair + '\n')

Generate COLVAL Files

In [None]:
# Generate positive matches for the fuzzy matcher input, one output file per
# country.
create_col_val_fuzzy_positive(
    df = fuzzy_results_de,
    filename = './data/fuzzy/blocker_data_fuzzy_de.txt',
)
create_col_val_fuzzy_positive(
    df = fuzzy_results_us,
    filename = './data/fuzzy/blocker_data_fuzzy_us.txt',
)
create_col_val_fuzzy_positive(
    df = fuzzy_results_cn,
    filename = './data/fuzzy/blocker_data_fuzzy_cn.txt',
)

In [None]:
# Generate negative matches for the fuzzy matcher input.
# Sample from the below-threshold frames (negative_fuzzy_*), not from the full
# per-country results: the full frames also contain above-threshold rows, which
# would then be written with label 0.
# The requested count is capped at the number of available negative rows so
# the sampler is never asked for more distinct rows than exist.
create_col_val_fuzzy_negative(
    negative_fuzzy_de,
    './data/fuzzy/blocker_data_fuzzy_de_NEW.txt',
    min(positive_counter_de, len(negative_fuzzy_de)),
)
create_col_val_fuzzy_negative(
    negative_fuzzy_us,
    './data/fuzzy/blocker_data_fuzzy_us_NEW.txt',
    min(positive_counter_us, len(negative_fuzzy_us)),
)
create_col_val_fuzzy_negative(
    negative_fuzzy_cn,
    './data/fuzzy/blocker_data_fuzzy_cn_NEW.txt',
    min(positive_counter_cn, len(negative_fuzzy_cn)),
)