This notebook prepares the data in the following steps:
1. Unicode: detect Unicode characters and replace them with their ASCII counterparts
2. German characters: replace German-specific characters (e.g. umlauts) with international characters
3. Lower case + legal structure: convert upper-case letters to lower case and drop the company's legal form (e.g. co, gmbh, llc, ...)
4. Chinese translation: translate Chinese characters to English
5. Extract location: extract the location given in brackets in the SCM data into an additional column
6. Clean the extracted location info from the company name (SCM only)

Imports and packages

In [2]:
#! pip install levenshtein
#! pip install pandas
#! pip install googletrans==3.1.0a0
#! pip install https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.2.0/de_core_news_lg-3.2.0-py3-none-any.whl
#! pip install spacy
#spacy.cli.download("en_core_web_lg")
import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
import spacy
import data_prep_util as util_prep
import data_prep_consts as consts_prep

In [3]:
# Load both input files into *_original variables. The same two paths are
# read again in the following cell, whose frames are the ones that get modified.
# NOTE(review): presumably kept as untouched references for later comparison —
# no use of these variables is visible in this notebook; confirm they are needed.
coypu_cities_original = util_prep.read_csv_file(consts_prep.COYPU_CITIES_PATH)
scm_original = util_prep.read_csv_file(consts_prep.SCM_PATH)

In [4]:
# Read the two input files (paths come from data_prep_consts).
coypu_cities = util_prep.read_csv_file(consts_prep.COYPU_CITIES_PATH)
scm = util_prep.read_csv_file(consts_prep.SCM_PATH)

# Change column names and detect important countries in the SCM data.
# NOTE(review): the SCM result is bound to `scm_sub`, presumably a row subset of
# `scm` — confirm in data_prep_util whether its index is reset, since later cells
# assign columns from freshly read CSV files.
coypu_cities = util_prep.prepare_columns_coypu(coypu_cities)
scm_sub = util_prep.prepare_columns_scm(scm)

Unicode

In [5]:
# Pass every value of the 'city' column through util_prep.replace_float_city.
# NOTE(review): presumably replaces float (NaN) entries with a string so that the
# string operations in later cells do not fail — confirm in data_prep_util.
coypu_cities['city'] = coypu_cities['city'].map(util_prep.replace_float_city)

In [6]:
# Build the "Parsed_*" working columns straight from the raw columns: each one
# is the raw column passed through util_prep.normalize_unicode, so the raw
# 'Company' / 'city' columns themselves stay unchanged.
coypu_cities['Parsed_Company'] = coypu_cities['Company'].map(util_prep.normalize_unicode)
coypu_cities['Parsed_City'] = coypu_cities['city'].map(util_prep.normalize_unicode)
scm_sub['Parsed_Company'] = scm_sub['Company'].map(util_prep.normalize_unicode)

Replace German Characters

In [7]:
# Apply util_prep.change_umlaut to every parsed text column, in the order
# coypu company -> coypu city -> scm company.
for frame, column in (
    (coypu_cities, 'Parsed_Company'),
    (coypu_cities, 'Parsed_City'),
    (scm_sub, 'Parsed_Company'),
):
    frame[column] = frame[column].map(util_prep.change_umlaut)

Translator (Chinese to English)

In [None]:
from googletrans import Translator

# Number of company names sent to the translation service per request.
GROUP_SIZE = 100

translator = Translator()

# Only rows whose country is 'CN' are translated. Selecting them up front keeps
# each name paired with its real row label, so the index written to the file is
# correct even when the CN rows are not contiguous in the frame.
cn_companies = coypu_cities.loc[coypu_cities['Country'] == 'CN', 'Company']

# Append "<row index>, <translated name>" lines to the translation file.
# The context manager closes the file even if a translation request raises.
# NOTE(review): translated names containing commas yield extra CSV fields when
# the file is read back with pandas.read_csv — confirm how this is handled.
with open('./data/chinese_translations.txt', 'a', encoding='utf-8') as wf:
    # Stepping over the selection in GROUP_SIZE chunks never produces an empty
    # chunk, so the translator is never called with an empty list.
    for group_start in range(0, len(cn_companies), GROUP_SIZE):
        group = cn_companies.iloc[group_start:group_start + GROUP_SIZE]
        current_group = group.tolist()
        print('Translating content:\n', current_group)

        translated_content = translator.translate(
            src="zh-CN", dest="en", text=current_group)

        for row_index, translation in zip(group.index, translated_content):
            wf.write(str(row_index) + ', ' + translation.text + '\n')

In [None]:
import time

from googletrans import Translator

# Number of city names sent to the translation service per request.
GROUP_SIZE = 50
# Pause between attempts after a failed request (the log message below quotes
# this value, keep both in sync).
RETRY_WAIT_SECONDS = 100

translator = Translator()

# Only rows whose country is 'CN' are translated. Selecting them up front keeps
# each city paired with its real row label, so the index written to the file is
# correct even when the CN rows are not contiguous in the frame.
cn_cities = coypu_cities.loc[coypu_cities['Country'] == 'CN', 'city']

# Append "<row index>, <translated city>" lines to the translation file.
# The context manager closes the file even if the cell is interrupted.
with open('./data/chinese_translations_cities.txt', 'a', encoding='utf-8') as wf:
    # Stepping over the selection in GROUP_SIZE chunks never produces an empty
    # chunk, so the translator is never called with an empty list.
    for group_start in range(0, len(cn_cities), GROUP_SIZE):
        group = cn_cities.iloc[group_start:group_start + GROUP_SIZE]
        current_group = group.tolist()
        print('Translating content:\n', current_group)

        # Retry the request until it succeeds. Catching Exception (rather than
        # a bare except) keeps Ctrl-C / kernel interrupt working, and the
        # failure reason is printed instead of being swallowed.
        while True:
            try:
                translated_content = translator.translate(
                    src="zh-CN", dest="en", text=current_group)
                break
            except Exception as error:
                print('translating group failed, waiting 100 sec before trying again')
                print('reason:', repr(error))
                time.sleep(RETRY_WAIT_SECONDS)
                print('reattempting to translate group')

        for row_index, translation in zip(group.index, translated_content):
            wf.write(str(row_index) + ', ' + translation.text + '\n')

In [8]:
import pandas as pd
# Read the translation files written by the two translation cells.
# The header row ('Index' plus the value column) is not written by those cells;
# it was added to the files by hand.
# NOTE(review): the files are written with ', ' as separator, so the parsed
# values presumably carry a leading space — confirm a later cleaning step
# removes it.
ch = pd.read_csv('./data/chinese_translations.txt', usecols= ['Index','Company'])                   #manually added column names
ch_cities = pd.read_csv('./data/chinese_translations_cities.txt', usecols= ['Index','Parsed_City']) #manually added column names

Add translated Names to Dataframe

In [9]:
# Positional row range [start, end) covered by the translated company names:
# first recorded index up to and including the last recorded index.
# .iloc[...] on the scalar avoids the deprecated int(Series) conversion.
coypu_ch_start_index = int(ch['Index'].iloc[0])
coypu_ch_end_index = int(ch['Index'].iloc[-1]) + 1

# Same range for the translated city names.
coypu_ch_city_start_index = int(ch_cities['Index'].iloc[0])
coypu_ch_city_end_index = int(ch_cities['Index'].iloc[-1]) + 1

# Write the translations into the frame with a single .iloc assignment.
# The previous chained form (frame[col][a:b] = ...) assigns into an intermediate
# Series, which triggers SettingWithCopyWarning and has no effect on the frame
# under pandas Copy-on-Write. Values are assigned by position (to_numpy), so the
# number of translations must equal the length of the row range.
coypu_cities.iloc[
    coypu_ch_start_index:coypu_ch_end_index,
    coypu_cities.columns.get_loc('Parsed_Company'),
] = ch['Company'].to_numpy()
coypu_cities.iloc[
    coypu_ch_city_start_index:coypu_ch_city_end_index,
    coypu_cities.columns.get_loc('Parsed_City'),
] = ch_cities['Parsed_City'].to_numpy()

In [10]:
# Replace translated Chinese names: both parsed columns are rebuilt by
# util_prep.replace_chinese_translation, which receives the column, the whole
# frame and a mode string ('company' or 'city').
# NOTE(review): what exactly is replaced is defined in data_prep_util — confirm there.
coypu_cities['Parsed_Company'] = util_prep.replace_chinese_translation(coypu_cities['Parsed_Company'], coypu_cities, 'company')
coypu_cities['Parsed_City'] = util_prep.replace_chinese_translation(coypu_cities['Parsed_City'], coypu_cities, 'city')

Extract Location Info from Brackets (SCM)

In [11]:
# Create the 'Location' column by running util_prep.find_brackets_in_string on
# every parsed SCM company name.
# NOTE(review): presumably returns the text found inside brackets — confirm the
# value returned when a name has no brackets (a later cell calls .lower() on it).
scm_sub['Location'] = scm_sub['Parsed_Company'].map(util_prep.find_brackets_in_string)

Clean entries (removing unnecessary words and transforming to lower case)

In [12]:
# The return value is not assigned here.
# NOTE(review): presumably util_prep.clear_company_content modifies scm_sub in
# place — confirm; if it returns a new frame instead, this call has no effect.
util_prep.clear_company_content(scm_sub)

Remove Location Info

In [13]:
# Clean the extracted SCM location text.
scm_sub['Location'] = scm_sub['Location'].map(util_prep.clear_location_column)

# Normalise the coypu city names step by step: lower case first, then Unicode
# normalisation, then umlaut replacement.
parsed_city = coypu_cities['Parsed_City'].map(lambda city: city.lower())
parsed_city = parsed_city.map(util_prep.normalize_unicode)
coypu_cities['Parsed_City'] = parsed_city.map(util_prep.change_umlaut)

Clean Company Descriptors


In [14]:
# Run util_prep.clear_company_name over the parsed company names of both data
# sets (coypu first, then SCM).
for frame in (coypu_cities, scm_sub):
    frame['Parsed_Company'] = frame['Parsed_Company'].map(util_prep.clear_company_name)

In [15]:
# Additional cleaning pass that is applied to the SCM company names only.
# NOTE(review): presumably removes the bracketed location text that was copied
# into 'Location' earlier — confirm in data_prep_util.
scm_sub['Parsed_Company'] = scm_sub['Parsed_Company'].map(util_prep.clear_comp_name_column)

Extract Company Legal Form

In [16]:
# The category is derived from the raw 'Company' column (not the parsed one):
# lower-cased, stripped and with umlauts replaced.
company_prep_cat_coy = coypu_cities['Company'].map(lambda company: company.lower().strip())
company_prep_cat_coy = company_prep_cat_coy.map(util_prep.change_umlaut)

# Pair every prepared name with the country of the same row and classify it.
coypu_cities['Company_Category'] = [
    util_prep.company_category(name, country)
    for name, country in zip(company_prep_cat_coy, coypu_cities['Country'])
]

In [17]:
# The category is derived from the raw 'Company' column (not the parsed one):
# lower-cased, stripped and with umlauts replaced.
company_prep_cat_scm = scm_sub['Company'].map(lambda company: company.lower().strip())
company_prep_cat_scm = company_prep_cat_scm.map(util_prep.change_umlaut)

# Pair every prepared name with the country of the same row and classify it.
scm_sub['Company_Category'] = [
    util_prep.company_category(name, country)
    for name, country in zip(company_prep_cat_scm, scm_sub['Country'])
]

In [18]:
# Run util_prep.remove_suffixes over the parsed company names of both data sets
# (coypu first, then SCM).
for frame in (coypu_cities, scm_sub):
    frame['Parsed_Company'] = frame['Parsed_Company'].map(util_prep.remove_suffixes)

In [20]:
# Replace missing company categories with an empty string.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column obtained via attribute access operates on an intermediate Series and
# is not guaranteed to update the frame (it never does under Copy-on-Write).
coypu_cities['Company_Category'] = coypu_cities['Company_Category'].fillna('')
scm_sub['Company_Category'] = scm_sub['Company_Category'].fillna('')

In [21]:
# Lower-case every extracted location.
# NOTE(review): .lower() is called on each value, so this assumes every entry of
# 'Location' is a string at this point (no NaN/None) — confirm.
scm_sub['Location'] = scm_sub['Location'].map(lambda location: location.lower())

Add Geolocation Info to the Data

In [22]:
# Write both prepared frames to the geolocation folder, without the row index.
# NOTE(review): the next cell reads differently named files
# (coypu_geolocate.csv / SCM_geolocate.csv), so presumably a geolocation step
# outside this notebook turns these exports into those files — confirm.
scm_sub.to_csv('./geolocation/SCM_geo.csv', index = False)
coypu_cities.to_csv('./geolocation/COYPU_geo.csv', index = False)

In [54]:
# Read the geolocation results for both data sets.
# These file names differ from the ones exported in the previous cell.
# NOTE(review): presumably produced by an external geolocation step — confirm
# they contain one row per exported row, in the same order.
coypu_geoloc = pd.read_csv('./geolocation/coypu_geolocate.csv')
scm_geoloc = pd.read_csv('./geolocation/SCM_geolocate.csv')

Add Longitude and Latitude

In [55]:
# Copy the coordinate columns from each geolocation frame into the matching
# data frame ('long' first, then 'lat').
# Column assignment from a Series aligns on the row index.
# NOTE(review): confirm both frames of each pair share the same index,
# otherwise rows without a matching label end up as NaN.
for frame, geoloc in ((coypu_cities, coypu_geoloc), (scm_sub, scm_geoloc)):
    for coordinate in ('long', 'lat'):
        frame[coordinate] = geoloc[coordinate]

In [None]:
import en_core_web_lg

# Load the English model once. The previous version loaded it twice
# (en_core_web_lg.load() followed by spacy.load('en_core_web_lg')) and threw
# the first pipeline away.
nlp = en_core_web_lg.load()

# Progress counter shared across detect_cities calls.
count = 0
# Number of rows to process; computed once because it does not change while
# the column is being mapped.
total_companies = len(coypu_cities['Parsed_Company'])


def detect_cities(value):
    """Return the first geopolitical entity (GPE) spaCy finds in `value`.

    The bare country codes 'DE', 'US' and 'CN' are skipped. Every call
    increments the module-level `count`; when an entity is found, the entity
    and the progress in percent are printed.

    Parameters:
        value: company name to analyse. Non-string values (e.g. NaN) cannot be
            parsed and are treated as "no location".

    Returns:
        The text of the first matching entity, or '' if there is none.
    """
    global count
    count += 1

    # nlp() only accepts text; missing values yield no location instead of
    # aborting the whole run.
    if not isinstance(value, str):
        return ''

    for ent in nlp(value).ents:
        if ent.label_ == 'GPE' and ent.text not in ('DE', 'US', 'CN'):
            print(ent.text, ' | ', count / total_companies * 100, '%')
            return ent.text

    return ''


coypu_cities['Location'] = coypu_cities['Parsed_Company'].map(detect_cities)

# The company column is named 'Company' everywhere else in this notebook;
# a lower-case 'company' column is still preferred if the frame has one, so
# the export keeps working either way instead of raising a KeyError.
company_column = 'company' if 'company' in coypu_cities.columns else 'Company'
coypu_cities.to_csv('companies_with_locations.csv', columns = [company_column, 'Location'])


Export Prepared Data

In [58]:
# Write the fully prepared frames to the data folder, without the row index.
scm_sub.to_csv('./data/SCM_prep_data.csv', index = False)
coypu_cities.to_csv('./data/COYPU_prep_data.csv', index = False)