In [1]:
import re
from collections import Counter

import pandas as pd
from tqdm.notebook import tqdm
from transliterate import translit

### 1. Show data

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
df.shape

(497819, 4)

In [4]:
df.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,4,Powermax Rubber Factory,Co. One,0
4,5,Tress A/S,Longyou Industries Park Zhejiang,0


In [5]:
df.name_1.nunique(), df.name_2.nunique()

(17656, 17684)

In [6]:
len(set(df.name_1.tolist()) - set(df.name_2.tolist()))

338

In [7]:
len(set(df.name_2.tolist()) - set(df.name_1.tolist()))

366

### Check top-k popular words

In [8]:
# Number of most frequent tokens to keep per name column. The value is 28 to
# match what the counters below were actually run with; previously `k` was
# defined but ignored in favour of a hard-coded 28.
k = 28
# Join every name into one string, split on whitespace and count the tokens.
list_from_name_1 = Counter(" ".join(df["name_1"]).split()).most_common(k)
list_from_name_2 = Counter(" ".join(df["name_2"]).split()).most_common(k)

In [9]:
list_from_name_1, list_from_name_2

([('Ltd.', 136552),
  ('Co.,', 58457),
  ('Inc.', 49109),
  ('International', 48822),
  ('De', 45427),
  ('Industries', 34039),
  ('Trading', 27911),
  ('Logistics', 26357),
  ('&', 25204),
  ('Rubber', 19583),
  ('Co.', 18828),
  ('Private', 16729),
  ('Sa', 16457),
  ('Pvt.,', 15727),
  ('India', 15429),
  ('Llc', 14520),
  ('Cv', 14511),
  ('S.A.', 14003),
  ('Ltda', 13903),
  ('S', 13181),
  ('Mexico', 10969),
  ('Products', 10864),
  ('Industrial', 10063),
  ('Corporation', 10002),
  ('Imp.', 9569),
  ('A', 9470),
  ('Exp.', 9361),
  ('C.V.', 8207)],
 [('Ltd.', 136887),
  ('Co.,', 58670),
  ('International', 49103),
  ('Inc.', 48371),
  ('De', 44832),
  ('Industries', 34125),
  ('Trading', 27822),
  ('Logistics', 26241),
  ('&', 24773),
  ('Rubber', 19547),
  ('Co.', 18754),
  ('Private', 16601),
  ('Sa', 16102),
  ('Pvt.,', 15812),
  ('India', 15520),
  ('Llc', 14696),
  ('Cv', 14511),
  ('Ltda', 13908),
  ('S.A.', 13816),
  ('S', 13080),
  ('Mexico', 10976),
  ('Products', 10808

In [10]:
# Keep only the token of every (token, count) pair, name_1 entries first and
# name_2 entries after them (duplicates between the two lists are kept).
list_words = [token for token, _count in list_from_name_1 + list_from_name_2]

In [11]:
list_words

['Ltd.',
 'Co.,',
 'Inc.',
 'International',
 'De',
 'Industries',
 'Trading',
 'Logistics',
 '&',
 'Rubber',
 'Co.',
 'Private',
 'Sa',
 'Pvt.,',
 'India',
 'Llc',
 'Cv',
 'S.A.',
 'Ltda',
 'S',
 'Mexico',
 'Products',
 'Industrial',
 'Corporation',
 'Imp.',
 'A',
 'Exp.',
 'C.V.',
 'Ltd.',
 'Co.,',
 'International',
 'Inc.',
 'De',
 'Industries',
 'Trading',
 'Logistics',
 '&',
 'Rubber',
 'Co.',
 'Private',
 'Sa',
 'Pvt.,',
 'India',
 'Llc',
 'Cv',
 'Ltda',
 'S.A.',
 'S',
 'Mexico',
 'Products',
 'Corporation',
 'Industrial',
 'Imp.',
 'A',
 'Exp.',
 'C.V.']

### Check companies with Russian names

In [12]:
def check_rus_symbols(string):
    """Return True if ``string`` contains at least one Russian letter.

    The check is case-insensitive: every character is lower-cased before it is
    compared with the 33 letters of the Russian alphabet, so names written
    entirely in capitals (e.g. 'ООО') are detected as well.

    Parameters
    ----------
    string : str
        Text to inspect.

    Returns
    -------
    bool
        True when any character is a Russian letter, False otherwise
        (including for an empty string).
    """
    rus_letters = frozenset("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

    # Generator (not a list) so `any` can stop at the first matching character.
    return any(ch.lower() in rus_letters for ch in string)

In [14]:
# Collect every company name (from either column) that contains Russian letters.
list_russian_company = []

# Walk the two name columns in parallel instead of `df.iterrows()`, which
# builds a Series object for every row. `total` lets tqdm show real progress.
# Per row, name_1 is checked (and appended) before name_2.
for name_1, name_2 in tqdm(zip(df["name_1"], df["name_2"]), total=len(df)):

    if check_rus_symbols(name_1):
        list_russian_company.append(name_1)

    if check_rus_symbols(name_2):
        list_russian_company.append(name_2)

0it [00:00, ?it/s]

In [15]:
len(list_russian_company), set(list_russian_company)

(652,
 {'*** ООО "ИнтерТорг"',
  '*** ООО "ИнтерТулз"',
  '*** ООО "Интербалк"',
  '«ГК Полипласт» ',
  'АО "Интеркомп"',
  'Бостик',
  'Мишлен',
  'ООО "ИнтерСталь"',
  'ООО "ИнтэрПром"',
  'ООО "Полимаркет"',
  'ООО "Элит"',
  'ООО «Полипласт Новомосковск»',
  'ООО «Полипласт Северо-запад»',
  'ООО «Полипласт-УралСиб»',
  'ООО Полимаркет',
  'ООО Руспласт',
  'ООО Химинвест групп',
  'Ондулин',
  'Репсол'})

### Create functions to apply to the data

In [23]:
rus_letters = ["а", "б", "в", "г", "д","е","ё",
                   "ж","з","и","й","к","л","м","н","о",
                   "п","р","с","т","у","ф","х","ц","ч",
                   "ш","щ","ъ","ы","ь","э","ю","я"]

# Same letters as a set, for constant-time membership tests.
_RUS_LETTER_SET = frozenset(rus_letters)

# Russian abbreviations removed from names. `\b` anchors make them match only
# as whole words, so e.g. 'ЗАО' is not cut down to 'З' by the 'АО' rule.
_RUS_ABBREVIATIONS = re.compile(r'\b(?:ООО|АО|ГК)\b')


def convert_string_to_english(company_name):
    """
    Transliterate a Russian company name into Latin characters.

    If the name contains at least one Russian letter (upper or lower case),
    the stand-alone abbreviations 'ООО', 'АО' and 'ГК' are removed and the
    remainder is passed to ``translit(..., language_code='ru', reversed=True)``.
    Names without Russian letters are returned unchanged.

    Whitespace left behind by a removed abbreviation is not trimmed here.

    Parameters
    ----------
    company_name : str
        Raw company name.

    Returns
    -------
    str
        The transliterated name, or ``company_name`` itself when it has no
        Russian letters.
    """
    # Lower-case each character so all-caps names are recognised too.
    if not any(ch.lower() in _RUS_LETTER_SET for ch in company_name):
        return company_name

    company_name = _RUS_ABBREVIATIONS.sub('', company_name)

    return translit(company_name, language_code='ru', reversed=True)

In [24]:
def replace_symbols(company_name):
    """
    Replace every non-alphanumeric character in the name with a space.

    Runs of whitespace are collapsed to a single space and the ends are
    trimmed. If nothing alphanumeric remains, the original ``company_name``
    is returned unchanged.
    """
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in company_name)

    # split() with no arguments drops leading/trailing/repeated whitespace.
    cleaned = ' '.join(spaced.split())

    return cleaned if cleaned != '' else company_name

In [25]:
## Examples
replace_symbols('*** ООО "ИнтерТорг"'), replace_symbols('Technocraft Industries (India) Ltd.')

('ООО ИнтерТорг', 'Technocraft Industries India Ltd')

In [26]:
# Tokens that `drop_popular_words` removes from company names: the most
# frequent words found in the two name columns, plus 'Pvt.' and 'A/S'.
# Each token is listed once; the previous literal repeated almost every entry
# because two top-28 lists had been pasted together.
list_words = [
    'Ltd.', 'Co.,', 'Inc.', 'International', 'De', 'Industries', 'Trading',
    'Logistics', '&', 'Rubber', 'Co.', 'Private', 'Sa', 'Pvt.,', 'Pvt.',
    'India', 'Llc', 'Cv', 'S.A.', 'Ltda', 'S', 'Mexico', 'Products',
    'Industrial', 'Corporation', 'Imp.', 'A', 'Exp.', 'C.V.', 'A/S',
]

In [27]:
def drop_popular_words(company_name):
    """
    Remove the tokens listed in ``list_words`` from a company name.

    Commas are treated as separators before the name is split on whitespace.
    The remaining tokens are joined with single spaces. If every token is
    dropped, the original ``company_name`` is returned untouched.
    """
    tokens = company_name.replace(',', ' ').split()
    kept = ' '.join(token for token in tokens if token not in list_words)

    # An empty result means nothing survived: fall back to the original name.
    return kept or company_name

In [28]:
def update_company_names(company_name):
    """
    Run the full name-cleaning pipeline on one company name.

    The steps are applied in this order, each receiving the previous result:
    1. convert_string_to_english
    2. drop_popular_words
    3. replace_symbols
    """
    pipeline = (convert_string_to_english, drop_popular_words, replace_symbols)

    for step in pipeline:
        company_name = step(company_name)

    return company_name

In [29]:
#Example
update_company_names('*** ООО "ИнтерТорг"'), update_company_names('Technocraft Industries (India) Ltd.')

('InterTorg', 'Technocraft India')

### Apply to data and check

In [30]:
# Add a cleaned "<column>_upd" companion column for each raw name column.
for source_col in ('name_1', 'name_2'):
    df[f'{source_col}_upd'] = df[source_col].apply(update_company_names)

In [31]:
df.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate,name_1_upd,name_2_upd
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,Iko,Enormous Trade
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,Apcotex,Technocraft India
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0,Rishichem Distributors,Dsa
3,4,Powermax Rubber Factory,Co. One,0,Powermax Factory,One
4,5,Tress A/S,Longyou Industries Park Zhejiang,0,Tress,Longyou Park Zhejiang


In [32]:
df.to_csv('data_v2_19_oct.csv')

In [None]:
# Preview the first rows labelled as duplicates.
df.loc[df['is_duplicate'] == 1].head()

In [None]:
a = pd.Series(df.name_1.unique().tolist())

In [37]:
df_base = df['name_1'].unique()

In [39]:
df_base = pd.DataFrame({'name_1': df['name_1'].unique()})

In [40]:
df_base.head()

Unnamed: 0,name_1
0,Iko Industries Ltd.
1,Apcotex Industries Ltd.
2,"Rishichem Distributors Pvt., Ltd."
3,Powermax Rubber Factory
4,Tress A/S


In [41]:
df_base['name_1_upd'] = df_base['name_1'].apply(update_company_names)

In [43]:
df_base.to_csv('database.csv')