In [1]:
from collections import defaultdict
from sklearn.model_selection import train_test_split
from typing import List

import pandas as pd

In [2]:
data = pd.read_csv('../data/raw/train.csv')
data.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,4,Powermax Rubber Factory,Co. One,0
4,5,Tress A/S,Longyou Industries Park Zhejiang,0


## Соотношение классов

In [3]:
data['is_duplicate'].value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

Видим очень сильный дисбаланс в сторону класса 0 (99.3%)

### Класс 1

In [4]:
data[data['is_duplicate']==1]

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
161,162,JX Nippon Oil & Gas Exploration (Brasil) Ltda,JX Nippon Oil & Gas Exploration Technical Serv...,1
603,604,Pirelli Neumaticos S.A.I.C.,"Pirelli Tyre Co., Ltd.",1
835,836,Brenntag Australia (Pty) Ltd.,Brenntag Group,1
1328,1329,"PAUL BAUDER GMBH & CO KG, BOCHUM PLANT",Paul Bauder ag,1
1562,1563,TOTAL CESKA REPUBLIKA s.r.o.,TOTAL FRANCE (ARNAY LE DUC),1
...,...,...,...,...
496574,496575,"Bridgestone （Huizhou）Synthetic Rubber Co., Ltd.","Bridgestone India Pvt., Ltd.",1
496760,496761,Arlanxeo International Group,Arlanxeo Corp.,1
497015,497016,Brenntag Peru S.A.C.,Brenntag Chile Comercial E Industrial Ltda,1
497083,497084,Dow Chemical International Private Ltd.,Dow Chemical Pacific,1


### Сколько всего уникальных названий компаний

In [5]:
unique_companies = pd.concat([data['name_1'], data['name_2']]).unique()
len(unique_companies)

18022

### Сколько из них встречаюся в классе 1:

In [6]:
name_1_duplicates = data.loc[data['is_duplicate'] == 1, 'name_1']
name_2_duplicates = data.loc[data['is_duplicate'] == 1, 'name_2']
unique_companies_class_1 = pd.concat([name_1_duplicates, name_2_duplicates]).unique()
len(unique_companies_class_1)

1394

## Анализ на уровне символов

In [7]:
symbols = set()
for name in unique_companies:
    symbols.update(name.lower())

In [8]:
symbols

{' ',
 '"',
 '#',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '\xa0',
 '«',
 '±',
 '»',
 '¿',
 'á',
 'ã',
 'ç',
 'è',
 'é',
 'í',
 'ñ',
 'ó',
 'õ',
 'ö',
 'ú',
 'ü',
 'ę',
 'ł',
 'ő',
 'ş',
 'ű',
 '̇',
 'а',
 'б',
 'в',
 'г',
 'д',
 'е',
 'ж',
 'з',
 'и',
 'й',
 'к',
 'л',
 'м',
 'н',
 'о',
 'п',
 'р',
 'с',
 'т',
 'у',
 'ф',
 'х',
 'ц',
 'ч',
 'ш',
 'щ',
 'ы',
 'ь',
 'э',
 'ю',
 'я',
 'ё',
 'ر',
 'س',
 'ف',
 'ك',
 'م',
 'و',
 'ي',
 '\u0e00',
 '‘',
 '’',
 '\u3000',
 '。',
 '上',
 '东',
 '京',
 '份',
 '会',
 '公',
 '北',
 '双',
 '司',
 '团',
 '式',
 '彤',
 '技',
 '料',
 '新',
 '方',
 '日',
 '有',
 '术',
 '材',
 '株',
 '水',
 '海',
 '社',
 '程',
 '股',
 '虹',
 '防',
 '限',
 '集',
 '雨',
 '﹠',
 '＆',
 '（',
 '）',
 '，'}

Помимо англ и русских букв встречается много знаков препинания, цифры + китайский и арабский(?) алфавит. Посмотрим насколько это частое явление.

In [9]:
comps = set()
for s in ['\u0e00', '\u3000']:
    comps.update([name for name in unique_companies if s in name])
comps

{'Guangzhou\u3000Yinsen\u3000Enterprise\u3000Co. Ltd.',
 'Panda Logis\u0e00Ics (Ny) Inc.'}

In [10]:
# Арабский символы

companies_with_arabic_symbols = set()
for s in ['ر', 'س', 'ف', 'ك', 'م', 'و', 'ي']:
    companies_with_arabic_symbols.update([name for name in unique_companies if s in name])
companies_with_arabic_symbols

{'Servicom S.A. (BVMT:SERVI) - سرفيكوم'}

Всего 1 компания с арабскими символами в названии

In [11]:
# Китайские иероглифы

companies_with_chinese_symbols = set()
for s in ['上', '东', '京', '份', '会', '公', '北', '双', '司', '团', '式', '彤', '技', '料', '新', '方', '日', '有', '术', '材', '株', '水', '海', '社', '程', '股', '虹', '防', '限', '集', '雨']:
    companies_with_chinese_symbols.update([name for name in unique_companies if s in name])
companies_with_chinese_symbols

{'Beijing Oriental Yuhong Waterproof Technology Co., Ltd. (SZSE:002271) - 北京东方雨虹防水技术股份有限公司',
 'Jsrtrading（上海）Co. Ltd.',
 'Red Avenue New Materials Group Co., Ltd (彤程新材料集团股份有限公司)',
 'Sojitz Corporation (TSE:2768) - 双日株式会社'}

Всего 4 компании с китайскими иероглифами в названии

In [12]:
# Турецкие символы (?)

companies_with_turkish_symbols = set()
for s in ['á',
 'ã',
 'ç',
 'è',
 'é',
 'í',
 'ñ',
 'ó',
 'õ',
 'ö',
 'ú',
 'ü',
 'ę',
 'ł',
 'ő',
 'ş',
 'ű',
 '̇']:
    companies_with_turkish_symbols.update([name for name in unique_companies if s in name])
companies_with_turkish_symbols

{'Beijing Jiao Forme De Prothèses Aotuoboke Industrial Co., Ltd.',
 'Binné & Sohn GmbH & Co. KG Dachbaustoffwerk',
 'Bitbau Dörr',
 'Bitbau Dörr GmbH',
 'Bochem Sp. z o.o. Zakłady Chemiczne',
 'Compañía Española de Petróleos, S.A.U.',
 'Fatih Ökçe Sanayi ve Ticaret A.Ş.',
 'GEORG BÖRNER Chemisches Werk für Dach- und Bautenschutz GmbH & Co. KG',
 'Imperalum - Sociedade Comercial de Revestimentos e Impermeabilizações, S.A.',
 'Instituto Tecnológico De Costa Rica',
 'Isolago - Indústria de Plásticos S.A.',
 'Izolacja-Matizol SA Przedsiębiorstwo Materiałów Izolacyjnych',
 'Lusocopla - Fábrica de Colas Industriais, Lda',
 'MOL Magyar Olaj- es Gazipari Nyilvanosan Mukodo Reszvenytarsasag (BUSE:MOL) - MOL Magyar Olaj és Gázipari Nyilvánosan Működő Részvénytársaság',
 'Mogat-Werke Adolf Böving Bitumen- und Dachpappen Fabrik GmbH',
 'Pdl Endüstriyel Ürünler',
 'Pigipada Oü',
 'Pikasan Plastik Kauçuk Sanayi A.Ş.',
 'Termocompo - Indústria Termoplástica, Lda'}

Пока оставим как есть

In [13]:
companies_with_numbers = set()
for s in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
    companies_with_numbers.update([name for name in unique_companies if s in name])
companies_with_numbers

{'1 4103412224 Tel Ex 1 410341 23',
 '1 7044245522 Tel Ex 1 704424 56',
 '1 8168912400 Tel Ex 1 816891 76',
 '1 Dhl Global Forwarding',
 '1 St Notify Party',
 '1 St World Medical Supplies Solution Inc.',
 '10841 Ambassador Drive',
 '123 E Latin America S De Rl De Cv',
 '128 Yeouidaero Youngdeungpo Gu',
 '14 Th Fl. Guomao Building Hubin',
 '2 Nd Bulkhaul (Usa) Inc.',
 '2 Nd Norify Party Goodyear Canada Inc.',
 '2 Wedmore Close',
 '21 St Century Textiles Ltd.',
 '210 Brands Inc.',
 '210 Brands Inc. Dba Canterbury Usa',
 '210 Brands Inc./Canterbury Offi',
 '2K Polymer Systems Ltd ',
 '3 A Pakistan',
 '3 Ds Traders',
 '3 K Treads (Pvt) Ltd.',
 '3 M Brockville Tape',
 '3 M Do Brasil Ltda',
 '3 M India Ltd.',
 '3 M Manaus Industria De Produtos Quimicos Ltda',
 '3 M United Kingdom Plc 3 M Center',
 '3 Pl Guys',
 '3 Plus Logistics (Atlanta)',
 '3 Plus Logistics Co.',
 '3 Rd Floor Gate 4 Nanpu Building Tia',
 '3 S Fabrications (Pvt) Ltd.',
 '31 Mcu Movement Control Unit',
 '3475 Piedmont Rd. Ne

Кажется, что цифры в названиях чаще являются шумом (возможно их стоит тоже почистить)

## Класс 1

In [14]:
class_1 = data[data['is_duplicate']==1].reset_index(drop=True)
class_1

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,162,JX Nippon Oil & Gas Exploration (Brasil) Ltda,JX Nippon Oil & Gas Exploration Technical Serv...,1
1,604,Pirelli Neumaticos S.A.I.C.,"Pirelli Tyre Co., Ltd.",1
2,836,Brenntag Australia (Pty) Ltd.,Brenntag Group,1
3,1329,"PAUL BAUDER GMBH & CO KG, BOCHUM PLANT",Paul Bauder ag,1
4,1563,TOTAL CESKA REPUBLIKA s.r.o.,TOTAL FRANCE (ARNAY LE DUC),1
...,...,...,...,...
3653,496575,"Bridgestone （Huizhou）Synthetic Rubber Co., Ltd.","Bridgestone India Pvt., Ltd.",1
3654,496761,Arlanxeo International Group,Arlanxeo Corp.,1
3655,497016,Brenntag Peru S.A.C.,Brenntag Chile Comercial E Industrial Ltda,1
3656,497084,Dow Chemical International Private Ltd.,Dow Chemical Pacific,1


## Оказывается есть и такие случаи

In [15]:
class_1[class_1['name_1'] == class_1['name_2']]

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
28,5118,"ООО""ИМПОРТ МОДА""","ООО""ИМПОРТ МОДА""",1
82,12161,"ООО""ГУЧЧИ РУС""","ООО""ГУЧЧИ РУС""",1
104,14858,Lohmann GmbH & Co. KG,Lohmann GmbH & Co. KG,1
198,25950,"ООО""ПРИВАТ ТРЭЙД""","ООО""ПРИВАТ ТРЭЙД""",1
293,38346,"ООО""СПЕЦИАЛЬНОЕ ОБОРУДОВАНИЕ""","ООО""СПЕЦИАЛЬНОЕ ОБОРУДОВАНИЕ""",1
994,139210,"ООО""ФАВОРИТ СТАЙЛ","ООО""ФАВОРИТ СТАЙЛ",1
1077,150208,"ООО""ОЗОН ГИЙИМ РСЙ""","ООО""ОЗОН ГИЙИМ РСЙ""",1
1089,151691,"ООО ""ХИММАРКЕТ""","ООО ""ХИММАРКЕТ""",1
1232,169771,"ООО""ВЕРТИКАЛЬ СПОРТ""","ООО""ВЕРТИКАЛЬ СПОРТ""",1
1618,226435,"ООО ""СТАРКОМ""","ООО ""СТАРКОМ""",1


In [16]:
class_1 = class_1[class_1['name_1'] != class_1['name_2']].reset_index(drop=True)

In [17]:
name_1_duplicates = class_1['name_1']
name_2_duplicates = class_1['name_2']
unique_companies_class_1 = pd.concat([name_1_duplicates, name_2_duplicates]).unique()
len(unique_companies_class_1)

1368

# Разделим на train/test

In [18]:
X = data.drop(['is_duplicate'], axis = 1)
y = data.is_duplicate

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 666,
    stratify = y,
    test_size = 0.2
)

In [20]:
X_train['is_duplicate'] = y_train
X_test['is_duplicate'] = y_test

## Используем транзитивность

In [21]:
class_1 = X_train[X_train['is_duplicate'] == 1].reset_index(drop=True)
class_1 = class_1[class_1['name_1'] != class_1['name_2']].reset_index(drop=True)
name_1_duplicates = class_1['name_1']
name_2_duplicates = class_1['name_2']
unique_companies_class_1 = pd.concat([name_1_duplicates, name_2_duplicates]).unique()
len(unique_companies_class_1)

1204

In [22]:
names_clusters = {}
n_clusters = 0
for name in sorted(unique_companies_class_1):
    names_part_1 = class_1.loc[class_1['name_1'] == name, 'name_2'].values
    names_part_2 = class_1.loc[class_1['name_2'] == name, 'name_1'].values
    names = set(names_part_1)
    names.update(names_part_2)
    names.add(name)
    if len(names) == 1:
        print(names)
    current_cluster = None
    for cur_name in names:
        if cur_name in names_clusters:
            current_cluster = names_clusters[cur_name]
            break
    if current_cluster is None:
        current_cluster = n_clusters
        n_clusters += 1
    for cur_name in names:
        names_clusters[cur_name] = current_cluster
n_clusters -= 1

In [23]:
print(f'{len(unique_companies_class_1)} компаний из пар с is_duplicate == 1 разбиты на {n_clusters} кластеров')

1204 компаний из пар с is_duplicate == 1 разбиты на 367 кластеров


In [24]:
clusters = defaultdict(list)
for k, v in names_clusters.items():
    clusters[v].append(k)

In [25]:
clusters

defaultdict(list,
            {0: ['ALFAGOMMA INDUSTRIAL SPA', ' Alfagomma'],
             1: ['Softer Us Inc.', ' SO.F.TER. SPA'],
             2: ['ООО "Полимаркет"', 'ООО Полимаркет', '*** ПОЛИМАРКЕТ, ООО'],
             3: ['A. WESTENSEE & PARTNER ROHSTOFF GMBH', 'awp-rohstoffe'],
             4: ['Trinseo API',
              'A.P.I.',
              'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.',
              'API',
              'A.P.I. Applicazioni Plastiche Industriali SPA'],
             5: ['ООО"АБЕНА"', 'ABENA INTERNATIONAL A/S'],
             6: ['ADI (SALAMBO)', 'ADI COMMERCE', 'ADI commerce ltd'],
             7: ['ADRIATICA BITUMI',
              'Adriatica Bitumi Spa',
              'ADRIATICA BITUMI S.P.A.'],
             8: ['agip spa',
              'AGIP',
              'AGIP  ( ENI GROUP)',
              'Azienda Generale Italiana Petroli'],
             9: ['ОАО "АЛЬФА ЛАВАЛЬ ПОТОК"', 'ALFA LAVAL LUND AB'],
             10: ['ALIBESA',
              'Andaluza 

## Почистим данные

In [26]:
symbols_to_drop = ['#', '%', '&', '*', '+', ',', '.', '/', ';',
                   '<', '>', '?', '@', '[', '\\', ']', '`', '{', '\xa0', '«', '±', '»', '¿',
                   'ر', 'س', 'ف', 'ك', 'م', 'و', 'ي', '\u0e00', '‘', '’', '\u3000', '。', '上',
                   '东', '京', '份', '会', '公', '北', '双', '司', '团', '式', '彤', '技', '料',
                   '新', '方', '日', '有', '术', '材', '株', '水', '海', '社', '程', '股', '虹',
                   '防', '限', '集', '雨', '﹠', '＆', '（', '）', '，',  '̇']
symbols_to_replace_with_space = ['-', ':', '"', '(', ')', "'"]
forbidden_words = ['ооо','оао', 'зао', 'лимитед', 'раша', 'групп', 
                   'llc', 'gmbh', 'inc', 'co', 'ltd', 'sa', 'slr', 'limited', 'llp',
                   'ltda', 'asphalt', 'asia', 'europe']

def clean_company_name(name: str,
                       symbols_to_drop: List[str] = symbols_to_drop,
                       symbols_to_replace_with_space: List[str] = symbols_to_replace_with_space,
                       forbidden_words: List[str] = forbidden_words) -> str:
    characters = []
    for c in name:
        if c in symbols_to_drop:
            continue
        elif c in symbols_to_replace_with_space:
            characters.append(' ')
        else:
            characters.append(c)
    replace = {'á': 'a',
               'ã': 'a',
               'ç': 'c',
               'è': 'e',
               'é': 'e',
               'í': 'i',
               'ñ': 'n',
               'ó': 'o',
               'õ': 'o',
               'ö': 'o',
               'ú': 'u',
               'ü': 'u',
               'ę': 'e',
               'ł': 'l',
               'ő': 'o',
               'ş': 's',
               'ű': 'u'}
    
    cleaned_name = ''.join((replace.get(c, c) for c in characters))
    cleaned_name = cleaned_name.strip()
    words = cleaned_name.split()
    cleaned_words = [w for w in words if w not in forbidden_words]
    cleaned_name = ' '.join(cleaned_words)
    return cleaned_name

In [27]:
clean_clusters = defaultdict(list)
for k, v in clusters.items():
    clean_clusters[k] = [clean_company_name(n.lower()) for n in v]
clean_clusters

defaultdict(list,
            {0: ['alfagomma industrial spa', 'alfagomma'],
             1: ['softer us', 'softer spa'],
             2: ['полимаркет', 'полимаркет', 'полимаркет'],
             3: ['a westensee partner rohstoff', 'awp rohstoffe'],
             4: ['trinseo api',
              'api',
              'api applicazioni plastiche industriali spa',
              'api',
              'api applicazioni plastiche industriali spa'],
             5: ['абена', 'abena international as'],
             6: ['adi salambo', 'adi commerce', 'adi commerce'],
             7: ['adriatica bitumi',
              'adriatica bitumi spa',
              'adriatica bitumi spa'],
             8: ['agip spa',
              'agip',
              'agip eni group',
              'azienda generale italiana petroli'],
             9: ['альфа лаваль поток', 'alfa laval lund ab'],
             10: ['alibesa',
              'andaluza de ligante bituminosos y betunes',
              'andaluza de ligantes bit

In [28]:
clusters_to_check_manually = []
cluster_words = dict()
for k, v in clean_clusters.items():
    words = set(v[0].split())
    for word in v[1:]:
        words = words.intersection(word.split())
        cluster_words[k] = words
    if len(words) == 0:
        clusters_to_check_manually.append(k)
    else:
        print(f'{k}: {words}')

0: {'alfagomma'}
1: {'softer'}
2: {'полимаркет'}
4: {'api'}
6: {'adi'}
7: {'bitumi', 'adriatica'}
11: {'trading', 'alpha'}
12: {'leetkooh'}
13: {'ankara', 'insaat'}
14: {'aplix'}
15: {'ouest', 'appia', 'liants'}
16: {'aps'}
17: {'arlanxeo'}
19: {'ve', 'yalitim', 'malzemeleri'}
20: {'asfaltos', 'chova'}
21: {'pharm', 'pack'}
22: {'performance', 'automotive'}
23: {'technology', 'achem'}
24: {'exploration', 'corporation', 'and', 'assets', 'resources', 'adams', 'gas', 'oil'}
25: {'afton', 'chemical'}
26: {'grant', 'alan', 'l'}
27: {'arlanxeo'}
28: {'ashland'}
30: {'auriquimica'}
31: {'avebe'}
32: {'avery'}
33: {'bal', 'taban'}
34: {'basf'}
35: {'basf'}
36: {'gussasphalt', 'dortmunder', 'kg'}
38: {'binne'}
39: {'bit', 'mat', 'products'}
40: {'bitbau'}
41: {'bitufa'}
42: {'bitumenka'}
43: {'bitumix'}
44: {'bitumtec'}
46: {'bochem'}
53: {'bulmarket', 'dm'}
54: {'basf'}
55: {'yuhong', 'oriental'}
56: {'bharat'}
57: {'bharat'}
58: {'biesterfeld'}
59: {'binex', 'line', 'corp'}
60: {'adhesives', 

In [29]:
for idx in clusters_to_check_manually:
    print(f'{idx} {clean_clusters[idx]}')

3 ['a westensee partner rohstoff', 'awp rohstoffe']
5 ['абена', 'abena international as']
8 ['agip spa', 'agip', 'agip eni group', 'azienda generale italiana petroli']
9 ['альфа лаваль поток', 'alfa laval lund ab']
10 ['alibesa', 'andaluza de ligante bituminosos y betunes', 'andaluza de ligantes bituminosos y betunes en liquidacion']
18 ['армаселль', 'armacell poland sp zoo']
29 ['tar and bitumen', 'plastisavio spa']
37 ['biesterfeld plastic', 'бистерфельд рус', 'бистерфельд рус']
45 ['saint gobain corporation', 'saint gobain isover austria', 'bituver', 'saint gobain', 'isover saint gobain']
47 ['боффер', 'bofer insaat aluminium san ve tic sti']
48 ['bo9stic', 'bostic']
49 ['bostik', 'бостик', 'bostik', 'bostik spzoo']
50 ['бостик', 'bostik nederland bv']
51 ['braas', 'браас дск 1']
52 ['jsr corporation tse 4185', 'jsr bst elastomer', 'jsr corporation', 'bst elastomers', '', 'japan synthetic rubber', 'jsr america', 'jsr tradig', 'jsr elastomer america']
65 ['calzaturificio franceschett

### Разберем вручную ключевые слова для этих кластеров

In [30]:
cluster_words[3] = set(['rohstoff', 'rohstoffe'])
cluster_words[5] = set(['абена', 'abena'])
cluster_words[8] = set(['agip', 'azienda'])
cluster_words[9] = set(['alfa', 'альфа', 'laval', 'лаваль'])
cluster_words[10] = set(['andaluza', 'ligante', 'bituminosos', 'alibesa'])
cluster_words[18] = set(['armacell', 'армаселль'])
cluster_words[29] = set(['bitumen', 'plastisavio'])
cluster_words[37] = set(['бистерфельд', 'biesterfeld'])
cluster_words[45] = set(['bituver', 'saint', 'gobain'])
cluster_words[47] = set(['bofer', 'боффер'])
cluster_words[48] = set(['bostic', 'bo9stic'])
cluster_words[49] = set(['bostik', 'бостик'])
cluster_words[50] = set(['bostik', 'бостик'])
cluster_words[51] = set(['braas', 'браас'])
cluster_words[52] = set(['jsr'])
cluster_words[65] = set(['franceschetti', 'franchescetti'])
cluster_words[72] = set(['coco', 'paving', 'cocopaving'])
cluster_words[86] = set(['компогал', 'compogal'])
cluster_words[87] = set(['continental', 'automotive', 'континентал', 'аутомотиве'])
cluster_words[93] = set(['deboer', 'boer'])
cluster_words[94] = set(['делаваль', 'delaval'])
cluster_words[96] = set(['демо', 'demo'])
cluster_words[98] = set(['doppelmayr', 'доппельмайр'])
cluster_words[99] = set(['dorken', 'дёркен'])
cluster_words[113] = set(['dowdupont', 'dow', 'dupont'])
cluster_words[116] = set(['dupont', 'eidupont'])
cluster_words[120] = set(['емам', 'emam', 'asfaltos', 'асфальтос'])
cluster_words[123] = set(['epiroc', 'эпирок'])
cluster_words[134] = set(['фольманн', 'follmann'])
cluster_words[135] = set(['форбо', 'еврокол', 'forbo', 'eurocol'])
cluster_words[136] = set(['forbo', 'siegling', 'форбо', 'сиглинг'])
cluster_words[137] = set(['ford', 'форд'])
cluster_words[151] = set(['gardner', 'gibson', 'гарднер', 'гибсон'])
cluster_words[161] = set(['helios', 'хелиос'])
cluster_words[162] = set(['hevea', 'heveachem'])
cluster_words[164] = set(['хилти', 'hilti'])
cluster_words[166] = set(['hp', 'pelzer', 'хп', 'пельцер'])
cluster_words[168] = set(['huesker', 'хюскер'])
cluster_words[180] = set(['indian', 'synthetic', 'rubber', 'isrl'])
cluster_words[183] = set(['zeon'])
cluster_words[190] = set(['intertex', 'интертекс'])
cluster_words[192] = set(['johns', 'mansville', 'джонс', 'мансвил'])
cluster_words[194] = set(['jowat', 'йоват'])
cluster_words[195] = set(['jowat', 'йоват'])
cluster_words[198] = set(['kawasaki', 'кавасаки'])
cluster_words[200] = set(['кимберли', 'kimberly'])
cluster_words[201] = set(['kimberly', 'кимберли'])
cluster_words[202] = set(['кнауф', 'knauf'])
cluster_words[203] = set(['kraiburg', 'крайбург'])
cluster_words[204] = set(['kumho', 'kkpc'])
cluster_words[210] = set(['lectra', 'лектра'])
cluster_words[211] = set(['lgchem', 'lg', 'chem'])
cluster_words[215] = set(['mholland'])
cluster_words[220] = set(['maneki', 'манеки'])
cluster_words[221] = set(['маниту', 'manitou'])
cluster_words[222] = set(['mapei', 'мапеи'])
cluster_words[227] = set(['michelin', 'мишлен'])
cluster_words[230] = set(['мондэлис', 'мон', 'дэлис', 'mondelez'])
cluster_words[237] = set(['nordenia', 'mondi'])
cluster_words[238] = set(['netafim', 'нетафим'])
cluster_words[239] = set(['nexeo', 'нексео'])
cluster_words[240] = set(['нэкст', 'next'])
cluster_words[241] = set(['nizhnekamsk', 'nizhnekamskneftekhim', 'nknh'])
cluster_words[242] = set(['беттерманн', 'обо', 'obo', 'bettermann'])
cluster_words[243] = set(['oldcastle', 'old', 'castle'])
cluster_words[246] = set(['ондулин', 'onduline'])
cluster_words[257] = set(['perel', 'перел'])
cluster_words[258] = set(['peretti', 'perretti'])
cluster_words[275] = set(['pi̇polyglass', 'полиглас'])
cluster_words[280] = set(['рехау', 'rehau'])
cluster_words[282] = set(['rotoflex', 'ротофлекс'])
cluster_words[290] = set(['сахалин', 'энерджи', 'инвестмент', 'sakhalin', 'energy', 'investment'])
cluster_words[292] = set(['selena', 'селена'])
cluster_words[295] = set(['centrum', 'центрум'])
cluster_words[296] = set(['siemens', 'сименс'])
cluster_words[297] = set(['зика', 'sika'])
cluster_words[300] = set(['соудал', 'soudal'])
cluster_words[313] = set(['sodex', 'содекс'])
cluster_words[314] = set(['soprema', 'сопрема'])
cluster_words[316] = set(['sumimoto', 'sumitomo'])
cluster_words[323] = set(['таката', 'takata'])
cluster_words[328] = set(['termoprene', 'thermoprene'])
cluster_words[329] = set(['tetra', 'pak', 'тетра', 'пак'])
cluster_words[330] = set(['tetra', 'pak', 'тетра', 'пак'])
cluster_words[336] = set(['turmerleim', 'tuermerleim'])
cluster_words[337] = set(['tsrc', 'taiwan', 'synthetic', 'rubber'])
cluster_words[339] = set(['tope', 'tohpe'])
cluster_words[349] = set(['vedag', 'ведаг'])
cluster_words[359] = set(['занотти', 'zanotti'])
cluster_words[363] = set(['конимпекс', 'konimpex'])
cluster_words[364] = set(['pirelli', 'пирелли'])

## Найдем кластеры для объединения

In [31]:
cluster_ids = list(cluster_words.keys())

for i in range(len(cluster_words)):
    for j in range(i + 1, len(cluster_words)):
        id_i = cluster_ids[i]
        id_j = cluster_ids[j]
        n_intersection = len(set(cluster_words[id_i]).intersection(cluster_words[id_j]))
        if n_intersection > 0:
            print(f'{id_i} - {cluster_words[id_i]} with {id_j} - {cluster_words[id_j]}')

1 - {'softer'} with 252 - {'softer'}
13 - {'ankara', 'insaat'} with 70 - {'insaat', 'cengiz'}
17 - {'arlanxeo'} with 27 - {'arlanxeo'}
19 - {'ve', 'yalitim', 'malzemeleri'} with 127 - {'ve', 'sanayi', 'elastron', 'ticaret', 'kimya'}
19 - {'ve', 'yalitim', 'malzemeleri'} with 141 - {'okce', 'sanayi', 've', 'ticaret', 'fatih'}
20 - {'asfaltos', 'chova'} with 120 - {'emam', 'asfaltos', 'асфальтос', 'емам'}
22 - {'performance', 'automotive'} with 87 - {'continental', 'континентал', 'аутомотиве', 'automotive'}
22 - {'performance', 'automotive'} with 308 - {'gobain', 'plastics', 'saint', 'performance'}
23 - {'technology', 'achem'} with 353 - {'vestas', 'technology', 'wind'}
24 - {'exploration', 'corporation', 'and', 'assets', 'resources', 'adams', 'gas', 'oil'} with 85 - {'exploration'}
29 - {'plastisavio', 'bitumen'} with 209 - {'lagan', 'bitumen'}
33 - {'bal', 'taban'} with 245 - {'omur', 'taban'}
33 - {'bal', 'taban'} with 335 - {'taban', 'tuba'}
33 - {'bal', 'taban'} with 362 - {'taban',

## Объединим кластеры

In [32]:
union_clusters = [
    [1, 252],
    [17, 27],
    [34, 35, 54],
    [37, 58],
    [45, 307, 308],
    [49, 50],
    [69, 81],
    [102, 118],
    [109, 110, 111, 112, 113],
    [135, 136],
    [152, 153],
    [155, 314],
    [163, 171],
    [175, 185],
    [176, 177],
    [180, 187],
    [183, 360, 361],
    [194, 195],
    [200, 201],
    [204, 207],
    [217, 227],
    [231, 237],
    [253, 270],
    [271, 272],
    [274, 364],
    [277, 297],
    [289, 306],
    [299, 300],
    [316, 317, 319],
    [329, 330],
    [348, 349],
    [350, 352]
]

for uc in union_clusters:
    main_id = uc[0]
    for idx in uc[1:]:
        clusters[main_id] = clusters[main_id] + clusters[idx]
        clean_clusters[main_id] = clean_clusters[main_id] + clean_clusters[idx]
        cluster_words[main_id].update(cluster_words[idx])
        del clusters[idx]
        del clean_clusters[idx]
        del cluster_words[idx]

# Получаем кластеры с ключевыми словами для train

In [34]:
train_clusters = {}
train_clean_clusters = {}
train_cluster_words = {}

for i, k in enumerate(clean_clusters.keys()):
    train_clusters[i] = clusters[k]
    train_clean_clusters[i] = clean_clusters[k]
    train_cluster_words[i] = cluster_words[k]

## Компании, которые желательно кластеризовать

In [44]:
class_0 = X_train[X_train['is_duplicate'] == 0].reset_index(drop=True)
class_0 = class_0[class_0['name_1'] != class_0['name_2']].reset_index(drop=True)
name_1_duplicates = class_0['name_1']
name_2_duplicates = class_0['name_2']
unique_companies_class_0 = pd.concat([name_1_duplicates, name_2_duplicates]).unique()
unique_companies_class_0 = sorted([name for name in unique_companies_class_0 if name not in unique_companies_class_1])
len(unique_companies_class_0)

16647

In [52]:
cleaned_companies_class_0 = [clean_company_name(name.lower()) for name in unique_companies_class_0]

In [57]:
from collections import Counter
all_words = []
for name in cleaned_companies_class_0:
    all_words.extend(name.split())

In [58]:
cnt = Counter(all_words)

In [68]:
tmp = [pair for pair in cnt.most_common() if pair[1] > 2 and pair[1] < 10]
len(tmp)

1993

In [71]:
tmp

[('resin', 9),
 ('ahmed', 9),
 ('abdullah', 9),
 ('stock', 9),
 ('custom', 9),
 ('changshu', 9),
 ('research', 9),
 ('tools', 9),
 ('chi', 9),
 ('chicago', 9),
 ('allied', 9),
 ('merchants', 9),
 ('glass', 9),
 ('school', 9),
 ('healthcare', 9),
 ('asociados', 9),
 ('sleep', 9),
 ('emirates', 9),
 ('exim', 9),
 ('future', 9),
 ('holding', 9),
 ('baker', 9),
 ('hughes', 9),
 ('shop', 9),
 ('tbk', 9),
 ('jie', 9),
 ('korea', 9),
 ('big', 9),
 ('binex', 9),
 ('bostik', 9),
 ('promotional', 9),
 ('oo', 9),
 ('gum', 9),
 ('gasket', 9),
 ('manager', 9),
 ('valley', 9),
 ('metals', 9),
 ('por', 9),
 ('si', 9),
 ('qingyuan', 9),
 ('multimodal', 9),
 ('johnson', 9),
 ('damco', 9),
 ('schenker', 9),
 ('seals', 9),
 ('cargas', 9),
 ('xiang', 9),
 ('contracting', 9),
 ('way', 9),
 ('mega', 9),
 ('fisher', 9),
 ('puerto', 9),
 ('rico', 9),
 ('sri', 9),
 ('pak', 9),
 ('geodis', 9),
 ('se', 9),
 ('pioneer', 9),
 ('philippine', 9),
 ('imperial', 9),
 ('jaya', 9),
 ('modern', 9),
 ('reda', 9),
 ('vangu

In [73]:
for name in cleaned_companies_class_0:
    if 'sanyo' in name:
        print(name)

pt sanyo trading indonesia
sanyo corp
sanyo corporation of america
sanyo e e de cv
sanyo energy suzhou
sanyo energy beijing
sanyo maritime
sanyo touchi shanghai rubber
sanyo trading india


In [50]:
for name in unique_companies_class_0:
    clean_name = clean_company_name(name.lower())
    words = set(clean_name.split())
    max_intersection = set()
    max_key = None
    for k, v in train_cluster_words.items():
        intersection = words.intersection(v)
        if len(intersection) > len(max_intersection):
            max_intersection = intersection
            max_key = k
    if len(max_intersection) > 0:
        print(f'{name} - {max_intersection} - {train_clusters[max_key]}')

 Lanxess International Trading (Shanghai) Co., Ltd. - {'trading'} - ['Alpha Trading S.p.a', 'ALPHA TRADING']
 Lanxess Sas - {'sas'} - ['Gymap Compound Sas', 'Gymap Sas']
 TOTAL OIL INDIA PRIVATE LIMITED,  TOTAL - {'oil'} - ['Adams Resources Exploration Corporation, Oil And Gas Assets In Crocket And Irion Counties In Texas', 'Adams Resources Exploration Corporation, Oil And Gas Assets Outside The Permian Basin']
123 E Latin America S De Rl De Cv - {'e'} - ['Farmacap Industria E Comercio Ltda', 'Farmacap Industries Stria E Com Rcio Ltda']
14 Th Fl. Guomao Building Hubin - {'building'} - ['Jiangsu Canlon Building Materials Co., Ltd.', 'Jiangsu Canlon Building Materials']
2 Nd Bulkhaul (Usa) Inc. - {'usa'} - ['Leman Usa. Atlanta', 'Leman Usa']
2 Nd Norify Party Goodyear Canada Inc. - {'goodyear'} - ['Goodyear ', 'The Goodyear Tire and Rubber Company', 'Goodyear Chemical Co']
210 Brands Inc. Dba Canterbury Usa - {'usa'} - ['Leman Usa. Atlanta', 'Leman Usa']
2K Polymer Systems Ltd  - {'syste