In [1]:
from typing import List

import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/raw/train.csv')
data.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,4,Powermax Rubber Factory,Co. One,0
4,5,Tress A/S,Longyou Industries Park Zhejiang,0


# Разделим на train/test

In [3]:
X = data.drop(['is_duplicate'], axis = 1)
y = data.is_duplicate

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 666,
    stratify = y,
    test_size = 0.2
)

In [5]:
X_train['is_duplicate'] = y_train
X_test['is_duplicate'] = y_test

## Используем транзитивность

In [6]:
class_1 = X_train[X_train['is_duplicate'] == 1].reset_index(drop=True)
class_1 = class_1[class_1['name_1'] != class_1['name_2']].reset_index(drop=True)
name_1_duplicates = class_1['name_1']
name_2_duplicates = class_1['name_2']
unique_companies_class_1 = pd.concat([name_1_duplicates, name_2_duplicates]).unique()
len(unique_companies_class_1)

1204

In [7]:
names_clusters = {}
n_clusters = 0
for name in sorted(unique_companies_class_1):
    names_part_1 = class_1.loc[class_1['name_1'] == name, 'name_2'].values
    names_part_2 = class_1.loc[class_1['name_2'] == name, 'name_1'].values
    names = set(names_part_1)
    names.update(names_part_2)
    names.add(name)
    names = sorted(names)
    if len(names) == 1:
        print(names)
    current_cluster = None
    for cur_name in names:
        if cur_name in names_clusters:
            current_cluster = names_clusters[cur_name]
            break
    if current_cluster is None:
        current_cluster = n_clusters
        n_clusters += 1
    for cur_name in names:
        names_clusters[cur_name] = current_cluster
n_clusters -= 1

In [8]:
print(f'{len(unique_companies_class_1)} компаний из пар с is_duplicate == 1 разбиты на {n_clusters} кластеров')

1204 компаний из пар с is_duplicate == 1 разбиты на 367 кластеров


In [9]:
clusters = defaultdict(list)
for k, v in names_clusters.items():
    clusters[v].append(k)

## Почистим данные

In [10]:
symbols_to_drop = ['#', '%', '&', '*', '+', ',', '.', '/', ';',
                   '<', '>', '?', '@', '[', '\\', ']', '`', '{', '\xa0', '«', '±', '»', '¿',
                   'ر', 'س', 'ف', 'ك', 'م', 'و', 'ي', '\u0e00', '‘', '’', '\u3000', '。', '上',
                   '东', '京', '份', '会', '公', '北', '双', '司', '团', '式', '彤', '技', '料',
                   '新', '方', '日', '有', '术', '材', '株', '水', '海', '社', '程', '股', '虹',
                   '防', '限', '集', '雨', '﹠', '＆', '（', '）', '，',  '̇']
symbols_to_replace_with_space = ['-', ':', '"', '(', ')', "'"]
forbidden_words = ['ооо','оао', 'зао', 'лимитед', 'раша', 'групп', 
                   'llc', 'gmbh', 'inc', 'co', 'ltd', 'sa', 'slr', 'limited', 'llp',
                   'ltda', 'asphalt', 'asia', 'europe']

def clean_company_name(name: str,
                       symbols_to_drop: List[str] = symbols_to_drop,
                       symbols_to_replace_with_space: List[str] = symbols_to_replace_with_space,
                       forbidden_words: List[str] = forbidden_words) -> str:
    characters = []
    for c in name:
        if c in symbols_to_drop:
            continue
        elif c in symbols_to_replace_with_space:
            characters.append(' ')
        else:
            characters.append(c)
    replace = {'á': 'a',
               'ã': 'a',
               'ç': 'c',
               'è': 'e',
               'é': 'e',
               'í': 'i',
               'ñ': 'n',
               'ó': 'o',
               'õ': 'o',
               'ö': 'o',
               'ú': 'u',
               'ü': 'u',
               'ę': 'e',
               'ł': 'l',
               'ő': 'o',
               'ş': 's',
               'ű': 'u'}
    
    cleaned_name = ''.join((replace.get(c, c) for c in characters))
    cleaned_name = cleaned_name.strip()
    words = cleaned_name.split()
    cleaned_words = [w for w in words if w not in forbidden_words]
    cleaned_name = ' '.join(cleaned_words)
    return cleaned_name

### Очищаем названия компаний из полученных кластеров (clean_clusters)

In [11]:
clean_clusters = defaultdict(list)
for k, v in clusters.items():
    clean_clusters[k] = [clean_company_name(n.lower()) for n in v]
clean_clusters

defaultdict(list,
            {0: ['alfagomma', 'alfagomma industrial spa'],
             1: ['softer spa', 'softer us'],
             2: ['полимаркет', 'полимаркет', 'полимаркет'],
             3: ['a westensee partner rohstoff', 'awp rohstoffe'],
             4: ['api',
              'trinseo api',
              'api applicazioni plastiche industriali spa',
              'api',
              'api applicazioni plastiche industriali spa'],
             5: ['abena international as', 'абена'],
             6: ['adi salambo', 'adi commerce', 'adi commerce'],
             7: ['adriatica bitumi',
              'adriatica bitumi spa',
              'adriatica bitumi spa'],
             8: ['agip',
              'agip eni group',
              'agip spa',
              'azienda generale italiana petroli'],
             9: ['alfa laval lund ab', 'альфа лаваль поток'],
             10: ['alibesa',
              'andaluza de ligantes bituminosos y betunes en liquidacion',
              'andaluza

### Находим внутри каждого кластера пересечение слов входящих в него компаний, если пересечения нет - то нужно проверить в ручном режиме

In [12]:
clusters_to_check_manually = []
cluster_words = dict()
for k, v in clean_clusters.items():
    words = set(v[0].split())
    for word in v[1:]:
        words = words.intersection(word.split())
        cluster_words[k] = words
    if len(words) == 0:
        clusters_to_check_manually.append(k)

In [13]:
for idx in clusters_to_check_manually:
    print(f'{idx} {clean_clusters[idx]}')

3 ['a westensee partner rohstoff', 'awp rohstoffe']
5 ['abena international as', 'абена']
8 ['agip', 'agip eni group', 'agip spa', 'azienda generale italiana petroli']
9 ['alfa laval lund ab', 'альфа лаваль поток']
10 ['alibesa', 'andaluza de ligantes bituminosos y betunes en liquidacion', 'andaluza de ligante bituminosos y betunes']
18 ['armacell poland sp zoo', 'армаселль']
29 ['tar and bitumen', 'plastisavio spa']
37 ['biesterfeld plastic', 'бистерфельд рус', 'бистерфельд рус']
45 ['bituver', 'saint gobain corporation', 'saint gobain isover austria', 'isover saint gobain', 'saint gobain']
47 ['bofer insaat aluminium san ve tic sti', 'боффер']
48 ['bostic', 'bo9stic']
49 ['bostik', 'бостик', 'bostik', 'bostik spzoo']
50 ['bostik nederland bv', 'бостик']
51 ['braas', 'браас дск 1']
52 ['bst elastomers', 'jsr corporation tse 4185', 'jsr corporation', 'jsr bst elastomer', '', 'japan synthetic rubber', 'jsr america', 'jsr tradig', 'jsr elastomer america']
65 ['calzaturificio franceschett

### Разберем вручную ключевые слова для этих кластеров

In [14]:
cluster_words[3] = set(['rohstoff', 'rohstoffe'])
cluster_words[5] = set(['абена', 'abena'])
cluster_words[8] = set(['agip', 'azienda'])
cluster_words[9] = set(['alfa', 'альфа', 'laval', 'лаваль'])
cluster_words[10] = set(['andaluza', 'ligante', 'bituminosos', 'alibesa'])
cluster_words[18] = set(['armacell', 'армаселль'])
cluster_words[29] = set(['bitumen', 'plastisavio'])
cluster_words[37] = set(['бистерфельд', 'biesterfeld'])
cluster_words[45] = set(['bituver', 'saint', 'gobain'])
cluster_words[47] = set(['bofer', 'боффер'])
cluster_words[48] = set(['bostic', 'bo9stic'])
cluster_words[49] = set(['bostik', 'бостик'])
cluster_words[50] = set(['bostik', 'бостик'])
cluster_words[51] = set(['braas', 'браас'])
cluster_words[52] = set(['jsr'])
cluster_words[65] = set(['franceschetti', 'franchescetti'])
cluster_words[72] = set(['coco', 'paving', 'cocopaving'])
cluster_words[86] = set(['компогал', 'compogal'])
cluster_words[87] = set(['continental', 'automotive', 'континентал', 'аутомотиве'])
cluster_words[93] = set(['deboer', 'boer'])
cluster_words[94] = set(['делаваль', 'delaval'])
cluster_words[96] = set(['демо', 'demo'])
cluster_words[98] = set(['doppelmayr', 'доппельмайр'])
cluster_words[99] = set(['dorken', 'дёркен'])
cluster_words[113] = set(['dowdupont', 'dow', 'dupont'])
cluster_words[116] = set(['dupont', 'eidupont'])
cluster_words[120] = set(['емам', 'emam', 'asfaltos', 'асфальтос'])
cluster_words[123] = set(['epiroc', 'эпирок'])
cluster_words[134] = set(['фольманн', 'follmann'])
cluster_words[135] = set(['форбо', 'еврокол', 'forbo', 'eurocol'])
cluster_words[136] = set(['forbo', 'siegling', 'форбо', 'сиглинг'])
cluster_words[137] = set(['ford', 'форд'])
cluster_words[151] = set(['gardner', 'gibson', 'гарднер', 'гибсон'])
cluster_words[161] = set(['helios', 'хелиос'])
cluster_words[162] = set(['hevea', 'heveachem'])
cluster_words[164] = set(['хилти', 'hilti'])
cluster_words[166] = set(['hp', 'pelzer', 'хп', 'пельцер'])
cluster_words[168] = set(['huesker', 'хюскер'])
cluster_words[180] = set(['indian', 'synthetic', 'rubber', 'isrl'])
cluster_words[183] = set(['zeon'])
cluster_words[190] = set(['intertex', 'интертекс'])
cluster_words[192] = set(['johns', 'mansville', 'джонс', 'мансвил'])
cluster_words[194] = set(['jowat', 'йоват'])
cluster_words[195] = set(['jowat', 'йоват'])
cluster_words[198] = set(['kawasaki', 'кавасаки'])
cluster_words[200] = set(['кимберли', 'kimberly', 'clark', 'кларк'])
cluster_words[201] = set(['kimberly', 'кимберли', 'clark', 'кларк'])
cluster_words[202] = set(['кнауф', 'knauf'])
cluster_words[203] = set(['kraiburg', 'крайбург'])
cluster_words[204] = set(['kumho', 'kkpc'])
cluster_words[210] = set(['lectra', 'лектра'])
cluster_words[211] = set(['lgchem', 'lg', 'chem'])
cluster_words[215] = set(['mholland'])
cluster_words[220] = set(['maneki', 'манеки'])
cluster_words[221] = set(['маниту', 'manitou'])
cluster_words[222] = set(['mapei', 'мапеи'])
cluster_words[227] = set(['michelin', 'мишлен'])
cluster_words[230] = set(['мондэлис', 'мон', 'дэлис', 'mondelez'])
cluster_words[237] = set(['nordenia', 'mondi'])
cluster_words[238] = set(['netafim', 'нетафим'])
cluster_words[239] = set(['nexeo', 'нексео'])
cluster_words[240] = set(['нэкст', 'next'])
cluster_words[241] = set(['nizhnekamsk', 'nizhnekamskneftekhim', 'nknh'])
cluster_words[242] = set(['беттерманн', 'обо', 'obo', 'bettermann'])
cluster_words[243] = set(['oldcastle', 'old', 'castle'])
cluster_words[246] = set(['ондулин', 'onduline'])
cluster_words[257] = set(['perel', 'перел'])
cluster_words[258] = set(['peretti', 'perretti'])
cluster_words[275] = set(['pi̇polyglass', 'полиглас'])
cluster_words[280] = set(['рехау', 'rehau'])
cluster_words[282] = set(['rotoflex', 'ротофлекс'])
cluster_words[290] = set(['сахалин', 'энерджи', 'инвестмент', 'sakhalin', 'energy', 'investment'])
cluster_words[292] = set(['selena', 'селена'])
cluster_words[295] = set(['centrum', 'центрум'])
cluster_words[296] = set(['siemens', 'сименс'])
cluster_words[297] = set(['зика', 'sika'])
cluster_words[300] = set(['соудал', 'soudal'])
cluster_words[313] = set(['sodex', 'содекс'])
cluster_words[314] = set(['soprema', 'сопрема'])
cluster_words[316] = set(['sumimoto', 'sumitomo'])
cluster_words[323] = set(['таката', 'takata'])
cluster_words[328] = set(['termoprene', 'thermoprene'])
cluster_words[329] = set(['tetra', 'pak', 'тетра', 'пак'])
cluster_words[330] = set(['tetra', 'pak', 'тетра', 'пак'])
cluster_words[336] = set(['turmerleim', 'tuermerleim'])
cluster_words[337] = set(['tsrc', 'taiwan', 'synthetic', 'rubber'])
cluster_words[339] = set(['tope', 'tohpe'])
cluster_words[349] = set(['vedag', 'ведаг'])
cluster_words[359] = set(['занотти', 'zanotti'])
cluster_words[363] = set(['конимпекс', 'konimpex'])
cluster_words[364] = set(['pirelli', 'пирелли'])

## Найдем кластеры для объединения

In [15]:
cluster_ids = list(cluster_words.keys())

for i in range(len(cluster_words)):
    for j in range(i + 1, len(cluster_words)):
        id_i = cluster_ids[i]
        id_j = cluster_ids[j]
        n_intersection = len(set(cluster_words[id_i]).intersection(cluster_words[id_j]))
        if n_intersection > 0:
            print(f'{id_i} - {cluster_words[id_i]} with {id_j} - {cluster_words[id_j]} by {set(cluster_words[id_i]).intersection(cluster_words[id_j])}')

1 - {'softer'} with 252 - {'softer'} by {'softer'}
13 - {'ankara', 'insaat'} with 70 - {'insaat', 'cengiz'} by {'insaat'}
17 - {'arlanxeo'} with 27 - {'arlanxeo'} by {'arlanxeo'}
19 - {'ve', 'yalitim', 'malzemeleri'} with 127 - {'ticaret', 'kimya', 'sanayi', 've', 'elastron'} by {'ve'}
19 - {'ve', 'yalitim', 'malzemeleri'} with 141 - {'ticaret', 've', 'sanayi', 'okce', 'fatih'} by {'ve'}
20 - {'asfaltos', 'chova'} with 120 - {'емам', 'asfaltos', 'emam', 'асфальтос'} by {'asfaltos'}
22 - {'automotive', 'performance'} with 87 - {'automotive', 'аутомотиве', 'континентал', 'continental'} by {'automotive'}
22 - {'automotive', 'performance'} with 308 - {'performance', 'plastics', 'saint', 'gobain'} by {'performance'}
23 - {'achem', 'technology'} with 353 - {'vestas', 'technology', 'wind'} by {'technology'}
24 - {'corporation', 'resources', 'gas', 'oil', 'exploration', 'adams', 'assets', 'and'} with 85 - {'exploration'} by {'exploration'}
29 - {'bitumen', 'plastisavio'} with 209 - {'bitumen',

## Объединим кластеры

In [16]:
union_clusters = [
    [1, 252],
    [17, 27],
    [34, 35, 54],
    [37, 58],
    [45, 307, 308],
    [49, 50],
    [56, 57],
    [69, 81],
    [80, 342],
    [102, 118],
    [109, 110, 111, 112, 113],
    [135, 136],
    [150, 159],
    [152, 153],
    [155, 314],
    [163, 171],
    [175, 185],
    [176, 177],
    [180, 187],
    [183, 360, 361],
    [194, 195],
    [200, 201],
    [204, 207],
    [217, 227],
    [231, 237],
    [253, 270],
    [271, 272],
    [274, 364],
    [277, 297],
    [289, 306],
    [299, 300],
    [316, 317, 318, 319],
    [329, 330],
    [348, 349],
    [350, 352]
]

for uc in union_clusters:
    main_id = uc[0]
    for idx in uc[1:]:
        clusters[main_id] = clusters[main_id] + clusters[idx]
        clean_clusters[main_id] = clean_clusters[main_id] + clean_clusters[idx]
        cluster_words[main_id].update(cluster_words[idx])
        del clusters[idx]
        del clean_clusters[idx]
        del cluster_words[idx]

# Получаем кластеры с ключевыми словами для train

In [17]:
train_clusters = {}
train_clean_clusters = {}
train_cluster_words = {}

for i, k in enumerate(clean_clusters.keys()):
    train_clusters[i] = clusters[k]
    train_clean_clusters[i] = clean_clusters[k]
    train_cluster_words[i] = cluster_words[k]

# Сохраним наши результаты

In [18]:
X_train.to_csv('../data/interim/X_train.csv', index=False)
X_test.to_csv('../data/interim/X_test.csv', index=False)

## Сохраним отдельно в X_train_companies.csv названия компаний с номером кластера, и -1 для компаний, для которых не искали кластеры (is_duplicate==0)


In [19]:
company_cluster = {}
for k, v in train_clusters.items():
    for name in v:
        company_cluster[name] = k

In [20]:
companies_names = sorted(set(list(X_train['name_1'].unique()) + list(X_train['name_2'].unique())))
companies_with_clusters = [(name, company_cluster.get(name, -1)) for name in companies_names]

company_df = pd.DataFrame(companies_with_clusters, columns=['name', 'cluster'])
company_df

Unnamed: 0,name,cluster
0,Alfagomma,0
1,LANXESS Inc.,-1
2,Lanxess Accounting GmbH,-1
3,"Lanxess International Trading (Shanghai) Co.,...",-1
4,Lanxess Sas,-1
...,...,...
17868,ФИЛИАЛ КОМПАНИИ ЭКСОН НЕФТЕГАЗ ЛИМИТЕД,324
17869,"ФИЛИАЛ КОМПАНИИ""ЭКСОН НЕФТЕГАЗ ЛИМИТЕД""",324
17870,"ХИМИНВЕСТ ГРУПП, ООО",-1
17871,"ХИМИНВЕСТ НПФ, ООО",-1


In [22]:
# Уберем компанию 'LTD.' - ее название не говорит ни о чем

company_df = company_df[company_df['name'] != 'LTD.'].reset_index(drop=True)

In [23]:
company_df.to_csv('../data/processed/X_train_companies_with_clusters.csv', index=False)