## Requirements

- beautifulsoup4==4.7.1
- catboost==0.24.2
- geonamescache==1.2.0
- lxml==4.6.2
- numpy==1.18.5
- pandas==1.1.3
- pycountry==20.7.3
- scikit-learn==0.21.3
- spacy==2.3.4
- unidecode==1.1.1

RUN python -m spacy download en_core_web_lg

In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import csv

import matplotlib.pyplot as plt

import pickle

from collections import *

import json
import pathlib
import re
import sys

import spacy

import requests
from bs4 import BeautifulSoup

import unidecode
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GroupKFold

from IPython.display import display


pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 150

In [3]:
import en_core_web_lg

In [4]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp_en = en_core_web_lg.load()

In [5]:
DATA_DIR = pathlib.Path("")
train = pd.read_csv(DATA_DIR.joinpath('train.csv'), index_col="pair_id")

In [6]:
%%time
docs = dict()
for i, s in enumerate(set(train['name_1']) | set(train['name_2'])):
    if i % 1000 == 0:
        print(i)
    docs[s] = nlp_en(s)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
Wall time: 1min 32s


In [7]:
translitMapRuEn = {
    "инк": ["inc"],
    "ай": ["i"],
    "аш": ["ussi", "h"],
    "дж": ["g", "j"],
    "кс": ["x", "ks", "cs"],
    "а": ["a"],
    "б": ["b"],
    "в": ["v", "w"],
    "г": ["g"],
    "д": ["d"],
    "е": ["e"],
    "ё": ["e", "eu"],
    "ж": ["zh"],
    "з": ["z"],
    "и": ["i", "e"],
    "й": ["i", "y"],
    "к": ["k", "c", "q"],
    "л": ["l"],
    "м": ["m"],
    "н": ["n"],
    "о": ["o"],
    "п": ["p"],
    "р": ["r"],
    "с": ["s", "c"],
    "т": ["t", "th"],
    "у": ["u", "oo"],
    "ф": ["f", "ph"],
    "х": ["kh", "h"],
    "ц": ["ts", "c"],
    "ч": ["ch", "c"],
    "ш": ["sh"],
    "щ": ["shch"],
    "ъ": ["ie", ""],
    "ы": ["y"],
    "ь": [""],
    "э": ["e", "a"],
    "ю": ["iu", "u"],
    "я": ["ia", "ya"],
}

def translit(
    word: str,
    translit_map,
    results_limit = 1,
    add_silent_e: bool = False,
):
    if len(word) == 0 and add_silent_e:
        return {"", "e"}
    elif len(word) == 0:
        return {""}

    res = set()

    # If character is not in a mapping table leave it as is
    if word[0] not in translit_map:
        tres = translit(word[1:], translit_map, results_limit, add_silent_e)
        for replaced_tail in tres:
            res.add(word[0] + replaced_tail)
        return res

    # Recursively add all possible transliteration combinations
    for k, v in sorted(translit_map.items(), key=lambda x: -len(x[0])):
        if word.startswith(k):
            tres = translit(word[len(k) :], translit_map, results_limit, add_silent_e)
            for replaced_tail in tres:
                for replacement_variant in v:
                    if results_limit is not None and len(res) >= results_limit:
                        return res
                    res.add(replacement_variant + replaced_tail)

    return res

In [8]:
non_alphanum_regex = re.compile("[^0-9a-zA-Zа-яА-ЯёЁ ]+")

def simple_transform(s, del_brackets=True):
    s0 = s
    s = s.lower()
    s = s.replace('ооо', '').replace('ооо', '').replace('oao', '').replace('оао', '')
    s = s.replace('ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ'.lower(), '')
    s = s.replace('ь', '')
    s = list(translit(s, translitMapRuEn))[0]
    s = unidecode.unidecode(s)

    s = s.replace(',', ' ').replace('.', '').replace('*', '') \
        .replace('"', ' ').replace("'", ' ').replace('-', ' ').replace('&', ' ') \
        .replace('\\', '').replace('?', ' ')
    s = s.replace('(', ' (')
    s = s.replace(')', ') ')
    s = s.replace('( ', '(')
    s = s.replace(' )', ')')
    if len(s.strip()) > 0 and s.strip()[0] == '(':
        s = s.strip()[1:]
    if del_brackets:
        s = re.sub("\(.*\)", "", s)
    s = s.replace('(', '').replace(')', '')
    s = re.sub(non_alphanum_regex, '', s)
    #s = legal_re.sub('', s)
    s = ' '.join(s.split())
    return s

In [9]:
# res = requests.get('https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country')
# soup = BeautifulSoup(res.content, "lxml")

# with open('legal_entity_types_by_country.txt', 'w', encoding='utf-8') as fout:
#     fout.write(str(soup))


with open('legal_entity_types_by_country.txt', 'r', encoding='utf-8') as fin:
    s = fin.read()
soup = BeautifulSoup(s, "lxml")

countries0 = []

legal = set()
for e in soup.find_all('h2')[1:]:
    country = e.find('span').text
    countries0.append(country.lower())
    if country == 'See also':
        break
#     print()
    for ee in e.findNext('ul').find_all('li'):
        v = ee.text.split('(')[0].split(':')[0].split('≈')[0].split('=')[0].split('–')[0]
        for vv in v.split('/'):
            vv = vv.strip()
            legal.add(vv.lower())
            legal.add(vv.lower().replace('.', ''))
#    print()

legal.add('pvt')
legal |= {'de', 'international', 'industries', 'industria', 'imp', 'exp'}

len(legal)

714

In [10]:
import pycountry
import geonamescache

gc = geonamescache.GeonamesCache()

countries = countries0 + [country.name.lower() for country in pycountry.countries]
countries.append('usa')
countries.append('africa')
countries.append('asia')
countries.append('europe')
countries.append('america')
countries.append('north')
countries.append('south')
countries.append('west')
countries.append('east')
countries.append('city')
countries.append('area')

countries = set(countries)
print(len(countries))

for k, v in gc.get_countries().items():
    c = simple_transform(v['name'])
    if c not in countries:
        countries.add(c)
print(len(countries))
        
for k, v in gc.get_us_states().items():
    c = simple_transform(v['name'])
    if c not in countries:
        countries.add(c)
print(len(countries))
        
cities = set()
for k, v in gc.get_cities().items():
    c = simple_transform(v['name'])
    cities.add(c)
print(len(cities))

cities_alt = set()
for k, v in gc.get_cities().items():
    c = simple_transform(v['name'])
    cities_alt.add(c)
    for e in v['alternatenames']:
        c = simple_transform(e)
        cities_alt.add(e)
print(len(cities_alt))

269
300
350
23134
263065


In [11]:
legal_tokens = set()
for e in legal:
    legal_tokens.add(re.sub(non_alphanum_regex, '', e))
    for t in e.split():
        ut = unidecode.unidecode(t)
        for tt in [t, re.sub(non_alphanum_regex, '', t),
                   ut, re.sub(non_alphanum_regex, '', ut)]:
            if len(tt) > 2 and tt not in legal and tt not in legal_tokens:
                #print(tt)
                legal_tokens.add(tt)
len(legal_tokens)

1203

In [12]:
def multi_str_replace(strings, debug=True):
    re_str = r'\b(?:' + '|'.join(
        [re.escape(s) for s in strings]
    ) + r')(?!\S)'
    if debug:
        print(re_str)
    return re.compile(re_str, re.UNICODE)

legal_re = multi_str_replace([rf"{entity}" for entity in legal | legal_tokens if len(entity) > 1], debug=False)
countries_re = multi_str_replace([rf"{entity}" for entity in countries], debug=False)

t = r'sibur    gmbh inc. gmbh inc pvgmbh  b.v. bova inc.'
t = legal_re.sub('', t)
t

'sibur     pvgmbh  b.v. bova '

In [13]:
all_tokens = Counter()
for e in docs:
    all_tokens.update(simple_transform(e).split())
len(all_tokens)

16380

In [14]:
docs_tokens = dict()
for i, token in enumerate(all_tokens):
    docs_tokens[token] = nlp_en(token)
    if i % 1000 == 0:
        print(i, len(all_tokens))

0 16380
1000 16380
2000 16380
3000 16380
4000 16380
5000 16380
6000 16380
7000 16380
8000 16380
9000 16380
10000 16380
11000 16380
12000 16380
13000 16380
14000 16380
15000 16380
16000 16380


In [15]:
geo_add = set()
kk = 0
for token in docs_tokens:
    doc = docs_tokens[token]
    for e in doc.ents:
        if e.label_ == 'GPE':
            geo_add.add(token)
            if token not in cities_alt and token not in countries:
                #print(token)
                kk += 1

print(len(geo_add), kk)
geo_re = multi_str_replace([rf"{entity}" for entity in geo_add], debug=False)

869 501


In [16]:
global_s2tokens = dict()
global_s2tokens[(0, 0, 0)] = dict()
global_s2tokens[(1, 0, 0)] = dict()
#global_s2tokens[(1, 1, 0)] = dict()
global_s2tokens[(1, 0, 1)] = dict()

def tokenize(s, del_brackets=True, only_org=False, use_freq=False):
    h = (int(del_brackets), int(only_org), int(use_freq))
    if s not in global_s2tokens[h]:
#         for ent in docs[s].ents:
#             if ent.label_ == 'GPE':
#                 s_new = s.replace(ent.text, '')
#                 if len(s_new) != 0:
#                     s = s_new
        s1 = simple_transform(s, del_brackets=del_brackets)
        s2 = legal_re.sub('', s1)
        s3 = countries_re.sub('', s2)
        s4 = geo_re.sub('', s3)
        
        arr = s4.split()

        arr_new = []
        i = 0
        while i < len(arr) - 1:
            ss = arr[i] + ' ' + arr[i + 1]
            ss1 = arr[i] + arr[i + 1]
            if ss in cities_alt:# or ss1 in legal_tokens and len(ss1) > 2:
                #print(arr[i] + ' ' + arr[i + 1])
#                 if ss in cities_alt:
#                     new_cities.update([ss])
                i += 1
            else:
                arr_new.append(arr[i])
            i += 1

        if i == len(arr) - 1:
            arr_new.append(arr[-1])
        arr = list(arr_new)
        
        if only_org:
            arr = [e for e in arr if e in org_tokens]
        

        s5 = ' '.join(arr)

        s6 = ''
        last = None
        for ch in s5:
            if last != ch:
                last = ch
                s6 += ch
        
        if use_freq and len(arr) > 0:
            arr = s6.split()
            arr = sorted(arr, key=lambda x: -tokens_freq.get(x, 0))[:1]
            s6 = ' '.join(arr)
        
        global_s2tokens[h][s] = None
        if only_org:
            global_s2tokens[h][s] = s6.split()
        else:
            for ss in [s6, s5, s4, s3, s2, s1, s]:
                res = ss.split()
                if len(res) != 0:
                    global_s2tokens[h][s] = res
                    break
        if global_s2tokens[h][s] is None:
            print(s)
            raise
    return global_s2tokens[h][s]

In [17]:
def prepare(train, use_simple=False):
    clusters = list()
    k2ind = dict()

    freq = defaultdict(lambda: [0, 0, 0, 0, 0, 0, 0])
    cl2cl_neg = defaultdict(int)
    for t1, t1s, t2, t2s, y in zip(train['name_1_tokens'], train['name_1_tokens_simple'],
                                   train['name_2_tokens'], train['name_2_tokens_simple'],
                                   train['is_duplicate']):
        if not use_simple:
            st1 = set(t1)
            st2 = set(t2)
        else:
            st1 = t1s
            st2 = t2s

        k1 = tuple(sorted(st1))
        k2 = tuple(sorted(st2))
        c1 = k2ind.get(k1)
        c2 = k2ind.get(k2)
        if y == 1:
#             c1 = c2 = None
#             for i, c in enumerate(clusters):
#                 if k1 in c:
#                     c1 = i
#                 if k2 in c:
#                     c2 = i
            if c1 is not None:
                if c2 is not None:
                    if c1 != c2:
                        clusters[c1] |= clusters[c2]
                        for e in clusters[c2]:
                            k2ind[e] = c1
                        clusters[c2] = set()
                else:
                    clusters[c1] |= set([k2])
                    k2ind[k2] = c1
            else:
#                if c1 is not None:
                if c2 is not None:
                    clusters[c2] |= set([k1])
                    k2ind[k1] = c2
                else:
                    clusters.append({k1, k2})
                    k2ind[k1] = len(clusters) - 1
                    k2ind[k2] = len(clusters) - 1
#         else:
#             if c1 is None:
#                 clusters.append({k1})
#                 k2ind[k1] = len(clusters) - 1
#             if c2 is None:
#                 clusters.append({k2})
#                 k2ind[k2] = len(clusters) - 1
                

        for w1 in t1:
            freq[w1][0] += 1        
            if w1 not in t2:
                freq[w1][1 + y] += 1
            else:
                freq[w1][3 + y] += 1
        for w2 in t2:
            freq[w2][0] += 1        
            if w2 not in t1:
                freq[w2][1 + y] += 1
            else:
                freq[w2][3 + y] += 1

    for t1, t1s, t2, t2s, y in zip(train['name_1_tokens'], train['name_1_tokens_simple'],
                                   train['name_2_tokens'], train['name_2_tokens_simple'],
                                   train['is_duplicate']):
        if not use_simple:
            st1 = set(t1)
            st2 = set(t2)
        else:
            st1 = t1s
            st2 = t2s
        k1 = tuple(sorted(st1))
        k2 = tuple(sorted(st2))
        c1 = k2ind.get(k1)
        c2 = k2ind.get(k2)
        if y == 0:
            cl2cl_neg[(c1, c2)] += 1
            cl2cl_neg[(c2, c1)] += 1
                
    for c in clusters:
        cc = list(c)
        for i1 in range(len(c)):
            for i2 in range(i1 + 1, len(c)):
                for t1, t2 in [(cc[i1], cc[i2]), (cc[i2], cc[i1])]:
                    for w1 in t1:
                        if w1 not in t2:
                            freq[w1][5] += 1
                        else:
                            freq[w1][6] += 1
    
    return clusters, k2ind, freq, cl2cl_neg

In [18]:
tokens_freq = Counter()
for e in set(train['name_1']):
    tokens_freq.update(tokenize(e)[:2])
for e in set(train['name_2']):
    tokens_freq.update(tokenize(e)[:2])

In [19]:
%%time
for col in ['name_1', 'name_2']:
    print(col)
    train[f'{col}_tokens'] = [tokenize(e) for e in train[col]]
    train[f'{col}_tokens_with_br'] = [tokenize(e, del_brackets=False) for e in train[col]]
    train[f'{col}_tokens_simple'] = [simple_transform(e).split() for e in train[col]] 
#     train[f'{col}_tokens_org'] = [tokenize(e, del_brackets=True, only_org=True) for e in train[col]]
    train[f'{col}_tokens_freq'] = [tokenize(e, del_brackets=True, only_org=False, use_freq=True) for e in train[col]]    

name_1
name_2
Wall time: 28.9 s


In [20]:
%%time

clusters, k2ind, freq, cl2cl_neg = prepare(train)
clusters_simple, k2ind_simple, freq_simple, _ = prepare(train, use_simple=True)

Wall time: 5.7 s


In [21]:
train['cv'] = -np.arange(1, len(train) + 1)
ind_positive = (
    train['name_1_tokens_simple'].apply(lambda x: k2ind_simple.get(tuple(sorted(set(x))))) == 
    train['name_2_tokens_simple'].apply(lambda x: k2ind_simple.get(tuple(sorted(set(x)))))
)
train.loc[ind_positive, 'cv'] = train.loc[ind_positive, 'name_1_tokens_simple'] \
    .apply(lambda x: k2ind_simple.get(tuple(sorted(set(x)))))

In [22]:
train['cv'].value_counts()

 81        645
 106       310
 68        214
 33        177
 66        140
          ... 
-117396      1
-127637      1
-129686      1
-123543      1
-225426      1
Name: cv, Length: 494775, dtype: int64

In [23]:
def calc_is_one_cluster(t1, t2, clusters, k2ind):
    st1 = set(t1)
    st2 = set(t2)
    k1 = tuple(sorted(st1))
    k2 = tuple(sorted(st2))
    c1 = k2ind.get(k1)
    c2 = k2ind.get(k2)
    if c1 is not None and c2 is not None and c1 == c2:
        return 1
    return 0


def calc_cnt_negative(t1, t2, clusters, k2ind, cl2cl_neg):
    st1 = set(t1)
    st2 = set(t2)
    k1 = tuple(sorted(st1))
    k2 = tuple(sorted(st2))
    c1 = k2ind.get(k1)
    c2 = k2ind.get(k2)
    if c1 is not None and c2 is not None:
        return cl2cl_neg.get((c1, c2), 0)
    return -1


def get_cluster_size(t1, clusters, k2ind):
    st1 = set(t1)
    k1 = tuple(sorted(st1))
    c1 = k2ind.get(k1)
    return len(clusters[c1]) if c1 is not None else 0


def calc_lcs(s1, s2):
    c = []
    for i in range(len(s1) + 1):
        c.append([0] * (len(s2) + 1))
    for i in range(0, len(s1)):
        for j in range(0, len(s2)):
            if s1[i] == s2[j]:
                c[i + 1][j + 1] = c[i][j] + 1
            else:
                c[i + 1][j + 1] = max(c[i][j + 1], c[i + 1][j])
    return c[len(s1)][len(s2)]

def expand_tokens(t):
    res = set(t)
    for i in range(len(t) - 1):
        res.add(t[i] + t[i + 1])
    for i in range(len(t) - 2):
        res.add(t[i] + t[i + 1] + t[i + 2])
    for i in range(len(t) - 3):
        res.add(t[i][0] + t[i + 1][0] + t[i + 2][0])    
    for i in range(len(t) - 4):
        res.add(t[i][0] + t[i + 1][0] + t[i + 2][0] + t[i + 3][0])    
    return res


def op(a, b, agg):
    if agg == 'min':
        return min(a, b)
    if agg == 'max':
        return max(a, b)
    if agg == 'sum':
        return a + b
    raise


def calc_features(df_te,
                  clusters_list, k2ind_list, freq_list, cl2cl_neg_list,
                  #clusters_simple_list, k2ind_simple_list
                 ):
    res = df_te.copy()
    
    features_te = defaultdict(list)
    
    for i, df, df_features in [
        (0, df_te, features_te)
    ]:
        print(i)
#        for s1, t1, t1s, t1b, t1o, t1f, s2, t2, t2s, t2b, t2o, t2f in (
        for s1, t1, t1s, t1b, t1f, s2, t2, t2s, t2b, t2f in (
            zip(df['name_1'], df['name_1_tokens'], df['name_1_tokens_simple'],
                df['name_1_tokens_with_br'], df['name_1_tokens_freq'],
                df['name_2'], df['name_2_tokens'], df['name_2_tokens_simple'],
                df['name_2_tokens_with_br'], df['name_2_tokens_freq']
        )):
            df_features['len_max'].append(max(len(s1), len(s2)))
            df_features['len_min'].append(min(len(s1), len(s2)))
            df_features['len_diff'].append(abs(len(s1) - len(s2)))
            df_features['len_diff_rel'].append(abs(len(s1) - len(s2)) / (len(s1) + len(s2)))

            df_features['len_t_max'].append(max(len(t1), len(t2)))
            df_features['len_t_min'].append(min(len(t1), len(t2)))
            df_features['len_t_diff'].append(abs(len(t1) - len(t2)))
            df_features['len_t_diff_rel'].append(abs(len(t1) - len(t2)) / (len(t1) + len(t2)))

            st1 = set(t1)
            st2 = set(t2)
            num, den = len(st1 & st2), len(st1 | st2)
            df_features['sim_tokens_num'].append(num)
            df_features['sim_tokens_den'].append(den)
            df_features['sim_tokens'].append(num / den)
            
            st1_exp = expand_tokens(t1) | set(t1b)
            st2_exp = expand_tokens(t2) | set(t2b)
            num1, den1 = len(st1 & st2_exp), len(st1)
            num2, den2 = len(st2 & st1_exp), len(st2)
            df_features['sim_tokens_exp_min'].append(min(num1 / den1, num2 / den2))
            df_features['sim_tokens_exp_max'].append(max(num1 / den1, num2 / den2))
            
            e1, e2 = ''.join(t1), ''.join(t2)
            ll = calc_lcs(e1, e2)
            try:
                df_features['lcs'].append(ll)
                df_features['lcs_norm'].append(ll / min(len(e1), len(e2)))
                df_features['lcs_norm_max'].append(ll / max(len(e1), len(e2)))
                df_features['lcs_norm_sum'].append(ll / (len(e1) + len(e2)))
            except:
                print(e1, e2, t1, t2)
                raise
            
            for i in range(1, 4):
                e1, e2 = ''.join(t1[:i]), ''.join(t2[:i])
                ll = calc_lcs(e1, e2)
                df_features[f'lcs{i}_norm'].append(ll / min(len(e1), len(e2)))

            e1, e2 = simple_transform(s1), simple_transform(s2)
            ll = calc_lcs(e1, e2)
            df_features['lcs_raw_norm'].append(ll / min(len(e1), len(e2)))
            
#             e1, e2 = ''.join(t1o), ''.join(t2o)
#             ll = calc_lcs(e1, e2)
#             df_features['lcs_org_norm'].append((ll + 0.01) / (min(len(e1), len(e2)) + 0.1))

            e1, e2 = ''.join(t1f), ''.join(t2f)
            ll = calc_lcs(e1, e2)
            df_features['lcs_freq_norm'].append((ll + 0.01) / (min(len(e1), len(e2)) + 0.1))
            
            c_features = defaultdict(list)
            for clusters, k2ind, freq, cl2cl_neg in zip(
#            for clusters, k2ind, freq, cl2cl_neg, clusters_simple, k2ind_simple in zip(
                clusters_list, k2ind_list, freq_list, cl2cl_neg_list#, clusters_simple_list, k2ind_simple_list
            ):
                #c_features['is_one_cluster_simple'].append(calc_is_one_cluster(t1s, t2s, clusters_simple, k2ind_simple))
                
                c_features['is_one_cluster'].append(calc_is_one_cluster(t1, t2, clusters, k2ind))
                
                c_features['cnt_neg_cluster'].append(calc_cnt_negative(t1, t2, clusters, k2ind, cl2cl_neg))
                c_features['cluster_size_max'].append(max(get_cluster_size(t1, clusters, k2ind), 
                                                          get_cluster_size(t2, clusters, k2ind)))

                for ind in range(1, 7):
                    for agg in ['sum', 'min', 'max']:
                        score_u = score_i = score_d = 0
                        if agg == 'min':
                            score_u = score_i = score_d = 1e6
                        for word in st1 & st2:
                            score_i = op(score_i, (freq[word][ind] + 0.01) / (freq[word][0] + 1), agg)
                        for word in st1 | st2:
                            score_u = op(score_u, (freq[word][ind] + 0.01) / (freq[word][0] + 1), agg)
                        for word in st1 ^ st2:
                            score_d = op(score_d, (freq[word][ind] + 0.01) / (freq[word][0] + 1), agg)

                        c_features[f'words_freq_sim_{agg}_{ind}_num'].append(score_i)
                        c_features[f'words_freq_sim_{agg}_{ind}_den'].append(score_u)
                        c_features[f'words_freq_sim_{agg}_{ind}_diff'].append(score_d)
                        c_features[f'words_freq_sim_{agg}_{ind}_rel1'].append(score_i / score_u)
                        c_features[f'words_freq_sim_{agg}_{ind}_rel2'].append(score_d / score_u)
            for k, v in c_features.items():
                if k in {'is_one_cluster', 'cluster_size_max'}:
                    df_features[k].append(max(v))
                else:
                    df_features[k].append(sorted(v)[len(clusters_list) // 2])
        
    features = []
    for k, v in sorted(features_te.items()):
        features.append(k)
        res[k] = v
    print(features)
    return res, features

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GroupKFold

tr_clusters = []
tr_k2ind = []
tr_freq = []
tr_cl2cl_neg_list = []

df_gr = []

features = None
#np.random.seed(777)
group_kfold = GroupKFold(n_splits=3)
for train_index, test_index in group_kfold.split(train, groups=train['cv']):
    print(len(train_index), len(test_index))
    tr = train.iloc[train_index, :].reset_index(drop=True)
    te = train.iloc[test_index, :].reset_index(drop=True)

    clusters, k2ind, freq, cl2cl_neg = prepare(tr)
    #clusters_simple, k2ind_simple, freq_simple, _ = prepare(tr, use_simple=True)

    te['foo'] = [calc_is_one_cluster(e1, e2, clusters, k2ind) for e1, e2 in zip(te['name_1_tokens'], te['name_2_tokens'])]
    
    display(pd.crosstab(te['foo'], te['is_duplicate']))
    
    df_te, features = calc_features(te, [clusters], [k2ind], [freq], [cl2cl_neg])
                                    #[clusters_simple], [k2ind_simple])
    df_gr.append(df_te)
    
    display(pd.crosstab(df_te['is_one_cluster'], df_te['is_duplicate']))
    
    tr_clusters.append(clusters)
    tr_k2ind.append(k2ind)
    tr_freq.append(freq)
    tr_cl2cl_neg_list.append(cl2cl_neg)

331879 165940


is_duplicate,0,1
foo,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164711,593
1,5,631


0
['cluster_size_max', 'cnt_neg_cluster', 'is_one_cluster', 'lcs', 'lcs1_norm', 'lcs2_norm', 'lcs3_norm', 'lcs_freq_norm', 'lcs_norm', 'lcs_norm_max', 'lcs_norm_sum', 'lcs_raw_norm', 'len_diff', 'len_diff_rel', 'len_max', 'len_min', 'len_t_diff', 'len_t_diff_rel', 'len_t_max', 'len_t_min', 'sim_tokens', 'sim_tokens_den', 'sim_tokens_exp_max', 'sim_tokens_exp_min', 'sim_tokens_num', 'words_freq_sim_max_1_den', 'words_freq_sim_max_1_diff', 'words_freq_sim_max_1_num', 'words_freq_sim_max_1_rel1', 'words_freq_sim_max_1_rel2', 'words_freq_sim_max_2_den', 'words_freq_sim_max_2_diff', 'words_freq_sim_max_2_num', 'words_freq_sim_max_2_rel1', 'words_freq_sim_max_2_rel2', 'words_freq_sim_max_3_den', 'words_freq_sim_max_3_diff', 'words_freq_sim_max_3_num', 'words_freq_sim_max_3_rel1', 'words_freq_sim_max_3_rel2', 'words_freq_sim_max_4_den', 'words_freq_sim_max_4_diff', 'words_freq_sim_max_4_num', 'words_freq_sim_max_4_rel1', 'words_freq_sim_max_4_rel2', 'words_freq_sim_max_5_den', 'words_freq_sim

is_duplicate,0,1
is_one_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164711,593
1,5,631


331879 165940


is_duplicate,0,1
foo,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164719,1151
1,11,59


0
['cluster_size_max', 'cnt_neg_cluster', 'is_one_cluster', 'lcs', 'lcs1_norm', 'lcs2_norm', 'lcs3_norm', 'lcs_freq_norm', 'lcs_norm', 'lcs_norm_max', 'lcs_norm_sum', 'lcs_raw_norm', 'len_diff', 'len_diff_rel', 'len_max', 'len_min', 'len_t_diff', 'len_t_diff_rel', 'len_t_max', 'len_t_min', 'sim_tokens', 'sim_tokens_den', 'sim_tokens_exp_max', 'sim_tokens_exp_min', 'sim_tokens_num', 'words_freq_sim_max_1_den', 'words_freq_sim_max_1_diff', 'words_freq_sim_max_1_num', 'words_freq_sim_max_1_rel1', 'words_freq_sim_max_1_rel2', 'words_freq_sim_max_2_den', 'words_freq_sim_max_2_diff', 'words_freq_sim_max_2_num', 'words_freq_sim_max_2_rel1', 'words_freq_sim_max_2_rel2', 'words_freq_sim_max_3_den', 'words_freq_sim_max_3_diff', 'words_freq_sim_max_3_num', 'words_freq_sim_max_3_rel1', 'words_freq_sim_max_3_rel2', 'words_freq_sim_max_4_den', 'words_freq_sim_max_4_diff', 'words_freq_sim_max_4_num', 'words_freq_sim_max_4_rel1', 'words_freq_sim_max_4_rel2', 'words_freq_sim_max_5_den', 'words_freq_sim

is_duplicate,0,1
is_one_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164719,1151
1,11,59


331880 165939


is_duplicate,0,1
foo,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164712,943
1,3,281


0
['cluster_size_max', 'cnt_neg_cluster', 'is_one_cluster', 'lcs', 'lcs1_norm', 'lcs2_norm', 'lcs3_norm', 'lcs_freq_norm', 'lcs_norm', 'lcs_norm_max', 'lcs_norm_sum', 'lcs_raw_norm', 'len_diff', 'len_diff_rel', 'len_max', 'len_min', 'len_t_diff', 'len_t_diff_rel', 'len_t_max', 'len_t_min', 'sim_tokens', 'sim_tokens_den', 'sim_tokens_exp_max', 'sim_tokens_exp_min', 'sim_tokens_num', 'words_freq_sim_max_1_den', 'words_freq_sim_max_1_diff', 'words_freq_sim_max_1_num', 'words_freq_sim_max_1_rel1', 'words_freq_sim_max_1_rel2', 'words_freq_sim_max_2_den', 'words_freq_sim_max_2_diff', 'words_freq_sim_max_2_num', 'words_freq_sim_max_2_rel1', 'words_freq_sim_max_2_rel2', 'words_freq_sim_max_3_den', 'words_freq_sim_max_3_diff', 'words_freq_sim_max_3_num', 'words_freq_sim_max_3_rel1', 'words_freq_sim_max_3_rel2', 'words_freq_sim_max_4_den', 'words_freq_sim_max_4_diff', 'words_freq_sim_max_4_num', 'words_freq_sim_max_4_rel1', 'words_freq_sim_max_4_rel2', 'words_freq_sim_max_5_den', 'words_freq_sim

is_duplicate,0,1
is_one_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164712,943
1,3,281


In [25]:
from catboost import CatBoostClassifier

params = {
    "iterations": 100,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 1.0,
    "rsm": 0.9,
    "border_count": 10,
    "max_ctr_complexity": 2,
    "random_strength": 1.0,
    "bagging_temperature": 100.0,
    "grow_policy": "SymmetricTree",
    "min_data_in_leaf": 5,
    "langevin": True,
    "diffusion_temperature": 100000,
    "auto_class_weights": 'SqrtBalanced',
    "random_seed": 777
}

iter_scores = []

for i in range(len(df_gr)):
    df_tr = pd.concat([df_gr[j] for j in range(len(df_gr)) if i != j])
    df_te = df_gr[i]

    features_final = [e for e in features]

    cb = CatBoostClassifier(**params, verbose=True, eval_metric='F1')
    cb.fit(df_tr[features_final], df_tr['is_duplicate'],
           eval_set=(df_te[features_final], df_te['is_duplicate']), metric_period=10, use_best_model=False)

    for e in sorted(zip(features_final, cb.feature_importances_), key=lambda x: -x[1])[:10]:
        print(e)
        
    pred = cb.predict_proba(df_te[features])[:, 1]
    df_te['pred'] = pred

    best_thr = df_te.sort_values('pred', ascending=False)['pred'].values[1200]
    
    best_score = f1_score(df_te['is_duplicate'], (pred > best_thr).astype(int))
    print((best_thr, best_score))
    print(classification_report(df_te['is_duplicate'], (pred > best_thr).astype(int)))
    
    iter_scores.append({
        'best_thr': best_thr,
        'best_score': best_score,
        'df_te': df_te   
    })
    
    print(len(df_te[(df_te['is_duplicate'] == 1) & (pred < best_thr)]))

0:	learn: 0.9467730	test: 0.9311903	best: 0.9311903 (0)	total: 223ms	remaining: 22.1s
10:	learn: 0.9610123	test: 0.9343348	best: 0.9343348 (10)	total: 835ms	remaining: 6.75s
20:	learn: 0.9644749	test: 0.9361903	best: 0.9361903 (20)	total: 1.5s	remaining: 5.66s
30:	learn: 0.9675724	test: 0.9325961	best: 0.9361903 (20)	total: 2.15s	remaining: 4.8s
40:	learn: 0.9697006	test: 0.9359099	best: 0.9361903 (20)	total: 2.76s	remaining: 3.97s
50:	learn: 0.9715173	test: 0.9379157	best: 0.9379157 (50)	total: 3.39s	remaining: 3.26s
60:	learn: 0.9739005	test: 0.9390188	best: 0.9390188 (60)	total: 4.01s	remaining: 2.56s
70:	learn: 0.9756465	test: 0.9391505	best: 0.9391505 (70)	total: 4.61s	remaining: 1.88s
80:	learn: 0.9765670	test: 0.9408725	best: 0.9408725 (80)	total: 5.19s	remaining: 1.22s
90:	learn: 0.9774560	test: 0.9422835	best: 0.9422835 (90)	total: 5.81s	remaining: 574ms
99:	learn: 0.9788131	test: 0.9413860	best: 0.9422835 (90)	total: 6.35s	remaining: 0us

bestTest = 0.9422834857
bestIteration

In [26]:
df_tr_all = pd.concat(df_gr)
df_tr_all = df_tr_all[(df_tr_all['is_one_cluster'] == 0)].reset_index(drop=True)
display(pd.crosstab(df_tr_all['is_one_cluster'], df_tr_all['is_duplicate']))

is_duplicate,0,1
is_one_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,494142,2687


In [27]:
features_final = [e for e in features if e not in {
    'is_one_cluster', 'cluster_size_max'
}]
len(features_final)

113

In [28]:
params = {
    "iterations": 100,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 1.0,
    "rsm": 0.9,
    "border_count": 10,
    "max_ctr_complexity": 2,
    "random_strength": 1.0,
    "bagging_temperature": 100.0,
    "grow_policy": "SymmetricTree",
    "min_data_in_leaf": 5,
    "langevin": True,
    "diffusion_temperature": 100000,
    "auto_class_weights": 'SqrtBalanced',
    "random_seed": 777
}

In [29]:
cbs = dict()
for i in range(10):
    print(i)
    params['random_seed'] = i
    cb = CatBoostClassifier(**params, verbose=True, eval_metric='F1')
    cb.fit(df_tr_all[features_final], df_tr_all['is_duplicate'], metric_period=10)
    cbs[i] = cb
    
#     check({i: cb})
    
    #break

0
0:	learn: 0.9161167	total: 91.5ms	remaining: 9.05s
10:	learn: 0.9392618	total: 1.04s	remaining: 8.42s
20:	learn: 0.9426301	total: 1.99s	remaining: 7.5s
30:	learn: 0.9475476	total: 3.04s	remaining: 6.77s
40:	learn: 0.9499150	total: 3.94s	remaining: 5.67s
50:	learn: 0.9550087	total: 4.9s	remaining: 4.71s
60:	learn: 0.9593658	total: 5.87s	remaining: 3.75s
70:	learn: 0.9622686	total: 6.8s	remaining: 2.78s
80:	learn: 0.9639298	total: 7.92s	remaining: 1.86s
90:	learn: 0.9651679	total: 8.94s	remaining: 884ms
99:	learn: 0.9656934	total: 9.8s	remaining: 0us
1
0:	learn: 0.9029725	total: 81.9ms	remaining: 8.11s
10:	learn: 0.9446329	total: 967ms	remaining: 7.83s
20:	learn: 0.9475559	total: 1.84s	remaining: 6.94s
30:	learn: 0.9495980	total: 2.75s	remaining: 6.11s
40:	learn: 0.9518665	total: 3.62s	remaining: 5.2s
50:	learn: 0.9589325	total: 4.51s	remaining: 4.34s
60:	learn: 0.9605202	total: 5.47s	remaining: 3.5s
70:	learn: 0.9628023	total: 6.39s	remaining: 2.61s
80:	learn: 0.9644021	total: 7.28s	r

In [30]:
for e in sorted(zip(features_final, cbs[0].feature_importances_), key=lambda x: -x[1]):
    print(e)

('lcs1_norm', 10.836996578043953)
('words_freq_sim_min_1_rel1', 9.786178011122107)
('lcs', 9.437657717958901)
('words_freq_sim_min_5_diff', 6.392731985589871)
('words_freq_sim_min_1_den', 5.6364777650020645)
('words_freq_sim_min_3_num', 4.383806156131153)
('words_freq_sim_min_3_diff', 3.925656267737122)
('words_freq_sim_min_3_den', 3.3945231886992246)
('words_freq_sim_min_3_rel1', 3.190884986129143)
('lcs2_norm', 2.358971847368973)
('lcs_norm', 2.2557746055435817)
('lcs_raw_norm', 1.8751969460497075)
('words_freq_sim_sum_1_num', 1.5623252447948135)
('words_freq_sim_max_3_den', 1.5516408599644305)
('words_freq_sim_min_4_diff', 1.4906749689136913)
('words_freq_sim_max_6_num', 1.399732928133153)
('words_freq_sim_max_5_den', 1.2434633796984678)
('words_freq_sim_max_4_den', 1.2172952660007752)
('words_freq_sim_sum_6_num', 1.1829128716331652)
('words_freq_sim_sum_1_diff', 1.0447210102283606)
('words_freq_sim_sum_5_num', 1.029237263219043)
('words_freq_sim_min_2_num', 0.9321574977434453)
('wo

# Прогнозы на тесте

In [31]:
test = pd.read_csv(DATA_DIR.joinpath('test.csv'), index_col="pair_id")

In [32]:
k = 0
for i, s in enumerate(set(test['name_1']) | set(test['name_2'])):
    if s not in docs:
        docs[s] = nlp_en(s)
        k += 1
print(k)

255


In [33]:
new_tokens = set()
for e in docs:
    for t in simple_transform(e).split():
        if t not in all_tokens:
            new_tokens.add(t)
print(len(new_tokens))
for token in new_tokens:
    docs_tokens[token] = nlp_en(token)

124


In [34]:
geo_add = set()
kk = 0
for token in docs_tokens:
    doc = docs_tokens[token]
    for e in doc.ents:
        if e.label_ == 'GPE':
            geo_add.add(token)
            if token not in cities_alt and token not in countries:
                #print(token)
                kk += 1

print(len(geo_add), kk)
geo_re = multi_str_replace([rf"{entity}" for entity in geo_add], debug=False)

879 507


In [35]:
%%time
for col in ['name_1', 'name_2']:
    print(col)
    test[f'{col}_tokens'] = [tokenize(e) for e in test[col]]
    test[f'{col}_tokens_with_br'] = [tokenize(e, del_brackets=False) for e in test[col]]
    test[f'{col}_tokens_simple'] = [simple_transform(e).split() for e in test[col]] 
#     test[f'{col}_tokens_org'] = [tokenize(e, del_brackets=True, only_org=True) for e in test[col]]
    test[f'{col}_tokens_freq'] = [tokenize(e, del_brackets=True, only_org=False, use_freq=True) for e in test[col]]    

name_1
name_2
Wall time: 11.2 s


In [36]:
df_te_all_all, _ = calc_features(test, tr_clusters, tr_k2ind, tr_freq, tr_cl2cl_neg_list)

0
['cluster_size_max', 'cnt_neg_cluster', 'is_one_cluster', 'lcs', 'lcs1_norm', 'lcs2_norm', 'lcs3_norm', 'lcs_freq_norm', 'lcs_norm', 'lcs_norm_max', 'lcs_norm_sum', 'lcs_raw_norm', 'len_diff', 'len_diff_rel', 'len_max', 'len_min', 'len_t_diff', 'len_t_diff_rel', 'len_t_max', 'len_t_min', 'sim_tokens', 'sim_tokens_den', 'sim_tokens_exp_max', 'sim_tokens_exp_min', 'sim_tokens_num', 'words_freq_sim_max_1_den', 'words_freq_sim_max_1_diff', 'words_freq_sim_max_1_num', 'words_freq_sim_max_1_rel1', 'words_freq_sim_max_1_rel2', 'words_freq_sim_max_2_den', 'words_freq_sim_max_2_diff', 'words_freq_sim_max_2_num', 'words_freq_sim_max_2_rel1', 'words_freq_sim_max_2_rel2', 'words_freq_sim_max_3_den', 'words_freq_sim_max_3_diff', 'words_freq_sim_max_3_num', 'words_freq_sim_max_3_rel1', 'words_freq_sim_max_3_rel2', 'words_freq_sim_max_4_den', 'words_freq_sim_max_4_diff', 'words_freq_sim_max_4_num', 'words_freq_sim_max_4_rel1', 'words_freq_sim_max_4_rel2', 'words_freq_sim_max_5_den', 'words_freq_sim

In [43]:
df_te_all_all['pred'] = 0
k = 0
for i in cbs:
    pred = cbs[i].predict_proba(df_te_all_all[features_final], ntree_end=80)[:, 1]
    df_te_all_all[f'pred{i}'] = pred
    df_te_all_all['pred'] += pred / len(cbs)
    k += 1
#     if i == 0:
#         break

#df_te_all_all['pred_final'] =  df_te_all_all['pred']
df_te_all_all['pred_final'] =  df_te_all_all[[f'pred{i}' for i in range(k)]].min(axis=1)

topn = 1600

col = 'pred_final'
df_te_all_all.loc[df_te_all_all['is_one_cluster'] == 1, col] = 1
thr_topn = df_te_all_all.sort_values(col)[col].values[::-1][topn]
df_te_all_all['is_duplicate'] = (df_te_all_all[col] > thr_topn).astype(int)

In [44]:
df_te_all_all[['is_duplicate']].to_csv('subm_final.csv')