In [None]:
# install libs
!pip install cleanco
!pip install transliterate
!pip install thefuzz
!pip install abydos
!pip install python-Levenshtein
!pip install jellyfish
!pip install distance

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# NOTE(review): no visible cell reads from or writes to this mount -- train.csv
# is fetched with wget and features.csv is written to the working directory --
# so confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import re
import unicodedata
from cleanco import basename
from transliterate.decorators import transliterate_function

import difflib
import abydos.distance as abd
from thefuzz import fuzz
import Levenshtein, distance, jellyfish
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Download the training data and store it as ./train.csv (-O sets the output file name).
!wget -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/03C9AjRJqukWcg

In [None]:
# Load the labelled company-name pairs; `pair_id` becomes the row index.
TRAIN_PATH = 'train.csv'
df = pd.read_csv(TRAIN_PATH, index_col='pair_id')
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


In [None]:
# Class balance of the target: relative frequency of each `is_duplicate` value.
df['is_duplicate'].value_counts(normalize=True)

0    0.992652
1    0.007348
Name: is_duplicate, dtype: float64

#### Checking what share of name pairs contain Cyrillic characters in "name_1" or "name_2"

In [None]:
# Lowercase Russian alphabet, assembled from code points.
a = ord('а')  # code point of the lowercase Cyrillic letter "а" (U+0430)
rus_alphabet = ''.join(
    [chr(i) for i in range(a, a + 6)]            # а-е
    + [chr(a + 33)]                              # ё (U+0451) lies outside the contiguous а-я range
    + [chr(i) for i in range(a + 6, a + 32)]     # ж-я
)

# Build the lookup set once instead of once per row.
rus_letters = frozenset(rus_alphabet)

# True for a pair when the concatenation of both names holds at least one Russian letter.
check = (df.name_1 + df.name_2).apply(lambda names: not rus_letters.isdisjoint(names.lower()))
check.value_counts(normalize=True)

False    0.99507
True     0.00493
dtype: float64

#### Target variable distribution when comparing strings with Cyrillic characters:

In [None]:
# Class balance of the target restricted to the pairs flagged by `check`.
df.loc[check, 'is_duplicate'].value_counts(normalize=True)

0    0.939283
1    0.060717
Name: is_duplicate, dtype: float64

#### Preprocessing

In [None]:
def preprocessing(x: str) -> str:
    
    def rus_preprocess(x: str) -> str:
        forms_of_ownership = [
            'ооо',
            'оао',
            'общество с ограниченной ответственностью',
            'открытое акционерное общество',
            'филиал компании'
        ]
        
        for form in forms_of_ownership:
            pattern = re.compile(form)
            x = pattern.sub('', x)
            
        return x
    
    @transliterate_function(language_code='ru', reversed=True)
    def translit(x: str) -> str:
        return x
    
    
    x = x.strip().casefold() # analoque of lower(), if simbols in latin1
    x = basename(x) if not bool(set(rus_alphabet).intersection(set(x.lower()))) else rus_preprocess(x)
    x = unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode() if not bool(set(rus_alphabet).intersection(set(x.lower()))) else translit(x)
    x = basename(x)
    x = re.sub(r'[^\w\s]',' ', x)
    
    return ' '.join([s for s in x.split() if len(s) > 1])

In [None]:
# Clean both name columns with the same normalisation routine.
for column in ('name_1', 'name_2'):
    df[column] = df[column].apply(preprocessing)
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries,enormous industrial trade,0
2,apcotex industries,technocraft industries india,0
3,rishichem distributors,dsa,0
4,powermax rubber factory,co one,0
5,tress,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber,0
497816,bnd trading,zhong shan yue liang economy trade imp exp,0
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0
497818,shanghai kechuan trading,shanghai stationery,0


#### Feature generation

In [None]:
# Pairwise string-similarity features computed from (name_1, name_2).
#
# The abydos scorers are created once and reused for every row instead of
# being re-instantiated per pair, and the two names are passed straight to
# each scorer rather than being joined with '|' and split again.
_discounted_levenshtein = abd.DiscountedLevenshtein()
_ssk = abd.SSK()

# Column name -> scorer(name_1, name_2). Insertion order defines column order.
# thefuzz returns scores on a 0-100 scale, hence the division by 100.
similarity_scorers = {
    'l': Levenshtein.ratio,                                                   # Levenshtein
    'dl': _discounted_levenshtein.sim,                                        # Discounted Levenshtein
    'ssk': _ssk.sim,                                                          # String Subsequence Kernel similarity
    'ts': lambda s1, s2: fuzz.token_sort_ratio(s1, s2) / 100,                 # Token Sort Ratio
    'tsr': lambda s1, s2: fuzz.token_set_ratio(s1, s2) / 100,                 # Token Set Ratio
    'pr': lambda s1, s2: fuzz.partial_ratio(s1, s2) / 100,                    # Partial Ratio
    'wr': lambda s1, s2: fuzz.WRatio(s1, s2) / 100,                           # Weighted Ratio
    'smr': lambda s1, s2: difflib.SequenceMatcher(None, s1, s2).ratio(),      # Sequence Matcher Ratio
}

name_pairs = list(zip(df.name_1, df.name_2))

# Columns go right after name_1/name_2 (positions 2..9). An existing column is
# overwritten in place so the cell can be re-run without df.insert raising.
for position, (column, scorer) in enumerate(similarity_scorers.items(), start=2):
    values = [scorer(first, second) for first, second in name_pairs]
    if column in df.columns:
        df[column] = values
    else:
        df.insert(position, column, values)


In [None]:
# jellyfish-based features, computed directly from (name_1, name_2) pairs.
def _match_rating(first: str, second: str) -> int:
    """Match Rating Approach as 0/1; a None result (handled by the original code) counts as 0."""
    return int(bool(jellyfish.match_rating_comparison(first, second)))


# Column name -> scorer(name_1, name_2). Insertion order defines column order.
jellyfish_scorers = {
    'js': jellyfish.jaro_similarity,            # Jaro Similarity
    'jws': jellyfish.jaro_winkler_similarity,   # Jaro-Winkler Similarity
    'mra': _match_rating,                       # Match Rating Approach
}

jellyfish_pairs = list(zip(df.name_1, df.name_2))

# Positions 10..12 follow the eight features added by the previous cell. An
# existing column is overwritten so the cell can be re-run.
for position, (column, scorer) in enumerate(jellyfish_scorers.items(), start=10):
    values = [scorer(first, second) for first, second in jellyfish_pairs]
    if column in df.columns:
        df[column] = values
    else:
        df.insert(position, column, values)

# Row-wise mean of the ten continuous similarity scores. The columns are named
# explicitly instead of sliced by position, so the result no longer depends on
# the current column layout. The binary `mra` flag is not part of the mean,
# exactly as with the former `df.columns[2:-2]` slice.
# NOTE(review): confirm that leaving `mra` out of the mean is intended.
mean_columns = ['l', 'dl', 'ssk', 'ts', 'tsr', 'pr', 'wr', 'smr', 'js', 'jws']
mean_values = df[mean_columns].mean(axis=1)
if 'mean' in df.columns:
    df['mean'] = mean_values
else:
    df.insert(13, 'mean', mean_values)

In [None]:
# Preview the first two rows together with the generated feature columns.
df.head(n=2)

Unnamed: 0_level_0,name_1,name_2,l,dl,ssk,ts,tsr,pr,wr,smr,js,jws,mra,mean,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,iko industries,enormous industrial trade,0.564103,0.38109,0.58948,0.56,0.56,0.71,0.64,0.564103,0.59039,0.59039,1,0.574955,0
2,apcotex industries,technocraft industries india,0.608696,0.424665,0.671847,0.52,0.71,0.72,0.86,0.565217,0.695106,0.695106,0,0.647064,0


#### Saving the dataframe with features

In [None]:
# Write names, features and target to ./features.csv; the pair_id index is kept as the first column.
df.to_csv('features.csv', index=True)