# Вычленение фичей на основании схожести строк

In [113]:
import re
import math
from collections import Counter

import pandas as pd
from fuzzywuzzy import fuzz
import Levenshtein

In [21]:
data = pd.read_csv(
    '../data/processed/unificated_train.csv', 
    index_col='pair_id'
)

In [114]:
data.sample(5)

Unnamed: 0_level_0,name_1,name_2,is_duplicate,transliterated_name_1,transliterated_name_2,concated_name_1,concated_name_2,trans_wratio,trans_partial_ratio,trans_token_sort_ratio,...,trans_levenshtein_ratio,trans_jaro,conc_wratio,conc_partial_ratio,conc_token_sort_ratio,conc_levenshtein,conc_levenshtein_ratio,conc_jaro,trans_cosine,trans_jaccard
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
251537,russell a farrow canada 106 earl,vibac canada inc.,0,russell a farrow canada 106 earl,vibac canada inc.,russellafarrowcanada106earl,vibaccanadainc,0.86,0.53,0.38,...,0.71875,0.52948,0.45,0.5,0.34,20,0.740741,0.43006,0.235702,0.25
312791,leschaco agente de transportes e,ram,0,leschaco agente de transportes e,ram,leschacoagentedetransportese,ram,0.4,0.67,0.11,...,0.9375,0.454861,0.4,0.67,0.13,26,0.928571,0.454861,0.0,0.133333
9905,h.r. international,al khamsan & bhasin gen trading co.,0,h.r. international,al khamsan & bhasin gen trading co.,hrinternational,alkhamsanbhasingentradingco,0.45,0.44,0.37,...,0.8,0.544037,0.43,0.48,0.38,21,0.777778,0.537037,0.0,0.578947
457557,prabhat industries llc,utc,0,prabhat industries llc,utc,prabhatindustriesllc,utc,0.6,0.67,0.24,...,0.863636,0.459596,0.6,0.67,0.26,17,0.85,0.459596,0.0,0.2
476902,technical industrial supply llc,apa industries inc.,0,technical industrial supply llc,apa industries inc.,technicalindustrialsupplyllc,apaindustriesinc,0.55,0.58,0.37,...,0.612903,0.640245,0.56,0.62,0.5,17,0.607143,0.63268,0.0,0.75


## Вычленение фичей из транслитерированных названий

In [23]:
# Ratio of similar characters on both string
data['trans_wratio'] = data.apply(
    lambda row: fuzz.WRatio(
        row.transliterated_name_1,
        row.transliterated_name_2
    ) / 100, 
    axis = 1
)

# Ratio of the most similar substring
data['trans_partial_ratio'] = data.apply(
    lambda row: fuzz.partial_ratio(
        row.transliterated_name_1,
        row.transliterated_name_2
    ) / 100, 
    axis = 1
)

# Measure of the sequences' tokens similarity
data['trans_token_sort_ratio'] = data.apply(
    lambda row: fuzz.token_sort_ratio(
        row.transliterated_name_1,
        row.transliterated_name_2
    ) / 100,
    axis = 1
)

In [25]:
# Levenshtein distance
data['trans_levenshtein'] = data.apply(
    lambda row: Levenshtein.distance(
        row.transliterated_name_1,
        row.transliterated_name_2
    ),
    axis = 1
)

# Levenshtein distance normalized to the maximum length
data['trans_levenshtein_ratio'] = data.apply(
    lambda row: Levenshtein.distance(
        row.transliterated_name_1,
        row.transliterated_name_2
    ) / max(
        len(row.transliterated_name_1),
        len(row.transliterated_name_2)
    ), 
    axis = 1
)

# Jaro distance
data['trans_jaro'] = data.apply(
    lambda row: Levenshtein.jaro(
        row.transliterated_name_1,
        row.transliterated_name_2
    ),
    axis = 1
)

In [33]:
def text_to_vector(text: str) -> Counter:
    '''
    Transform text to vector
    
    Args:
        text: Original text
        
    Returns:
        Corresponding vector
    '''
    word = re.compile(r'\w+')
    words = word.findall(text)
    
    return Counter(words)


def get_cos_distance(string_1: str, string_2: str) -> float:
    '''
    Get cosine distance between two strings
    
    Args:
            string_1: First string
            string_2: Second string
            
    Returns:
        Cosine distance between vectors
    '''
    def _get_cosine(vec_1: Counter, vec_2: Counter) -> float:
        '''
        Get cosine distance between two vectors
        
        Args:
            vec_1: First vector
            vec_2: Second vector
            
        Returns:
            Cosine distance between vectors
        '''
        intersection = set(vec_1.keys()) & set(vec_2.keys())
        numerator = sum([vec_1[x] * vec_2[x] for x in intersection])

        sum1 = sum([vec_1[x]**2 for x in vec_2.keys()])
        sum2 = sum([vec_1[x]**2 for x in vec_2.keys()])
        
        denominator = math.sqrt(sum1) * math.sqrt(sum2)

        if not denominator:
            return 0.0
        else:
            return float(numerator) / denominator
        
    vector_1 = text_to_vector(string_1)
    vector_2 = text_to_vector(string_2)

    return _get_cosine(vector_1, vector_2)

In [34]:
# Cosine distance between strings
data['trans_cosine'] = data.apply(
    lambda row: get_cos_distance(
        row.transliterated_name_1, 
        row.transliterated_name_2
    ), 
    axis = 1
)

In [42]:
def jaccard_similarity(string_1: str, string_2: str) -> float:
    '''
    Get jaccard similarity of two strings

    Args:
      string_1: First string
      string_2: Second string

    Returns:
        Jaccard similarity of string
    '''
    intersection_cardinality = len(set.intersection(*[set(string_1), set(string_2)]))
    union_cardinality = len(set.union(*[set(string_1), set(string_2)]))
    
    return intersection_cardinality/float(union_cardinality)

In [43]:
# Jaccard similarity of strings
data['trans_jaccard'] = data.apply(
    lambda row: jaccard_similarity(
        row.transliterated_name_1, 
        row.transliterated_name_2
    ), 
    axis = 1
)

## Вычленение фичей из очищенных и конкатенированных названий

In [26]:
# Ratio of similar characters on both string
data['conc_wratio'] = data.apply(
    lambda row: fuzz.WRatio(
        row.concated_name_1,
        row.concated_name_2
    ) / 100, 
    axis = 1
)

# Ratio of the most similar substring
data['conc_partial_ratio'] = data.apply(
    lambda row: fuzz.partial_ratio(
        row.concated_name_1, 
        row.concated_name_2
    ) / 100,
    axis = 1
)

# Measure of the sequences' tokens similarity
data['conc_token_sort_ratio'] = data.apply(
    lambda row: fuzz.token_sort_ratio(
        row.concated_name_1, 
        row.concated_name_2
    ) / 100,
    axis = 1
)

In [27]:
# Levenshtein distance
data['conc_levenshtein'] = data.apply(
    lambda row: Levenshtein.distance(
        row.concated_name_1, 
        row.concated_name_2
    ), 
    axis = 1
)

# Levenshtein distance normalized to the maximum length
data['conc_levenshtein_ratio'] = data.apply(
    lambda row: Levenshtein.distance(
        row.concated_name_1, 
        row.concated_name_2
    ) / max(
        len(row.concated_name_1),
        len(row.concated_name_2)
    ), 
    axis = 1
)

# Jaro distance
data['conc_jaro'] = data.apply(
    lambda row: Levenshtein.jaro(
        row.transliterated_name_1, 
        row.concated_name_2
    ), 
    axis = 1
)

## Сохранение датафрейма с фичами

In [115]:
# Dropping all non-numerical columns
data_numerical = data.drop(
    ['name_1',
     'name_2',
     'concated_name_1',
     'concated_name_2',
     'transliterated_name_1',
     'transliterated_name_2'
    ],
    axis=1
)

In [116]:
data_numerical.head()

Unnamed: 0_level_0,is_duplicate,trans_wratio,trans_partial_ratio,trans_token_sort_ratio,trans_levenshtein,trans_levenshtein_ratio,trans_jaro,conc_wratio,conc_partial_ratio,conc_token_sort_ratio,conc_levenshtein,conc_levenshtein_ratio,conc_jaro,trans_cosine,trans_jaccard
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,0.86,0.68,0.55,20,0.555556,0.651731,0.62,0.69,0.58,16,0.551724,0.620819,0.258199,0.666667
2,0,0.86,0.65,0.59,17,0.485714,0.678951,0.65,0.7,0.65,14,0.482759,0.634433,0.57735,0.7
3,0,0.4,0.67,0.12,31,0.939394,0.409091,0.4,0.67,0.13,25,0.925926,0.409091,0.0,0.105263
4,0,0.43,0.29,0.34,20,0.869565,0.408558,0.36,0.4,0.15,19,0.904762,0.495652,0.0,0.235294
5,0,0.6,0.67,0.24,26,0.8125,0.522652,0.64,0.71,0.28,24,0.827586,0.51341,0.0,0.3


In [117]:
# Saving of preprocessed dataset
data_numerical.to_csv('../data/processed/numerical_train.csv')