In [1]:
import pandas as pd
import numpy as np
import math
import re
from tqdm import tqdm_notebook as tqdm
from tqdm.auto import tqdm as progress
import warnings
from datetime import datetime

import json

from nltk.metrics.distance import edit_distance as ed
from nltk.metrics.distance import jaro_winkler_similarity as jw
from textdistance import hamming as hm

from textdistance import jaccard as jc
from textdistance import lcsstr as lcs

import recordlinkage as rl

Hyperparameters:

In [2]:
threshold = 0.8

Parameters:

In [3]:
rem_warnings = True

CDFs:

In [4]:
def match(pattern, options):
    best_match = None
    min_score = 0

    for item in options:

        (p_list, i_list) = (list(pattern.split()), list(item.split()))

        max_pref = min(len(p_list), len(i_list))
        max_len = max(len(p_list), len(i_list))
        weight = 1/max_pref

        metrics = []

        metrics.append(1 - ed(p_list, i_list)/max_len)
        metrics.append(jw(p_list, i_list, p=weight, max_l=max_pref))
        metrics.append(1 - hm(p_list, i_list)/max_len)

        metrics.append(len(lcs(p_list, i_list))/max_pref)
        metrics.append(jc(p_list, i_list))

        int_metric = np.mean(metrics)

        if int_metric > min_score:
            min_score = int_metric
            best_match = item

    return [best_match, min_score]

Supress warnings:

In [5]:
if rem_warnings is True:
    warnings.filterwarnings('ignore')

Pandas progress:

In [6]:
progress.pandas()

Read first source with distinct:

In [7]:
df_dev = pd.read_csv('data/devices_clean.csv').astype({'model_tac_devices': 'str'})
df_dev.drop_duplicates(inplace=True, keep='first', subset=['model_tac_devices'])

Define types:

In [8]:
type_values = {'category_devices': 'str', 'vendor_devices': 'str', 'model_tac_devices': 'int64'}
type_values.update({'model_name_devices': 'str', 'model_name_devices_clean': 'str'})
df_dev = df_dev.astype(type_values)

Show header:

In [9]:
df_dev.head(0)

Unnamed: 0,category_devices,vendor_devices,model_tac_devices,model_name_devices,model_name_devices_clean


Read second source with distinct:

In [10]:
df_market = pd.read_csv('data/market_clean.csv')
df_market.drop_duplicates(inplace=True, keep='first', subset=['model_id_market'])

Define types:

In [11]:
type_values = {'category_market': 'str', 'vendor_market': 'str', 'model_id_market': 'int64'}
type_values.update({'model_name_market': 'str', 'model_name_market_clean': 'str', 'model_url_market': 'str'})
df_market = df_market.astype(type_values)

Show header:

In [12]:
df_market.head(0)

Unnamed: 0,category_market,vendor_market,model_name_market,model_name_market_clean,model_id_market,model_url_market


Create empty frame for matched dataframe:

In [13]:
cn = ['model_tac_devices', 'model_id_market', 'score', 'category_match']
df_match = pd.DataFrame(columns=cn)
frame_index = 0

Record Linkage Left Join:

In [14]:
for i in tqdm(df_dev.index):
    
    model_clean_d = df_dev['model_name_devices_clean'][i]
    model_tac_d = df_dev['model_tac_devices'][i]
    vendor_d = df_dev[df_dev['model_name_devices_clean'] == model_clean_d]['vendor_devices'].values[0]

    vendors_m = df_market[df_market['vendor_market'] == vendor_d]
    
    if vendors_m.empty:
        continue
    else:
        model_mark_clean, match_score = match(model_clean_d, list(vendors_m['model_name_market_clean']))
        
    if model_mark_clean is None:
        continue
        
    if df_market[df_market['model_name_market_clean'] == model_mark_clean]['vendor_market'].values[0] != vendor_d:
        continue

    category_match_value = 1 if (match_score > threshold) else 0    
    id_market_value = df_market[df_market['model_name_market_clean'] == model_mark_clean]['model_id_market'].values[0]
    
    dict_match = {'model_tac_devices': model_tac_d, 'model_id_market': id_market_value}               
    dict_match.update({'score': match_score, 'category_match': category_match_value})
    df_match = df_match.append(pd.DataFrame(dict_match, index=[frame_index]),  ignore_index=True) 

HBox(children=(IntProgress(value=0, max=9885), HTML(value='')))




Format score:

In [15]:
df_match['score'] = df_match['score'].apply(lambda x: float('{0:.2f}'.format(x)))
df_match = df_match.sort_values(by=['score'], ascending=False)

Matching statistics:

In [16]:
match_min = min(len(set(df_market['model_name_market_clean'])), len(set(df_dev['model_name_devices_clean'])))

print('Number of recognized samples:', round(len(df_match[df_match['category_match']==1])))
print('Number of perfect matches:', len(df_match[df_match['score']==1]))
print('Max. number of recognized samples:', round(match_min))

Number of recognized samples: 212
Number of perfect matches: 116
Max. number of recognized samples: 8868


Add market features:

In [17]:
df_match = pd.merge(df_match, df_market, how ='inner', on ='model_id_market')

Left join to devices catalog:

In [18]:
df_match = pd.merge(df_dev, df_match, how ='left', on ='model_tac_devices')

Add 'date' feature:

In [19]:
df_match['business_dt']=datetime.today().strftime('%Y-%m-%d')

Drop features:

In [20]:
df_match = df_match.drop(['model_name_devices_clean', 'model_name_market_clean'], axis = 1)

Define order:

In [21]:
order = ['category_devices', 'vendor_devices', 'model_tac_devices', 'model_name_devices']
order = order + ['category_market', 'vendor_market', 'model_name_market', 'model_id_market', 'model_url_market']
order = order + ['score', 'category_match', 'business_dt']

Change order of features in dataframe:

In [22]:
df_match = df_match[order]

Load to csv:

In [23]:
df_match.to_csv('data/left_join_dev_mark.csv', index=False)