In [3]:
import pandas as pd
import numpy as np
import math
import re
from tqdm import tqdm_notebook as tqdm
from nltk.metrics.distance import jaro_winkler_similarity
from nltk.metrics.distance import jaccard_distance
from statsmodels.stats.power import TTestIndPower

Hyperparameters:

In [4]:
threshold_min = 0.5  # fuzzy search cutoff
sample_size = 100  # sample volume in %

Power analysis parameters:

In [5]:
effect = 0.9
confidence = 0.9
stat_power = 0.9

Fuzzy match method:

In [6]:
def match(pattern, options):

    best_match = None
    min_score = threshold_min

    for item in options:

        (p_set, i_set) = (set(pattern.split()), set(item.split()))
        (p_list, i_list) = (list(pattern.split()), list(item.split()))
        max_length = min(len(p_list), len(i_list))

        jw_metric = jaro_winkler_similarity(p_list, i_list, p=(1/max_length), max_l=max_length)
        jc_metric = (1-jaccard_distance(p_set, i_set))
        av_metric = 0.5*(jw_metric + jc_metric)

        if av_metric > min_score:
            min_score = av_metric
            best_match = item

    return [best_match, min_score]

Custom functions:

In [7]:
def fill_series(S, E):
    S = S.append(pd.Series(E), ignore_index=True)
    return S

Read device directory in df_devices:

In [8]:
df_dev = pd.read_csv('data/devices_clean.csv')
df_dev = df_dev.astype({'ven_d': 'str', 'model_d': 'str', 'tac_d': 'str'})

Read market catalog:

In [9]:
df_market = pd.read_csv('data/market_clean.csv')[['ven_m', 'class_m', 'model_m']]
df_market = df_market.astype({'ven_m': 'str', 'model_m': 'str', 'class_m': 'str'})

Imbalance between samples:

In [10]:
str('Pattern sample > = Basic sample') if len(df_market) > len(df_dev) else 'Pattern sample < Basic sample'

'Pattern sample > = Basic sample'

Relative base sample size:

In [11]:
ratio = int(100*round(len(df_dev)/len(df_market), 2))
print('Relative base sample size: {}%'.format(ratio))

Relative base sample size: 27%


Series for recognized model, tac numbers and affinity metrics:

In [12]:
match_series, tac_series, score_series = (pd.Series(), pd.Series(), pd.Series())

Shuffle the entries randomly:

In [13]:
df_market = df_market.sample(frac=1).reset_index(drop=True)

Define the sample size:

In [14]:
n_sample = round((sample_size/100)*len(df_market['model_m']))

Matching and filling match_series, tac_series, score_series:

In [15]:
for item in tqdm(df_market['model_m'][:int(n_sample)]):

    m_vendor = df_market[df_market['model_m'] == item]['ven_m'].values[0]

    if not df_dev[df_dev['ven_d'] == m_vendor].empty:
        d_models = list(df_dev[df_dev['ven_d'] == m_vendor]['model_d'])
        match_model, match_score = match(item, d_models)
    else:
        match_series, score_series = (fill_series(match_series, ''), fill_series(score_series, ''))
        tac_series = fill_series(tac_series, '')
        continue

    if match_model is None:
        match_series, score_series = (fill_series(match_series, ''), fill_series(score_series, ''))
        tac_series = fill_series(tac_series, '')
        continue

    tac_series = fill_series(tac_series, df_dev[df_dev['model_d'] == match_model]['tac_d'].values[0])
    score_series, match_series = (fill_series(score_series, match_score), fill_series(match_series, match_model))

HBox(children=(IntProgress(value=0, max=37041), HTML(value='')))




Add the series to the base frame:

In [16]:
df_match = pd.DataFrame({'model_d': match_series, 'score': score_series, 'tac_d': tac_series})
df_market_matched = pd.concat([df_market[:int(n_sample)], df_match], axis=1)

Change empty entries for NaNs:

In [17]:
df_market_matched = df_market_matched.apply(lambda x: x.str.strip() if isinstance(x, str) else x).replace('', np.nan)

Drop records with missing values and order by affinity:

In [18]:
df_market_matched_clean = df_market_matched.dropna(how='any', axis=0)
df_market_matched_clean = df_market_matched_clean.sort_values(by=['score'], ascending=False)

Display first entries:

In [19]:
df_market_matched_clean.head()

Unnamed: 0,ven_m,class_m,model_m,model_d,score,tac_d
16231,zte,мобильные телефоны,blade v 9,blade v 9,1.0,86951103
14158,lenovo,мобильные телефоны,a 6010,a 6010,1.0,99001292
26196,archos,планшеты,70 b helium,70 b helium,1.0,35524607
9191,philips,мобильные телефоны,s 398,s 398,1.0,86858002
19208,supra,планшеты,m 74 ag,m 74 ag,1.0,35163407


Show random sample:

In [20]:
df_market_matched_clean.sample(n=min(5, len(df_market_matched_clean)))

Unnamed: 0,ven_m,class_m,model_m,model_d,score,tac_d
19066,alcatel,мобильные телефоны,pop c 5,one touch pop c 5 dual ot 5036 d,0.555556,86545602
19190,nokia,мобильные телефоны,asha 501 sim,asha 208 dual sim 2 rm 956,0.508598,35727705
779,acer,мультимедиа проекторы,v 7500,v 350,0.583333,35561705
12002,philips,радиотелефоны,d 6051,d 813,0.583333,86496500
23410,meizu,мобильные телефоны,m 6 t 3,m 2 m 578 m 3,0.520833,86844902


Fraction of unrecognized patterns:

In [21]:
print('Fraction of unrecognized patterns:', round(100-100*len(df_market_matched_clean)/len(df_market_matched)), '%')

Доля нераспознанных паттернов: 97 %


Records recognized:

In [22]:
print('Records recognized:', len(df_market_matched_clean))

Объем выборки с распознанными паттернами: 1135


Minimum sample for test:

In [23]:
sample_size = TTestIndPower().solve_power(effect, power=stat_power,nobs1=None, ratio=ratio, alpha=(1-confidence))
print('Minimum sample for test:', int(sample_size))

Minimum sample for test: 11


Definition of statistics:

In [24]:
(std_score, avg_score) = (df_market_matched_clean['score'].std(), df_market_matched_clean['score'].mean())
(min_score, max_score) = (df_market_matched_clean['score'].min(), df_market_matched_clean['score'].max())

Affinity score statistics:

In [25]:
print('Average:', round(avg_score, 2))
print('Standard:', round(std_score, 2))
print('Min:', round(min_score, 2))
print('Max:', round(max_score, 2))

Average: 0.66
Standard: 0.12
Min: 0.5
Max: 1.0


Stability diagram:

<img src='images/stability.png' width = 400 align='left'>

Write the table of fuzzy matches:

In [26]:
df_market_matched_clean.to_csv('data/market_matched_clean.csv', index=False)