In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle
import re
import mmh3

## Utils

In [26]:
class LSHSimilarGidNameFinderExperimental():
    '''
    Class desined for local experiments with LSH approach
    '''
    
    def __init__(self, df):
        self.df = df
        
    def get_candidates(self, gid, bucket_cnt=10, hash_table_size=4099):
        
        idx = self.df[self.df.gid==gid].index[0]
        
        buckets_dcts = generate_bucket_dcts(bucket_cnt=10, hash_table_size=4099)
        
        candidates = set()
        
        fp = self.df.iloc[idx, :].footprint

        for bucket in range(buckets_cnt):

            buckets_dct = buckets_dcts[bucket]

            slc = slice(bucket*bucket_size, (bucket + 1)*bucket_size)

            bucket_fp = tuple(fp[slc])

            candidates = candidates.add(buckets_dct[bucket_fp])
            
        return self.df.iloc[list(candidates), :]

    def get_real_neibougrs(self, gid, jaccard_distance=0.9):
        
        idx = self.df[self.df.gid==gid].index[0]
        sh = self.df.ngramms.values[idx]
        
        dist = self.df.ngramms.apply(lambda x: jaccard_sim(x, sh))
        
        return self.df[dist > jaccard_distance, :]
    
    
    def generate_bucket_dcts(self, bucket_cnt=10, hash_table_size=4099):
        
        footprint = np.vstack(self.df.footprint.values)
        
        footprint = footprint % hash_table_size

        buckets_dcts = []

        bucket_size = footprint.shape[1]//buckets_cnt
        
        for bucket in range(buckets_cnt):
    
            slc = slice(bucket*bucket_size, (bucket + 1)*bucket_size)

            df['bucket_footprint'] = df['footprint'].apply(lambda x: tuple(x[slc]))

            dct = df[['gid', 'bucket_footprint']].groupby('bucket_footprint')['gid'].agg(set).to_dict()

            buckets_dcts.append(dct)
            
        return buckets_dcts
        
    def jaccard_sim(s, k):
        
        num = len(set.intersection(set(s), set(k)))
        denom = len(set.union(set(s), set(k)))

        return num/denom


In [3]:
def parse_emojis(tokens):
    new_tokens = []
    for t in tokens:
        if not "emjxx" in t:
            new_tokens.append(t)
        else:
            if len(t) < 12:
                new_tokens.append("{}".format(chr(int(t[5:]))))
    return new_tokens

In [4]:
def tokenizer(text, special_words=False):
    text = text.lower()
    text = re.sub('\[\[[^\[\]]+\]\]', '', text)
    text = re.sub(r'ё', r'е', text)
    text = re.sub(r'\xa0', r' ', text)
    text = re.sub(r'&quot;', r' ', text)
    text = re.sub(r'&gt;', r' ', text)
    text = re.sub(r'&lt;', r' ', text)
    text = re.sub(r'<br>', r' tokenbr ', text)
    text = re.sub(r'https?://[^\s]+', r' ', text)
    text = re.sub(r'@[\w\d]+', r'uid', text)
    non_text_chars = '|'.join(map(chr, range(0, 10)));
    text = re.sub(non_text_chars, r' ', text)
    text = re.sub(r'\[[^\[\]]+\|([^\[\]]+)\]', r'\1', text)
    text = re.sub(r'emj33', r'', text)
    text = re.sub(r'([\U00010000-\U0010ffff])', r'\1 ', text)
    text = re.sub(r'([\U00002100-\U000027B0])', r'\1 ', text)
    text = re.sub(r'(&#(\d+);)', r' emjxx\2 ', text)
    text = text.strip()
    tokens = text.split(' ')
    tokens = parse_emojis(tokens)
    res = " ".join(tokens)
    return res

In [5]:
ngramm_size = 3

def text_to_ngramms(s, ngramm_size=ngramm_size):
    
    if s == '':
        return []
    
    ngramms = [s[i:i+ngramm_size] for i in range(max(1, len(s) - ngramm_size + 1))]
    
    return ngramms

In [6]:
def ngramms_to_hashes(ngramms):

    hash_values = [hash(ngramm) for ngramm in ngramms]
    
    return hash_values

## Load data

In [7]:
with open("dump_groups_190404.txt", 'r', encoding="utf-8") as f:
    lines = []
    for i, line in enumerate(f):
        try:
            lines.append(eval(line.strip()))
        except Exception:
            break
            

In [8]:
df = pd.DataFrame(lines)

In [9]:
df['ngramms'] = df['name'].astype(str).apply(text_to_ngramms)

In [10]:
df['hashes'] = df['ngramms'].apply(ngramms_to_hashes)

In [11]:
df = df[pd.notnull(df['name'])]

In [32]:
df[['name']].head(20)

Unnamed: 0,name
0,ААААА
1,DKPHOTO
2,&quot;ЭКСПЕРИМЕНТ&quot;
4,Сəукеле
5,Душа анимешника
6,_АК_
7,Теплая постель
8,Minecraft сервера [Реклама своего Сервера]
9,репостики
10,Г


## Process Data

In [12]:
def min_hash(hash_lst, seed=0):
    return min(mmh3.hash(str(item), seed=seed) for item in hash_lst)


In [13]:
for i in range(100):
    
    outputCol="footprint_{}".format(i)
    df[outputCol] = df['hashes'].apply(lambda x: min_hash(x, i))

In [15]:
df['footprint'] = df[["footprint_{}".format(i) for i in range(100)]].apply(lambda x: list(x), axis=1)

In [16]:
df[["footprint"]]

Unnamed: 0,footprint
0,"[-133405918, 1365343571, -19228181, -986427402..."
1,"[-2075049063, -1939387561, -636087726, -125559..."
2,"[-2039305522, -2045440760, -1659674634, -10728..."
4,"[-1207828010, -2063934302, -2055290958, -16232..."
5,"[-1605214427, -2045937031, -1388313928, -18860..."
6,"[-1028138746, -1811144147, -1430974461, -16030..."
7,"[-2010720578, -1533557505, -1984785682, -19604..."
8,"[-2120482215, -2072121601, -1840711445, -19916..."
9,"[-1879490965, -1343120345, -1570830965, -19604..."
10,"[-1011631624, -530094282, -1278676497, -124999..."


In [27]:
pf = LSHSimilarGidNameFinderExperimental(df[['gid', 'name', 'ngramms', 'footprint']])

In [28]:
pf

<__main__.LSHSimilarGidNameFinderExperimental at 0x14f99fe10>