In [18]:
from itertools import combinations
import numpy as np
import random
import binascii
from typing import Union, List

MERSENNE_PRIME = (1 << 61) - 1
MAX_HASH = (1 << 32) - 1
HASH_RANGE = 1 << 32

def shingle_word(text: str, n_gram: int = 15, char_level: bool = False) -> List[str]:
    """
    example
    -------
    >>> shingle_word("hello world from ducky", n_gram=2)
    ['hello_world', 'world_from', 'from_ducky']

    >>> shingle_word("hello world from ducky", n_gram=2, char_level=True)
    ['h_e', 'e_l', 'l_l', 'l_o', 'o_w', 'w_o', 'o_r', 'r_l', 'l_d', 'd_f', 'f_r', 'r_o', 'o_m', 'm_d', 'd_u', 'u_c', 'c_k', 'k_y']
    """
    res = []
    text_words = text.split() if not char_level else text

    for i in range(len(text_words)):
        shingle = text_words[i : i + n_gram]

        if len(shingle) == n_gram:
            res.append("_".join(shingle).encode("utf-8"))

    return res

def generate_minhash(shingles: List, num_perm: int = 64, seed: int = 1) -> np.array:
    def hashfunc(b: bytes) -> bytes:
        return binascii.crc32(b) & MAX_HASH

    hashvalues = np.ones(num_perm, dtype=np.uint64) * MAX_HASH

    generator = np.random.RandomState(seed)
    permutations = np.array(
        [
            (
                generator.randint(1, MERSENNE_PRIME, dtype=np.uint64),
                generator.randint(0, MERSENNE_PRIME, dtype=np.uint64),
            )
            for _ in range(num_perm)
        ],
        dtype=np.uint64,
    ).T

    for shingle in shingles:
        hv = hashfunc(shingle)
        a, b = permutations
        phv = np.bitwise_and((a * hv + b) % MERSENNE_PRIME, np.uint64(MAX_HASH))
        hashvalues = np.minimum(phv, hashvalues)

    return hashvalues

def expand_instances_by_minhash(
    data, expand_size: int, n_gram: int, seed: int = 1, char_level: bool = False
):
    shingles = shingle_word(data["text"], n_gram=n_gram, char_level=char_level)
    minhashes = generate_minhash(shingles, num_perm=expand_size, seed=seed)

    for mh in minhashes.tolist():
        return (str(mh), [dict(**data, shingles=shingles, hashvalues=minhashes)])


def jaccard_by_hashvalues(src_hashvalues, tgt_hashvalues) -> float:
    if len(src_hashvalues) != len(tgt_hashvalues):
        raise ValueError()

    return float(np.count_nonzero(src_hashvalues == tgt_hashvalues)) / float(
        len(src_hashvalues))


def explore_dedup_instance(hash_groups, threshold: float = 0.8):
    if len(hash_groups) <= 1:
        return

    group_represent_text = hash_groups[0][
        "text"
    ]  # not to remove all text instances in group.
    pairs = combinations(hash_groups, 2)

    for d_1, d_2 in pairs:
        sim_score = jaccard_by_hashvalues(d_1["hashvalues"], d_2["hashvalues"])
        if sim_score >= threshold:
            dedup_text = [d_1["text"], d_2["text"]]
            if group_represent_text in dedup_text:
                dedup_text[0] if dedup_text[
                    0
                ] != group_represent_text else dedup_text[1]
            else:
                random.choice(dedup_text)

In [64]:
import pandas as pd
data = [
        {"text": "hello wolrd! Welcome to dataverse."},
        {"text": "hello wolrd! Welcome to dataverrrse."},
        {"text": "a totally different sentence"},
    ]
data = pd.DataFrame(data)

In [65]:
df_expanded = data["text"].apply(lambda x : expand_instances_by_minhash(x, 64, 2, False))

TypeError: string indices must be integers, not 'str'

In [61]:
expand_minhash = expand_instances_by_minhash(data.to_dict(orient="series"), 64, 2, char_level=True)

In [63]:
expand_minhash

('423976591',
 [{'text': 0      hello wolrd! Welcome to dataverse.
   1    hello wolrd! Welcome to dataverrrse.
   2            a totally different sentence
   Name: text, dtype: object,
   'shingles': [b'hello wolrd! Welcome to dataverse._hello wolrd! Welcome to dataverrrse.',
    b'hello wolrd! Welcome to dataverrrse._a totally different sentence'],
   'hashvalues': array([ 423976591, 1179597160, 2173486595, 2656929283, 1905198105,
           466826231, 2483130910, 1749155483, 1906894582,  574160600,
           291037418, 1661721150,  471728005, 2009185705, 2294101579,
           492540358, 1166238982, 1402358624,  773040593,  831153555,
          1510210255, 2078701579, 2744031734, 1490008704, 3654404923,
           358977674,  940225680,  191411290,  358731490,   97368304,
           209788358, 1994545281,  207854605, 1308159746, 3207868941,
          2745819470, 2539655204, 2657916589, 1785459166,   97784208,
          1344271224, 2487440659, 3090907716,  815100042,  781975625,
  

In [41]:
shingles, hashvalues = [], []
for row in data.iterrows():
    v = row[1]
    _, tmp = expand_instances_by_minhash(v, 64,2)
    # print(expand_instances_by_minhash(v, 64,2))
    shingles.append(tmp[0]["shingles"])
    hashvalues.append(tmp[0]["hashvalues"])
data["shingles"] = shingles
data["hashvalues"] = hashvalues

In [49]:
data.to_dict(orient="records")

[{'text': 'hello wolrd! Welcome to dataverse.',
  'shingles': [b'hello_wolrd!',
   b'wolrd!_Welcome',
   b'Welcome_to',
   b'to_dataverse.'],
  'hashvalues': array([1125711376, 1585414850,  113619648, 1095903752, 1007095010,
          349694736,   61628889, 1561244383, 1107734654,  582302568,
          962267556,   86318050,  684024235,  500219653, 2378927720,
          193711265,  471341851, 2201276130,  368868197,  532525545,
         3102877401,   55324660, 1447957220, 2403186128,  180222207,
           75847955,  332420352, 1096705033,  865950441,   77055466,
          379314978,  319043347,   50939237,  494688010, 2029015502,
          123625184, 1933699250,  323107670,  109296489,  106212960,
          220407054,  972873360,  521857045,    4794227, 1421693921,
          774197356, 1173623403, 2020963681,  756702417,   79241917,
          334425944, 2816280587,   53941169,  557843763,    4611238,
         1849989506,  638516427,   80694179, 1884461840, 1485031525,
           13681

In [51]:
result = explore_dedup_instance(data.to_dict(orient="records"))

In [52]:
result

In [15]:
print(explore_dedup_instance(data["text"]))

TypeError: string indices must be integers, not 'str'

#

In [27]:
import sys

from dataverse.etl import ETLPipeline
from dataverse.etl import register_etl
from dataverse.etl import ETLRegistry


In [28]:
ETLRegistry()

Total [ 45 ]
data_ingestion [ 17 ]
deduplication [ 4 ]
cleaning [ 14 ]
pii [ 2 ]
quality [ 1 ]
data_load [ 4 ]
utils [ 3 ]

In [10]:
@register_etl
def data_ingestion___test___generate_accent(spark, *args, **kwargs):
    data = [("café",), ("résumé",), ("piñata",)]
    df = spark.createDataFrame(data, ["text"])

    return df

In [7]:
etl_pipeline = ETLPipeline()
etl_pipeline

  from .autonotebook import tqdm as notebook_tqdm


An error occurred (InvalidClientTokenId) when calling the GetCallerIdentity operation: The security token included in the request is invalid


<dataverse.etl.pipeline.ETLPipeline at 0x7f36a28bd150>

In [13]:
spark, data = etl_pipeline.sample(sample_etl="data_ingestion___test___generate_accent")

[ No AWS Credentials Found] - Failed to set spark conf for S3


In [14]:
spark

In [20]:
from omegaconf import OmegaConf

# load from dict
ETL_config = OmegaConf.create({
    'spark': {
        'appname': 'ETL',
        'driver': {'memory': '16g'},
    },
    'etl': [
        {
            'name': 'data_ingestion___test___generate_accent',
        },
        {'name': 'cleaning___accent___remove'}
    ]
})


In [21]:
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: ETL
  driver:
    memory: 16g
etl:
- name: data_ingestion___test___generate_accent
- name: cleaning___accent___remove



In [22]:
spark, data = etl_pipeline.run(ETL_config)

[ No AWS Credentials Found] - Failed to set spark conf for S3


24/01/17 05:54:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [23]:
spark, data

(<pyspark.sql.session.SparkSession at 0x7f35f03197d0>,
 PythonRDD[21] at RDD at PythonRDD.scala:53)

In [26]:
data.take(2)

[{'text': 'cafe'}, {'text': 'resume'}]

In [29]:
data

PythonRDD[21] at RDD at PythonRDD.scala:53

In [19]:
from dataverse.etl.cleaning.accent import cleaning__accent___remove

KeyError: 'The key [ cleaning___accent___remove ] is already registered'