In [50]:
import collections
import logging
import gc
import inspect
import numpy as np
import pandas as pd
from types import FunctionType
from typing import Iterable

import embeddinglib as el
from embeddinglib import mergingmethods


# Toy embedding table used by the examples below: every known token maps to
# a length-5 integer vector of consecutive values.
# Note that 'word', which occurs in the sample sentences tokenized further
# down, has no entry here.
model = dict(
    qwerty=np.arange(1, 6),
    uiop=np.arange(2, 7),
    asdf=np.arange(3, 8),
    ghjkl=np.arange(4, 9),
    three=np.arange(5, 10),
    sentence=np.arange(7, 12),
)

# Tokenize three short sample sentences with the project tokenizer.
# With expand=True the recorded output below is a table with one row per
# sentence and one column per token position (shorter rows left empty).
txt = el.stringprocessing.tokenize(
    ['qwerty uiop', 'asdf ghjkl', 'three word sentence'], expand=True
)

# Show the tokenized table as the cell output.
txt

Unnamed: 0,0,1,2
0,qwerty,uiop,
1,asdf,ghjkl,
2,three,word,sentence


In [64]:
def sentenceEmbedding_fast(
        documentCol: Iterable[Iterable[str]],
        model,
        mergingFunction: FunctionType=mergingmethods.sumMerge,
        mergingFunctionKwargs: dict=None,
        verbose=True
        )-> "pd.Series | pd.DataFrame":
    """Look up a vector for every token and merge the vectors row by row.

    Each column of ``documentCol`` is passed through ``Series.map(model)``,
    then ``mergingFunction`` is applied to every row of the mapped table via
    ``DataFrame.aggregate(..., axis=1, result_type='expand')``.

    Parameters
    ----------
    documentCol
        Table of tokens. Although annotated as a generic iterable, the body
        calls ``.apply`` on it and ``.map`` on each column, so in practice a
        ``pandas.DataFrame`` (one row per document) is required.
    model
        Token-to-vector lookup, handed unchanged to ``Series.map``. When it
        is a plain ``dict``, tokens without an entry map to NaN.
    mergingFunction
        Callable applied to each row of mapped values.
    mergingFunctionKwargs
        Extra keyword arguments forwarded with the aggregation call.
        ``None`` (the default) means no extra arguments.
    verbose
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    Whatever the pandas aggregation returns for ``mergingFunction``
    (a ``Series`` or a ``DataFrame``).

    NOTE(review): the sample call recorded in this notebook returned a
    Series of integer zeros rather than merged vectors - verify how
    ``mergingFunction`` handles rows that contain NaN / object-dtype values.
    NOTE(review): whether ``result_type='expand'`` is consumed by pandas or
    reaches ``mergingFunction`` may depend on the pandas version - confirm.
    """
    # Use a fresh dict per call instead of a single dict object shared by
    # every invocation through the function's default value.
    if mergingFunctionKwargs is None:
        mergingFunctionKwargs = {}

    # Column-wise token -> vector lookup.
    vectors = documentCol.apply(lambda col: col.map(model))

    # Row-wise merge of the looked-up values.
    return vectors.aggregate(
        mergingFunction,
        axis=1, result_type='expand',
        **mergingFunctionKwargs
    )


# Run the embedding helper on the tokenized sample table with the toy model,
# using the default merging function; the result is the cell output.
sentenceEmbedding_fast(documentCol=txt, model=model)

0    0
1    0
2    0
dtype: int64

In [59]:
# Count token occurrences separately for each token-position column of txt.
txt.apply(lambda column: column.value_counts())

Unnamed: 0,0,1,2
asdf,1.0,,
ghjkl,,1.0,
qwerty,1.0,,
sentence,,,1.0
three,1.0,,
uiop,,1.0,
word,,1.0,


In [61]:
# Build a frame from token lists of unequal length (2 to 5 tokens per row).
# The recorded output below shows the shorter rows padded with empty cells.
pd.DataFrame(
    data=[
        # raw tokens, including punctuation, digits and quote characters
        ["157681", "miles.", "only", "$12,849"],
        ["used", "2006", "ford", "f-150", 'repeatedword'],
        ["\"city,", "state\"", "format", "(quotes)", 'repeatedword'],
        ["different", "'quotes'"],
        # variants of the rows above with '#' in place of digits and
        # without the surrounding punctuation
        ["######", "miles", "only", "#####"],
        ["used", "####", "ford", "f-###"],
        ["city", "state", "format", "quotes"],
        ["different", "quotes"],
    ]
)

Unnamed: 0,0,1,2,3,4
0,157681,miles.,only,"$12,849",
1,used,2006,ford,f-150,repeatedword
2,"""city,","state""",format,(quotes),repeatedword
3,different,'quotes',,,
4,######,miles,only,#####,
5,used,####,ford,f-###,
6,city,state,format,quotes,
7,different,quotes,,,
