# Problema Base: comparar cadenas

In [None]:
ord("A")

65

In [None]:
ord("B")

66

In [None]:
"A" == "B"

False

In [None]:
"A" != "B"

True

Tengo implementada la forma de chequear igualdad y desigualdad estricta, pero no tengo implementado nada flexible

In [None]:
"A" ?? "a"

SyntaxError: invalid syntax (<ipython-input-5-64d5e673526e>, line 1)

La primera vez que nos topamos con eso probablemente fue algo asi

In [None]:
# Greet repeatedly until the user answers with one of the accepted "no" spellings.
while True:
    print("Hola")
    entrada = input("Desea continuar? S/N")
    # Exact-match list: any spelling not listed here (e.g. "nO") keeps the loop running.
    if entrada in ["N","n","no","No","NO"]:
        break

Pero para una cadena larga es mas complicado subsanar de esa forma.

In [None]:
"La casa es color verde" ?? "La casa tiene color verde"

SyntaxError: invalid syntax (<ipython-input-6-a177d1ae5b18>, line 1)

# 1 - Tokenize

In [None]:
test_string = """
This is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards
"""

La forma mas sencilla de tokenizar es separar por palabras

In [None]:
words = test_string.split()
words

['This',
 'is',
 'the',
 'mostly',
 'UNEDITED',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards']

In [None]:
# Non-overlapping character windows: 3-character slices taken every 3 positions.
window = [ test_string[i:i+3] for i in range(0,len(test_string),3) ]
window

['\nTh',
 'is ',
 'is ',
 'the',
 ' mo',
 'stl',
 'y U',
 'NED',
 'ITE',
 'D v',
 'ers',
 'ion',
 ' of',
 ' sc',
 'amm',
 'ers',
 ' wa',
 'sti',
 'ng ',
 'cou',
 'ntl',
 'ess',
 ' ho',
 'urs',
 ' de',
 'man',
 'din',
 'g m',
 'ore',
 ' mo',
 'ney',
 ' an',
 'd g',
 'ift',
 ' ca',
 'rds',
 '\n']

In [None]:
# Sliding windows: 6-character slices starting every 3 positions, so each slice
# overlaps the next one by 3 characters (the last slices are shorter).
slide = [ test_string[i:i+6] for i in range(0,len(test_string),3) ]
slide

['\nThis ',
 'is is ',
 'is the',
 'the mo',
 ' mostl',
 'stly U',
 'y UNED',
 'NEDITE',
 'ITED v',
 'D vers',
 'ersion',
 'ion of',
 ' of sc',
 ' scamm',
 'ammers',
 'ers wa',
 ' wasti',
 'sting ',
 'ng cou',
 'countl',
 'ntless',
 'ess ho',
 ' hours',
 'urs de',
 ' deman',
 'mandin',
 'ding m',
 'g more',
 'ore mo',
 ' money',
 'ney an',
 ' and g',
 'd gift',
 'ift ca',
 ' cards',
 'rds\n',
 '\n']

Pero tecnicamente todas esas cosas que no son palabras son formas validas (capaz no utiles) de tokenizar una cadena.

Podemos ver un ejemplo funcional de un tokenizador que no separa estrictamente por palabras, y es el que se usa para ChatGPT

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [None]:
import tiktoken

In [None]:
enc = tiktoken.encoding_for_model("gpt-4o")
enc.encode(test_string)

[198,
 2500,
 382,
 290,
 15646,
 9014,
 30370,
 2252,
 3926,
 328,
 134237,
 70415,
 39321,
 5123,
 43799,
 945,
 3905,
 326,
 10317,
 10541,
 198]

In [None]:
[enc.decode([x]) for x in enc.encode(test_string)]

['\n',
 'This',
 ' is',
 ' the',
 ' mostly',
 ' UN',
 'EDIT',
 'ED',
 ' version',
 ' of',
 ' scammers',
 ' wasting',
 ' countless',
 ' hours',
 ' demanding',
 ' more',
 ' money',
 ' and',
 ' gift',
 ' cards',
 '\n']

In [None]:
[enc.decode([x]) for x in enc.encode("El carton corrugado este es horrible")]

['El', ' carton', ' corr', 'ug', 'ado', ' este', ' es', ' horrible']

# 2 - Similaridad

## Representacion de un documento: Bag Of Words

In [None]:
# Bag of words: the set of distinct tokens of the document.
# Sorting first is unnecessary because a set does not keep the order of its elements.
bow = set(words)
bow

{'This',
 'UNEDITED',
 'and',
 'cards',
 'countless',
 'demanding',
 'gift',
 'hours',
 'is',
 'money',
 'more',
 'mostly',
 'of',
 'scammers',
 'the',
 'version',
 'wasting'}

In [None]:
second_doc = set("A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for common English text".split())
second_doc

{'A',
 'English',
 'characters',
 'common',
 'corresponds',
 'for',
 'generally',
 'helpful',
 'is',
 'of',
 'one',
 'rule',
 'text',
 'that',
 'thumb',
 'to',
 'token',
 '~4'}

Podemos definir una similaridad S tal que S(doc_1,doc_2) = 0 si son distintos, 1 si son iguales o valores intermedios como medida de que tan parecidos o no son

Podemos implementarlo de forma rustica con nuestro BoW (usando los metodos de set) como la cantidad de palabras que comparten sobre la cantidad total de palabras

In [None]:
compartidas = bow.intersection(second_doc)
compartidas

{'is', 'of'}

In [None]:
totales = bow.union(second_doc)
totales

{'A',
 'English',
 'This',
 'UNEDITED',
 'and',
 'cards',
 'characters',
 'common',
 'corresponds',
 'countless',
 'demanding',
 'for',
 'generally',
 'gift',
 'helpful',
 'hours',
 'is',
 'money',
 'more',
 'mostly',
 'of',
 'one',
 'rule',
 'scammers',
 'text',
 'that',
 'the',
 'thumb',
 'to',
 'token',
 'version',
 'wasting',
 '~4'}

In [None]:
similaridad = len(compartidas)  / len(totales)
similaridad

0.06060606060606061

In [None]:
def similaridad_bow(doc_1,doc_2):
    """Return the bag-of-words (Jaccard) similarity between two documents.

    Each document is split on whitespace and reduced to its set of distinct
    words. The score is the number of shared words divided by the number of
    words in the union of both sets.

    Args:
        doc_1: First document, as a string.
        doc_2: Second document, as a string.

    Returns:
        A float between 0.0 (no word in common) and 1.0 (same set of words).
        Two documents without any word are considered identical (1.0).
    """
    bow_1 = set(doc_1.split())
    bow_2 = set(doc_2.split())

    total = bow_1 | bow_2
    if not total:
        # Neither document has a token: the ratio would be 0 / 0
        # (ZeroDivisionError), and two empty documents are equal.
        return 1.0

    shared = bow_1 & bow_2
    return len(shared) / len(total)

In [None]:
similaridad_bow(test_string,test_string)

1.0

In [None]:
similaridad_bow(test_string,"")

0.0

In [None]:
similaridad_bow(test_string,"UNEDITED")

0.058823529411764705

- Soluciona el problema posicional
- Pierde informacion
    - Y sin hacer nada extra, no funciona del todo bien

In [None]:
similaridad_bow(test_string,test_string)

1.0

In [None]:
similaridad_bow(test_string,test_string*10)

1.0

In [None]:
similaridad_bow("El cazador mató al león","El león mató al cazador")

1.0

In [None]:
similaridad_bow(test_string.lower(),test_string.upper())

0.0

# Preprocesamiento

In [None]:
test_string

'\nThis is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards\n'

In [None]:
test_string.lower()

'\nthis is the mostly unedited version of scammers wasting countless hours demanding more money and gift cards\n'

In [None]:
def preprocess(my_string):
    """Normalize a document for comparison by lowercasing it."""
    lowered = my_string.lower()
    return lowered

In [None]:
def similaridad_con_preprocess(doc_1,doc_2):
    """Return the bag-of-words similarity of two documents after preprocessing.

    Both documents go through ``preprocess`` before being compared with
    ``similaridad_bow``.
    """
    normalized = [preprocess(doc) for doc in (doc_1, doc_2)]
    return similaridad_bow(*normalized)

In [None]:
similaridad_con_preprocess(test_string.lower(),test_string.upper())

1.0

OJO: aplicar cosas sin pensar nos puede dar problemas

In [None]:
similaridad_bow("Kit Boga","Un kit para pesca de boga en rio")

0.0

In [None]:
similaridad_con_preprocess("Kit Boga","Un kit para pesca de boga en rio")

0.25

# Limpieza

In [None]:
test_string_2="""
This is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards. Whenever they think they are getting close, there's a problem, but that doesn't stop them, and the rage builds.

STOP online scams before they start. Check out https://www.seraphsecure.com/kitboga and download a 14-day FREE trial of Seraph Secure for you and your family/friends. #seraphsecure
"""

In [None]:
preprocess(test_string_2).split()

['this',
 'is',
 'the',
 'mostly',
 'unedited',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards.',
 'whenever',
 'they',
 'think',
 'they',
 'are',
 'getting',
 'close,',
 "there's",
 'a',
 'problem,',
 'but',
 'that',
 "doesn't",
 'stop',
 'them,',
 'and',
 'the',
 'rage',
 'builds.',
 'stop',
 'online',
 'scams',
 'before',
 'they',
 'start.',
 'check',
 'out',
 'https://www.seraphsecure.com/kitboga',
 'and',
 'download',
 'a',
 '14-day',
 'free',
 'trial',
 'of',
 'seraph',
 'secure',
 'for',
 'you',
 'and',
 'your',
 'family/friends.',
 '#seraphsecure']

In [None]:
import re
regex = r"[^\w\d]+"

clean = re.sub(regex," ",test_string_2)
clean

' This is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards Whenever they think they are getting close there s a problem but that doesn t stop them and the rage builds STOP online scams before they start Check out https www seraphsecure com kitboga and download a 14 day FREE trial of Seraph Secure for you and your family friends seraphsecure '

In [None]:
clean.split()

['This',
 'is',
 'the',
 'mostly',
 'UNEDITED',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards',
 'Whenever',
 'they',
 'think',
 'they',
 'are',
 'getting',
 'close',
 'there',
 's',
 'a',
 'problem',
 'but',
 'that',
 'doesn',
 't',
 'stop',
 'them',
 'and',
 'the',
 'rage',
 'builds',
 'STOP',
 'online',
 'scams',
 'before',
 'they',
 'start',
 'Check',
 'out',
 'https',
 'www',
 'seraphsecure',
 'com',
 'kitboga',
 'and',
 'download',
 'a',
 '14',
 'day',
 'FREE',
 'trial',
 'of',
 'Seraph',
 'Secure',
 'for',
 'you',
 'and',
 'your',
 'family',
 'friends',
 'seraphsecure']

Me puede no interesar el valor de lo que estoy limpiando, entonces lo reemplazo por un token distintivo que no pueda confundirse con otro:

In [None]:
URL_REGEX = r"(https?://)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"

clean = re.sub(URL_REGEX,"UURRLL",test_string_2)
clean

"\nThis is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards. Whenever they think they are getting close, there's a problem, but that doesn't stop them, and the rage builds.\n\nSTOP online scams before they start. Check out UURRLL and download a 14-day FREE trial of Seraph Secure for you and your family/friends. #seraphsecure \n"

Y si nos importa, podemos hacer algo similar, marcando por ej. en este caso la palabra como un hashtag, agregando un valor al inicio.

In [None]:
HASHTAG_REGEX = r"#([\w\d]+)"

clean = re.sub(HASHTAG_REGEX,r"HASHTAG\1",test_string_2)
clean

"\nThis is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards. Whenever they think they are getting close, there's a problem, but that doesn't stop them, and the rage builds.\n\nSTOP online scams before they start. Check out https://www.seraphsecure.com/kitboga and download a 14-day FREE trial of Seraph Secure for you and your family/friends. HASHTAGseraphsecure \n"

In [None]:
# Patterns applied by cleanup(), in this order.
URL_REGEX = r"(https?://)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
HASHTAG_REGEX = r"#([\w\d]+)"
SYMBOLS_REGEX = r"[^a-zA-Z]+"


def cleanup(my_string):
    """Replace URLs, mark hashtags and strip everything that is not an ASCII letter.

    The substitutions are chained, each one working on the result of the
    previous one:

    1. every URL match becomes the placeholder token ``UURRLL``;
    2. ``#word`` becomes ``HASHTAGword``;
    3. every run of characters outside ``a-z`` / ``A-Z`` (digits, punctuation,
       whitespace, accented letters) becomes a single space.

    Args:
        my_string: Text to clean.

    Returns:
        The cleaned text.
    """
    substitutions = (
        (URL_REGEX, "UURRLL"),
        (HASHTAG_REGEX, r"HASHTAG\1"),
        (SYMBOLS_REGEX, " "),
    )

    result = my_string
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

cleanup(test_string_2)

' This is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards Whenever they think they are getting close there s a problem but that doesn t stop them and the rage builds STOP online scams before they start Check out UURRLL and download a day FREE trial of Seraph Secure for you and your family friends HASHTAGseraphsecure '

In [None]:
preprocess(cleanup(test_string_2)).split()

['this',
 'is',
 'the',
 'mostly',
 'unedited',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards',
 'whenever',
 'they',
 'think',
 'they',
 'are',
 'getting',
 'close',
 'there',
 's',
 'a',
 'problem',
 'but',
 'that',
 'doesn',
 't',
 'stop',
 'them',
 'and',
 'the',
 'rage',
 'builds',
 'stop',
 'online',
 'scams',
 'before',
 'they',
 'start',
 'check',
 'out',
 'uurrll',
 'and',
 'download',
 'a',
 'day',
 'free',
 'trial',
 'of',
 'seraph',
 'secure',
 'for',
 'you',
 'and',
 'your',
 'family',
 'friends',
 'hashtagseraphsecure']

In [None]:
similaridad_con_preprocess(cleanup(test_string),cleanup(test_string_2))

0.32075471698113206

In [None]:
len(set(cleanup(test_string_2).split()))

55

In [None]:
!pip install youtube-transcript-api pytube langchain_community --quiet

In [None]:
from langchain_community.document_loaders import YoutubeLoader

steve = YoutubeLoader.from_youtube_url(
    "https://youtu.be/sRMMwpDTs5k",
    add_video_info=True,
    language=["en", "id"]
).load()

steve

[Document(metadata={'source': 'sRMMwpDTs5k', 'title': "The Angriest Scammer I've Ever Called: Steve", 'description': 'Unknown', 'view_count': 19642948, 'thumbnail_url': 'https://i.ytimg.com/vi/sRMMwpDTs5k/hq720.jpg?v=660ea981', 'publish_date': '2020-08-16 00:00:00', 'length': 5086, 'author': 'Kitboga'}, page_content="[Music] do you want your money or do you don't want your money then you need to do as i say you are not my teacher no no no no no no no [Applause] i've been calling scammers for over three years now and although i've had some pretty exciting and interesting learning experiences and just all around crazy moments this has to be the most extreme raw anger that i have ever witnessed the entire call took place over a couple days 10 hours worth of phone calls and ultimately ended in a pretty exciting display of anger it started off like a typical refund scam where they pose as a company like microsoft offering to give you money back for something you purchased they lock your com

In [None]:
steve = steve[0].page_content

In [None]:
len(set(cleanup(steve).split()))

963

In [None]:
two_lovers_first_kiss = YoutubeLoader.from_youtube_url(
    "https://youtu.be/-dJWB9FoMCU",
    add_video_info=True,
    language=["en", "id"]
).load()[0].page_content

len(set(cleanup(two_lovers_first_kiss).split()))

560

In [None]:
similaridad_con_preprocess(cleanup(steve),cleanup(two_lovers_first_kiss))

0.22867853795688847

In [None]:
lolip = set(cleanup(two_lovers_first_kiss).split())
kit = set(cleanup(steve).split())

len(lolip.union(kit))

1262

In [None]:
crow_pro = YoutubeLoader.from_youtube_url(
    "https://youtu.be/wSfDCLQhbns",
    add_video_info=True,
    language=["en", "id"]
).load()[0].page_content

len(set(cleanup(crow_pro).split()))


1176

In [None]:
kit_2 = set(cleanup(crow_pro).split())

len(lolip.union(kit).union(kit_2))

1872

In [None]:
similaridad_con_preprocess(cleanup(steve),cleanup(crow_pro))

0.3380703066566941

# Preprocesamiento: generalizacion

In [None]:
def preprocess(my_string):
    """Clean a document with ``cleanup`` and return the result lowercased."""
    return cleanup(my_string).lower()

In [None]:
def similaridad_con_preprocess(doc_1,doc_2):
    """Compare two documents with ``similaridad_bow`` after running ``preprocess`` on each."""
    return similaridad_bow(preprocess(doc_1), preprocess(doc_2))

In [None]:
import nltk

In [None]:
nltk.word_tokenize(test_string)

['This',
 'is',
 'the',
 'mostly',
 'UNEDITED',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards']

In [None]:
nltk.download('punkt')
nltk.word_tokenize(test_string)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['This',
 'is',
 'the',
 'mostly',
 'UNEDITED',
 'version',
 'of',
 'scammers',
 'wasting',
 'countless',
 'hours',
 'demanding',
 'more',
 'money',
 'and',
 'gift',
 'cards']

In [None]:
stemmer = nltk.stem.snowball.EnglishStemmer()

In [None]:
stemmer.stem("cars")

'car'

In [None]:
stemmer.stem("running running")

'running run'

In [None]:
[ stemmer.stem(w) for w in preprocess("running running").split() ]

['run', 'run']

In [None]:
set([ stemmer.stem(w) for w in preprocess("running running").split() ])

{'run'}

In [None]:
set([ stemmer.stem(w) for w in preprocess(test_string).split() ])

{'and',
 'card',
 'countless',
 'demand',
 'gift',
 'hour',
 'is',
 'money',
 'more',
 'most',
 'of',
 'scammer',
 'the',
 'this',
 'unedit',
 'version',
 'wast'}

In [None]:
len(set([ stemmer.stem(w) for w in preprocess(test_string).split() ]))

17

In [None]:
len(set(preprocess(test_string).split()))

17

In [None]:
len(set([ stemmer.stem(w) for w in preprocess(steve).split() ]))

823

In [None]:
len(set(preprocess(steve).split()))

963

In [None]:
len(lolip.union(kit).union(kit_2))

1872

In [None]:
len(set( stemmer.stem(w) for w in lolip.union(kit).union(kit_2) ))

1500

In [None]:
def similaridad_con_preprocess(doc_1,doc_2):
    """Return the bag-of-words similarity of two documents after preprocessing and stemming.

    Each document is run through ``preprocess``, every whitespace-separated
    word is replaced by ``stemmer.stem(word)``, and the rebuilt strings are
    compared with ``similaridad_bow``.
    """
    clean_1, clean_2 = preprocess(doc_1), preprocess(doc_2)

    stems_1 = [stemmer.stem(word) for word in clean_1.split()]
    stems_2 = [stemmer.stem(word) for word in clean_2.split()]

    return similaridad_bow(" ".join(stems_1), " ".join(stems_2))

In [None]:
similaridad_con_preprocess(steve,two_lovers_first_kiss)

0.24485981308411214

In [None]:
similaridad_con_preprocess(steve,crow_pro)

0.37059724349157736

In [None]:
similaridad_con_preprocess(steve,steve)

1.0

In [None]:
similaridad_con_preprocess(steve,"")

0.0

In [None]:
similaridad_con_preprocess("red car","red cars")

1.0

In [None]:
stemmer.stem("Darling")

'darl'

In [None]:
nltk.tag.pos_tag(test_string.split())

[('This', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('mostly', 'RB'),
 ('UNEDITED', 'NNP'),
 ('version', 'NN'),
 ('of', 'IN'),
 ('scammers', 'NNS'),
 ('wasting', 'VBG'),
 ('countless', 'NN'),
 ('hours', 'NNS'),
 ('demanding', 'VBG'),
 ('more', 'JJR'),
 ('money', 'NN'),
 ('and', 'CC'),
 ('gift', 'NN'),
 ('cards', 'NNS')]

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

nltk.tag.pos_tag(test_string.split(),tagset="universal")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[('This', 'DET'),
 ('is', 'VERB'),
 ('the', 'DET'),
 ('mostly', 'ADV'),
 ('UNEDITED', 'NOUN'),
 ('version', 'NOUN'),
 ('of', 'ADP'),
 ('scammers', 'NOUN'),
 ('wasting', 'VERB'),
 ('countless', 'NOUN'),
 ('hours', 'NOUN'),
 ('demanding', 'VERB'),
 ('more', 'ADJ'),
 ('money', 'NOUN'),
 ('and', 'CONJ'),
 ('gift', 'NOUN'),
 ('cards', 'NOUN')]

Definiciones en:

https://universaldependencies.org/u/pos/

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize("cars")

'car'

In [None]:
nltk.download('wordnet')
lemmatizer.lemmatize("cars")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'car'

In [None]:
lemmatizer.lemmatize("darling")

'darling'

In [None]:
lemmatizer.lemmatize("running")

'running'

In [None]:
lemmatizer.lemmatize("running",pos="v")

'run'

In [None]:
nltk.tag.pos_tag(test_string.split(),tagset="universal")

[('This', 'DET'),
 ('is', 'VERB'),
 ('the', 'DET'),
 ('mostly', 'ADV'),
 ('UNEDITED', 'NOUN'),
 ('version', 'NOUN'),
 ('of', 'ADP'),
 ('scammers', 'NOUN'),
 ('wasting', 'VERB'),
 ('countless', 'NOUN'),
 ('hours', 'NOUN'),
 ('demanding', 'VERB'),
 ('more', 'ADJ'),
 ('money', 'NOUN'),
 ('and', 'CONJ'),
 ('gift', 'NOUN'),
 ('cards', 'NOUN')]

*pos (str) – The Part Of Speech tag. Valid options are “n” for nouns, “v” for verbs, “a” for adjectives, “r” for adverbs and “s” for satellite adjectives.*

Nuestros marcadores no estan en ese formato, asi que armamos una tabla de traduccion.

In [None]:
# Translation table from the universal POS tags produced by
# nltk.tag.pos_tag(..., tagset="universal") to the one-letter `pos` codes
# accepted by WordNetLemmatizer.lemmatize ("n", "v", "a", "r").
translate_pos = {
    "NOUN":"n",
    "VERB":"v",
    "ADJ":"a",
    "ADV":"r",
}

In [None]:
tagged_string = [ (w,translate_pos.get(pos,"n")) for w,pos in nltk.tag.pos_tag(test_string.split(),tagset="universal")]
tagged_string

[('This', 'n'),
 ('is', 'v'),
 ('the', 'n'),
 ('mostly', 'r'),
 ('UNEDITED', 'n'),
 ('version', 'n'),
 ('of', 'n'),
 ('scammers', 'n'),
 ('wasting', 'v'),
 ('countless', 'n'),
 ('hours', 'n'),
 ('demanding', 'v'),
 ('more', 'a'),
 ('money', 'n'),
 ('and', 'n'),
 ('gift', 'n'),
 ('cards', 'n')]

In [None]:
test_string

'\nThis is the mostly UNEDITED version of scammers wasting countless hours demanding more money and gift cards\n'

In [None]:
" ".join(lemmatizer.lemmatize(w) for w,pos in tagged_string)

'This is the mostly UNEDITED version of scammer wasting countless hour demanding more money and gift card'

In [None]:
" ".join(lemmatizer.lemmatize(w,pos=pos) for w,pos in tagged_string)

'This be the mostly UNEDITED version of scammer waste countless hour demand more money and gift card'

In [None]:
def lematize_string(my_string):
    """Lemmatize every whitespace-separated word of a string.

    Words are POS-tagged with the universal tagset, each tag is translated to
    a WordNet ``pos`` code through ``translate_pos`` (falling back to ``"n"``
    for tags without an entry), and the lemmas are joined back with single
    spaces.
    """
    tagged = nltk.tag.pos_tag(my_string.split(),tagset="universal")

    lemmas = []
    for word, universal_tag in tagged:
        wordnet_pos = translate_pos.get(universal_tag,"n")
        lemmas.append(lemmatizer.lemmatize(word,pos=wordnet_pos))

    return " ".join(lemmas)

In [None]:
def similaridad_con_preprocess(doc_1,doc_2):
    """Return the bag-of-words similarity of two documents after preprocessing and lemmatizing.

    Both documents are preprocessed first, then both are lemmatized with
    ``lematize_string``, and the results are compared with ``similaridad_bow``.
    """
    preprocessed = (preprocess(doc_1), preprocess(doc_2))
    lemmatized = [lematize_string(doc) for doc in preprocessed]
    return similaridad_bow(*lemmatized)

In [None]:
similaridad_con_preprocess(steve,two_lovers_first_kiss)

0.22867853795688847

In [None]:
similaridad_con_preprocess(steve,crow_pro)

0.3380703066566941

In [None]:
len(lolip.union(kit).union(kit_2))

1872

In [None]:
lolip_bow = set(lematize_string(preprocess(two_lovers_first_kiss)).split())

lolip_bow

{'a',
 'able',
 'about',
 'absolutely',
 'accidentally',
 'across',
 'act',
 'actually',
 'admit',
 'again',
 'all',
 'along',
 'also',
 'always',
 'an',
 'and',
 'android',
 'anno',
 'annoy',
 'answer',
 'any',
 'anything',
 'apparently',
 'appear',
 'areow',
 'around',
 'as',
 'at',
 'audience',
 'awareness',
 'awesome',
 'awkward',
 'b',
 'back',
 'background',
 'bad',
 'be',
 'because',
 'become',
 'bed',
 'beginning',
 'behind',
 'believe',
 'bet',
 'between',
 'bite',
 'blasting',
 'bloody',
 'blow',
 'boob',
 'boovie',
 'bottom',
 'boy',
 'break',
 'bull',
 'busy',
 'but',
 'buy',
 'buzzy',
 'by',
 'bye',
 'byebye',
 'c',
 'ca',
 'calm',
 'can',
 'cancer',
 'cannot',
 'car',
 'check',
 'chinese',
 'cight',
 'click',
 'clickable',
 'clicking',
 'cliff',
 'combination',
 'come',
 'comment',
 'computer',
 'connect',
 'consider',
 'control',
 'could',
 'couldn',
 'course',
 'crawl',
 'creepy',
 'cringy',
 'crips',
 'cursor',
 'cut',
 'cute',
 'cuz',
 'd',
 'damn',
 'damnable',
 'dat

In [None]:
lolip_bow = set(lematize_string(preprocess(two_lovers_first_kiss)).split())
kit_1_bow = set(lematize_string(preprocess(steve)).split())
kit_2_bow = set(lematize_string(preprocess(crow_pro)).split())

In [None]:
len(lolip_bow.union(kit_1_bow).union(kit_2_bow))

1535

In [None]:
lolip_bow = set(preprocess(lematize_string(two_lovers_first_kiss)).split())
kit_1_bow = set(preprocess(lematize_string(steve)).split())
kit_2_bow = set(preprocess(lematize_string(crow_pro)).split())

In [None]:
len(lolip_bow.union(kit_1_bow).union(kit_2_bow))

1551

# Count V

In [None]:
def vectorize(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for tok in tokens:
        if tok in counts:
            counts[tok] += 1
        else:
            counts[tok] = 1
    return counts

In [None]:
vectorize(test_string.split())

{'This': 1,
 'is': 1,
 'the': 1,
 'mostly': 1,
 'UNEDITED': 1,
 'version': 1,
 'of': 1,
 'scammers': 1,
 'wasting': 1,
 'countless': 1,
 'hours': 1,
 'demanding': 1,
 'more': 1,
 'money': 1,
 'and': 1,
 'gift': 1,
 'cards': 1}

In [None]:
vectorize(test_string_2.split())

{'This': 1,
 'is': 1,
 'the': 2,
 'mostly': 1,
 'UNEDITED': 1,
 'version': 1,
 'of': 2,
 'scammers': 1,
 'wasting': 1,
 'countless': 1,
 'hours': 1,
 'demanding': 1,
 'more': 1,
 'money': 1,
 'and': 4,
 'gift': 1,
 'cards.': 1,
 'Whenever': 1,
 'they': 3,
 'think': 1,
 'are': 1,
 'getting': 1,
 'close,': 1,
 "there's": 1,
 'a': 2,
 'problem,': 1,
 'but': 1,
 'that': 1,
 "doesn't": 1,
 'stop': 1,
 'them,': 1,
 'rage': 1,
 'builds.': 1,
 'STOP': 1,
 'online': 1,
 'scams': 1,
 'before': 1,
 'start.': 1,
 'Check': 1,
 'out': 1,
 'https://www.seraphsecure.com/kitboga': 1,
 'download': 1,
 '14-day': 1,
 'FREE': 1,
 'trial': 1,
 'Seraph': 1,
 'Secure': 1,
 'for': 1,
 'you': 1,
 'your': 1,
 'family/friends.': 1,
 '#seraphsecure': 1}

In [None]:
def similaridad_count_v(string_a,string_b):
    """Print the combined vocabulary of two strings (first step of the count-vector similarity).

    Both strings are split on whitespace and counted with ``vectorize``; the
    union of their tokens is printed. Nothing is returned.
    """
    counts_a = vectorize(string_a.split())
    counts_b = vectorize(string_b.split())

    vocabulary = set(counts_a).union(set(counts_b))
    print(vocabulary)

In [None]:
similaridad_count_v("your car","Hi mark")

{'your', 'Hi', 'mark', 'car'}


In [None]:
def similaridad_count_v(string_a,string_b):
    """Return the normalized Manhattan distance between the token counts of two strings.

    Both strings are split on whitespace and counted with ``vectorize``. The
    result is the sum of the absolute per-token count differences divided by
    the total number of tokens in both strings.

    Args:
        string_a: First document, as a string.
        string_b: Second document, as a string.

    Returns:
        A float between 0.0 (same counts) and 1.0 (no token in common). Two
        strings without any token are at distance 0.0.
    """
    vector_a = vectorize(string_a.split())
    vector_b = vectorize(string_b.split())

    total = sum(vector_a.values()) + sum(vector_b.values())
    if total == 0:
        # No tokens on either side: avoid 0 / 0 and report "no difference".
        return 0.0

    keys = set(vector_a) | set(vector_b)
    diff = sum(abs(vector_a.get(key,0) - vector_b.get(key,0)) for key in keys)

    return diff / total

In [None]:
similaridad_count_v(test_string,test_string)

0.0

In [None]:
similaridad_count_v(test_string,"")

1.0

In [None]:
similaridad_count_v(test_string,test_string*10)

0.8181818181818182

Me queda invertido porque lo que estoy calculando no es una similaridad sino una distancia (particularmente, una distancia Manhattan). Dos cosas cercanas suelen ser mas parecidas, por ende la similaridad es inversamente proporcional a la distancia.

In [None]:
def similaridad_count_v(string_a,string_b):
    """Return a similarity based on the normalized Manhattan distance of token counts.

    Both strings are split on whitespace and counted with ``vectorize``. The
    distance is the sum of the absolute per-token count differences divided by
    the total number of tokens in both strings; the similarity is one minus
    that distance.

    Args:
        string_a: First document, as a string.
        string_b: Second document, as a string.

    Returns:
        A float between 0.0 (no token in common) and 1.0 (same counts). Two
        strings without any token are considered identical (1.0).
    """
    vector_a = vectorize(string_a.split())
    vector_b = vectorize(string_b.split())

    total = sum(vector_a.values()) + sum(vector_b.values())
    if total == 0:
        # No tokens on either side: avoid 0 / 0; two empty documents are equal.
        return 1.0

    keys = set(vector_a) | set(vector_b)
    diff = sum(abs(vector_a.get(key,0) - vector_b.get(key,0)) for key in keys)

    return 1 - (diff / total)

In [None]:
similaridad_count_v(test_string,test_string)

1.0

In [None]:
similaridad_count_v(test_string,"")

0.0

In [None]:
def similaridad_con_preprocess(doc_1,doc_2):
    """Return the count-vector similarity of two documents after preprocessing and lemmatizing.

    Both documents are preprocessed first, then both are lemmatized with
    ``lematize_string``, and the results are compared with
    ``similaridad_count_v``.
    """
    clean_1, clean_2 = preprocess(doc_1), preprocess(doc_2)
    lemmas_1, lemmas_2 = lematize_string(clean_1), lematize_string(clean_2)
    return similaridad_count_v(lemmas_1, lemmas_2)

In [None]:
similaridad_con_preprocess(test_string,test_string)

1.0

In [None]:
similaridad_con_preprocess(test_string,"")

0.0

In [None]:
similaridad_con_preprocess("kit calling scammer",steve)

0.00046290938548776417

In [None]:
similaridad_con_preprocess("kit calling scammer",two_lovers_first_kiss)

0.0

El problema que puedo tener es que con distancia Manhattan la longitud de la cadena entra en la ecuacion. A medida que aumenta la diferencia de cantidad de tokens entre ambas, aumenta tambien la distancia.

Si queremos implementar un motor de busqueda, vamos a calcular similaridades contra el "query string", que suele ser algo corto, y por ende articulos cortos se priorizarian sobre articulos largos.

Para una similaridad que apunte al contenido de las cadenas, usamos similaridad coseno.

In [None]:
def similaridad_count_v(string_a,string_b):
    """First attempt at cosine similarity: feed the count dicts to nltk directly.

    The dicts built by ``vectorize`` are passed as-is to
    ``nltk.cluster.cosine_distance``; the call shown right after this cell
    fails with ``TypeError: unsupported operand type(s) for *: 'dict' and
    'dict'``. The value of ``cosine_distance`` is not returned.
    """

    vector_a = vectorize(string_a.split())
    vector_b = vectorize(string_b.split())

    # Kept as a demonstration of the failure: cosine_distance is given dicts here.
    nltk.cluster.cosine_distance(vector_a,vector_b)

In [None]:
similaridad_count_v("string_a","string_b")

TypeError: unsupported operand type(s) for *: 'dict' and 'dict'

Ya esto requiere vectores y no diccionarios (por temas de optimizacion) asi que podemos usar el vectorizer de sklearn.

Ojo, que al cambiar de diccionario a vector/lista, no se puede representar de forma dispersa (si no esta un token, implicitamente es un 0). Eso ademas plantea otro problema de que NECESITO de antemano saber TODOS los tokens posibles.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform([steve,two_lovers_first_kiss,crow_pro])

X

<3x1831 sparse matrix of type '<class 'numpy.int64'>'
	with 2664 stored elements in Compressed Sparse Row format>

In [None]:
vectorizer.get_feature_names_out()

array(['00', '000', '0030', ..., 'zeros', 'zip', 'zombocom'], dtype=object)

In [None]:
def similaridad_count_v(vector_a,vector_b):
    """Return the cosine similarity of two vectors as one minus nltk's cosine distance."""
    distance = nltk.cluster.cosine_distance(vector_a,vector_b)
    return 1 - distance

In [None]:
corpus = [ lematize_string(preprocess(doc)) for doc in [test_string,test_string_2,steve,two_lovers_first_kiss,crow_pro] ]
corpus

['this be the mostly unedited version of scammer waste countless hour demand more money and gift card',
 'this be the mostly unedited version of scammer waste countless hour demand more money and gift card whenever they think they be get close there s a problem but that doesn t stop them and the rage build stop online scam before they start check out uurrll and download a day free trial of seraph secure for you and your family friends hashtagseraphsecure',
 'music do you want your money or do you don t want your money then you need to do a i say you be not my teacher no no no no no no no applause i ve be call scammer for over three year now and although i ve have some pretty exciting and interesting learning experience and just all around crazy moment this have to be the most extreme raw anger that i have ever witness the entire call take place over a couple day hour worth of phone call and ultimately end in a pretty excite display of anger it start off like a typical refund scam where

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

X

<5x1522 sparse matrix of type '<class 'numpy.int64'>'
	with 2312 stored elements in Compressed Sparse Row format>

In [None]:
print(vectorizer.transform([test_string]))

  (0, 51)	1
  (0, 276)	1
  (0, 508)	1
  (0, 804)	1
  (0, 809)	1
  (0, 812)	1
  (0, 863)	1
  (0, 1313)	1
  (0, 1326)	1
  (0, 1388)	1
  (0, 1415)	1


In [None]:
print(vectorizer.transform([ lematize_string(preprocess(test_string)) ]))

  (0, 51)	1
  (0, 124)	1
  (0, 198)	1
  (0, 276)	1
  (0, 321)	1
  (0, 508)	1
  (0, 594)	1
  (0, 804)	1
  (0, 809)	1
  (0, 812)	1
  (0, 863)	1
  (0, 1095)	1
  (0, 1313)	1
  (0, 1326)	1
  (0, 1388)	1
  (0, 1415)	1
  (0, 1433)	1


In [None]:
vectorizer.transform([ lematize_string(preprocess(test_string)) ]).toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [None]:
vectorizer.get_feature_names_out()

array(['ability', 'able', 'abolutely', ..., 'zero', 'zip', 'zombocom'],
      dtype=object)

In [None]:
vector = vectorizer.transform([ lematize_string(preprocess(test_string)) ]).toarray()[0]
features = vectorizer.get_feature_names_out()

[ feature for value,feature in zip(vector,features) if value.any()]

['and',
 'be',
 'card',
 'countless',
 'demand',
 'gift',
 'hour',
 'money',
 'more',
 'mostly',
 'of',
 'scammer',
 'the',
 'this',
 'unedited',
 'version',
 'waste']

In [None]:
sorted(lematize_string(preprocess(test_string)).split())

['and',
 'be',
 'card',
 'countless',
 'demand',
 'gift',
 'hour',
 'money',
 'more',
 'mostly',
 'of',
 'scammer',
 'the',
 'this',
 'unedited',
 'version',
 'waste']

In [None]:
def similaridad_count_v(string_a,string_b,vectorizer):
    """Return the cosine similarity between the count vectors of two strings.

    Each string is preprocessed, lemmatized and transformed with the given
    vectorizer; the first row of the dense result is used as the vector.

    Args:
        string_a: First document, as a string.
        string_b: Second document, as a string.
        vectorizer: Object whose ``transform([text])`` result supports
            ``.toarray()`` (the notebook passes a fitted CountVectorizer).

    Returns:
        One minus ``nltk.cluster.cosine_distance`` of the two vectors.
    """
    def _to_count_vector(text):
        normalized = lematize_string(preprocess(text))
        return vectorizer.transform([ normalized ]).toarray()[0]

    counts_a = _to_count_vector(string_a)
    counts_b = _to_count_vector(string_b)

    return 1-nltk.cluster.cosine_distance(counts_a,counts_b)

In [None]:
similaridad_count_v(steve,crow_pro,vectorizer)

0.939211905425627

In [None]:
similaridad_count_v(two_lovers_first_kiss,crow_pro,vectorizer)

0.8918520743356481

In [None]:
similaridad_count_v(two_lovers_first_kiss,steve,vectorizer)

0.8479207339899647

In [None]:
similaridad_count_v("kit calling two scammers",steve,vectorizer)

0.02954520573766417

In [None]:
similaridad_count_v("kit calling two scammers",two_lovers_first_kiss,vectorizer)

0.0

In [None]:
similaridad_count_v("kit calling two scammers",crow_pro,vectorizer)

0.036316745190080435

Aca se puede ver como internamente se representa como una estructura especial de matriz dispersa. Si bien eso mejora el uso de memoria, no nos salva de tener que saber todos los tokens de antemano. Que el token 25 sea "perro" en todos los vectores (dispersos o no), solo se puede conseguir sabiendo que "perro" aparece en algun documento.

In [None]:
matrix = (vectorizer.transform([ lematize_string(preprocess(test_string_2)) ]))
print(matrix)

  (0, 51)	4
  (0, 124)	2
  (0, 129)	1
  (0, 171)	1
  (0, 177)	1
  (0, 198)	1
  (0, 214)	1
  (0, 233)	1
  (0, 276)	1
  (0, 313)	1
  (0, 321)	1
  (0, 357)	1
  (0, 365)	1
  (0, 436)	1
  (0, 468)	1
  (0, 483)	1
  (0, 486)	1
  (0, 505)	1
  (0, 508)	1
  (0, 548)	1
  (0, 594)	1
  (0, 804)	1
  (0, 809)	1
  (0, 812)	1
  (0, 863)	2
  (0, 877)	1
  (0, 891)	1
  (0, 976)	1
  (0, 1010)	1
  (0, 1093)	1
  (0, 1095)	1
  (0, 1115)	1
  (0, 1125)	1
  (0, 1223)	1
  (0, 1234)	2
  (0, 1312)	1
  (0, 1313)	2
  (0, 1317)	1
  (0, 1320)	1
  (0, 1322)	3
  (0, 1325)	1
  (0, 1326)	1
  (0, 1360)	1
  (0, 1388)	1
  (0, 1407)	1
  (0, 1415)	1
  (0, 1433)	1
  (0, 1453)	1
  (0, 1513)	1
  (0, 1515)	1


In [None]:
matrix.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
sum(matrix.toarray()[0])

59

Podemos normalizar los vectores para ver la injerencia de un token en todo el documento. Se espera que mientras mas veces aparezca un token en el mismo documento, mas relevante va a ser para este. Piensenlo, tiene logica.

In [None]:
normalized_array = matrix.toarray()[0]

normalized_array = normalized_array / sum(normalized_array)

normalized_array[51]

0.06779661016949153

In [None]:
normalized_array[129]

0.01694915254237288

Eso a nivel documento. Ahora, si pensamos a nivel corpus (coleccion de documentos) y dentro del contexto de recuperacion de datos, es tambien relevante en que documentos aparece cada token.

Si estoy buscando informacion sobre Hawaii, esperaria que el documento diga Hawaii en algun lado. El problema es que si todos los documentos dicen Hawaii, la verdad que no me sirve de mucho.

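Entonces, lo que nos va a interesar es la inversa de esa metrica, la cantidad de documentos en los que aparece el token (df), suavizada con un logaritmo: idf = log(N / df), donde N es la cantidad de documentos del corpus. Nos sirve tanto para ponderar el token como para saber si tengo que mantenerlo o no.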

In [None]:
# Build the gensim dictionary in this cell so the notebook runs top-to-bottom:
# the cell that creates `dictionary` only appears further down.
from gensim import corpora

dictionary = corpora.Dictionary([doc.split() for doc in corpus])

# dfs maps token id -> number of documents of the corpus that contain the token.
dfs = dictionary.dfs
dfs

{13: 5,
 1: 5,
 12: 5,
 9: 2,
 14: 2,
 15: 2,
 10: 5,
 11: 4,
 16: 4,
 3: 2,
 6: 5,
 4: 3,
 8: 5,
 7: 4,
 0: 5,
 5: 4,
 2: 4,
 50: 3,
 46: 4,
 47: 4,
 30: 4,
 22: 3,
 45: 4,
 36: 4,
 17: 4,
 34: 4,
 20: 4,
 43: 4,
 24: 3,
 42: 4,
 41: 4,
 44: 4,
 35: 2,
 19: 2,
 32: 1,
 37: 3,
 18: 3,
 40: 4,
 21: 4,
 33: 4,
 49: 3,
 25: 1,
 23: 4,
 28: 3,
 48: 2,
 39: 1,
 38: 1,
 27: 4,
 51: 4,
 52: 4,
 26: 3,
 29: 2,
 31: 1,
 471: 3,
 222: 3,
 761: 3,
 505: 3,
 225: 3,
 701: 3,
 476: 3,
 718: 3,
 357: 3,
 612: 3,
 488: 3,
 472: 3,
 688: 1,
 485: 3,
 90: 1,
 754: 3,
 141: 2,
 511: 3,
 710: 3,
 815: 2,
 489: 3,
 77: 1,
 322: 3,
 647: 3,
 558: 3,
 257: 1,
 366: 1,
 398: 1,
 261: 2,
 381: 3,
 73: 3,
 93: 3,
 193: 2,
 459: 2,
 461: 2,
 263: 1,
 580: 1,
 82: 1,
 251: 2,
 795: 1,
 248: 2,
 684: 3,
 543: 3,
 190: 2,
 803: 2,
 535: 2,
 741: 1,
 245: 3,
 362: 3,
 256: 2,
 220: 1,
 374: 3,
 491: 3,
 406: 3,
 738: 1,
 593: 1,
 774: 3,
 551: 1,
 176: 2,
 448: 1,
 493: 1,
 299: 2,
 102: 3,
 649: 3,
 569: 1,
 414: 

In [None]:
from gensim import corpora

dictionary = corpora.Dictionary([doc.split() for doc in corpus])

In [None]:
import math

# Inverse document frequency per token id: log(N / df), with N the number of
# documents the dictionary was built from (5 for this corpus).
# A new dict is built from dictionary.dfs instead of overwriting entries through
# `dfs`, which aliases dictionary.dfs: writing in place would replace the
# dictionary's document counts and make a re-run fail on log(5 / 0.0).
n_docs = dictionary.num_docs
dfs = {token_id: math.log(n_docs / df) for token_id, df in dictionary.dfs.items()}

In [None]:
math.log( 5 / 1 )

1.6094379124341003

In [None]:
math.log( 500 / 1 )

6.214608098422191

In [None]:
math.log( 500000 / 1 )

13.122363377404328

In [None]:
math.log( 50000000 / 1 )

17.72753356339242

Un token que aparece en muy pocos documentos tiene mucha mas importancia, por eso el idf me da mas alto (y crece a medida que crece el corpus). En este caso, saber que esta ese token me ayuda a identificar el documento inequivocamente entre 5.

In [None]:
math.log( 5 / 5 )

0.0

Inversamente, si aparece siempre, no me aporta nada. Y de hecho el logaritmo este me da 0. Esto es literalmente una stopword. No es unicamente una lista de palabras predefinidas, sino palabras que no aportan. Si estoy trabajando en un corpus de manuales de impresoras, la palabra "impresora" probablemente aparezca en todos y sea una stopword por mas que no este en una lista en ningun lado.

Think Mark, think.