In [146]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
from sklearn.metrics import pairwise
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [147]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Recibe 3 parrafos con  3 oraciones

In [148]:
# parrafo_1 = "Esta es una prueba. De los famosos n-gramas. Se esta usando NLTK."
# parrafo_2 = "Esta es otra prueba. Esto es para el ejercicio. Usando n-gramas en clase."

In [149]:
parrafo_1 = "Artificial intelligence (AI) mirrors human intelligence by enabling machines to execute tasks traditionally handled by humans, leveraging technologies like machine learning and neural networks. While AI systems can adapt and improve through learning, concerns persist regarding ethical implications such as privacy infringement and biases in decision-making. As AI continues to evolve, responsible development practices become imperative to navigate its societal impact."
parrafo_2 = "Artificial intelligence (AI) mirrors people intelligence by functioning machines to execute tasks traditionally handled by people, leveraging technologies like machine learning and neural networks. While AI systems can transform and improve through learning, issues persist regarding ethical implications such as privacy infringement and biases in decision-making. As AI continues to grow, responsible development practices become necessary to navigate its societal impact."

### Utiliza steaming para pre-procesar el texto

In [150]:
def pre_procesamiento(parrafo):
    parrafo = re.sub(r'[^\w\s]', '', parrafo)
    tokenized_text = nltk.word_tokenize(parrafo)
    stemmer = SnowballStemmer("english")
    stemmed_text = [stemmer.stem(word) for word in tokenized_text]
    # stemmed_text = ' '.join([stemmer.stem(word) for word in tokenized_text])
    return stemmed_text

In [151]:
parrafos = [parrafo_1, parrafo_2]
pre_procesados = [pre_procesamiento(parrafo) for parrafo in parrafos]
print(pre_procesados)

[['artifici', 'intellig', 'ai', 'mirror', 'human', 'intellig', 'by', 'enabl', 'machin', 'to', 'execut', 'task', 'tradit', 'handl', 'by', 'human', 'leverag', 'technolog', 'like', 'machin', 'learn', 'and', 'neural', 'network', 'while', 'ai', 'system', 'can', 'adapt', 'and', 'improv', 'through', 'learn', 'concern', 'persist', 'regard', 'ethic', 'implic', 'such', 'as', 'privaci', 'infring', 'and', 'bias', 'in', 'decisionmak', 'as', 'ai', 'continu', 'to', 'evolv', 'respons', 'develop', 'practic', 'becom', 'imper', 'to', 'navig', 'it', 'societ', 'impact'], ['artifici', 'intellig', 'ai', 'mirror', 'peopl', 'intellig', 'by', 'function', 'machin', 'to', 'execut', 'task', 'tradit', 'handl', 'by', 'peopl', 'leverag', 'technolog', 'like', 'machin', 'learn', 'and', 'neural', 'network', 'while', 'ai', 'system', 'can', 'transform', 'and', 'improv', 'through', 'learn', 'issu', 'persist', 'regard', 'ethic', 'implic', 'such', 'as', 'privaci', 'infring', 'and', 'bias', 'in', 'decisionmak', 'as', 'ai', 'c

### Calcula la distancia entre parrafos usando Unigramas, trigramas y bigramas

In [152]:
def connvertidor_ngrams(parrafo, n):
    return list(nltk.ngrams(parrafo, n))

In [153]:
unigramas = [connvertidor_ngrams(parrafo, 1) for parrafo in pre_procesados]
bigramas = [connvertidor_ngrams(parrafo, 2) for parrafo in pre_procesados]
trigramas = [connvertidor_ngrams(parrafo, 3) for parrafo in pre_procesados]

print("UNIGRAMAS: ", unigramas)
print("\n")
print("BIGRAMAS: ", bigramas)
print("\n")
print("TRIGRAMAS: ", trigramas)

UNIGRAMAS:  [[('artifici',), ('intellig',), ('ai',), ('mirror',), ('human',), ('intellig',), ('by',), ('enabl',), ('machin',), ('to',), ('execut',), ('task',), ('tradit',), ('handl',), ('by',), ('human',), ('leverag',), ('technolog',), ('like',), ('machin',), ('learn',), ('and',), ('neural',), ('network',), ('while',), ('ai',), ('system',), ('can',), ('adapt',), ('and',), ('improv',), ('through',), ('learn',), ('concern',), ('persist',), ('regard',), ('ethic',), ('implic',), ('such',), ('as',), ('privaci',), ('infring',), ('and',), ('bias',), ('in',), ('decisionmak',), ('as',), ('ai',), ('continu',), ('to',), ('evolv',), ('respons',), ('develop',), ('practic',), ('becom',), ('imper',), ('to',), ('navig',), ('it',), ('societ',), ('impact',)], [('artifici',), ('intellig',), ('ai',), ('mirror',), ('peopl',), ('intellig',), ('by',), ('function',), ('machin',), ('to',), ('execut',), ('task',), ('tradit',), ('handl',), ('by',), ('peopl',), ('leverag',), ('technolog',), ('like',), ('machin',)

In [154]:
# Code Made by Gala Flores
# Modified by: Sergio Gonzalez
def generador_de_matriz(gramas_1 ,gramas_2):
    palabras_conjuntas = set(gramas_1 + gramas_2)
    parrafos = [gramas_1, gramas_2]
    matriz = []
    
    for parrafo in parrafos:
        vector = []
        for palabra in palabras_conjuntas:
            vector.append(1 if palabra in parrafo else 0)
        matriz.append(vector)
    return matriz

In [155]:
matriz_unigramas = generador_de_matriz(unigramas[0], unigramas[1])
matriz_bigramas = generador_de_matriz(bigramas[0], bigramas[1])
matriz_trigramas = generador_de_matriz(trigramas[0], trigramas[1])

In [156]:
matriz_unigramas

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1]]

In [157]:
matriz_bigramas

[[0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0],
 [1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]]

In [158]:
matriz_trigramas

[[0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1]]

### Distancias entre parrafos con cosine similarity

In [159]:
similitud_1 = pairwise.cosine_similarity(matriz_unigramas)
similitud_2 = pairwise.cosine_similarity(matriz_bigramas)
similitud_3 = pairwise.cosine_similarity(matriz_trigramas)

In [160]:
print("similitud unigramas: ", similitud_1[0][1])

similitud unigramas:  0.8775510204081639


In [161]:
print("similitud bigramas: ", similitud_2[0][1])

similitud bigramas:  0.7666666666666674


In [162]:
print("similitud trigramas: ", similitud_3[0][1])

similitud trigramas:  0.6440677966101693


### Analisis de las distancias y justificacion de cada n-grama con ventajas y desventajas