### Install NLTK data

In [6]:
import nltk

# Download the 'punkt' package (tokenizer data used by nltk.word_tokenize below).
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Tokenize

In [35]:
texto = "En UTEC somos cheveres, y muy trabajadores."
# word_tokenize defaults to the English Punkt model; the text is Spanish,
# so request the Spanish model explicitly.
texto_token = nltk.word_tokenize(texto, language='spanish')

### Stop words

In [36]:
# Load the stop-word list: one entry per line, lower-cased and stripped.
with open("stop.txt", encoding='UTF-8') as stop_file:
    stoplist = [entry.lower().strip() for entry in stop_file]

# Punctuation tokens are filtered out together with the stop words.
stoplist.extend([",", ".", "¿", "?"])
print(stoplist)

['algún', 'alguna', 'algunas', 'alguno', 'algunos', 'ambos', 'ampleamos', 'ante', 'antes', 'aquel', 'aquellas', 'aquellos', 'aqui', 'arriba', 'atras', 'bajo', 'bastante', 'bien', 'cada', 'cierta', 'ciertas', 'cierto', 'ciertos', 'como', 'con', 'conseguimos', 'conseguir', 'consigo', 'consigue', 'consiguen', 'consigues', 'cual', 'cuando', 'dentro', 'desde', 'donde', 'dos', 'el', 'ellas', 'ellos', 'empleais', 'emplean', 'emplear', 'empleas', 'empleo', 'en', 'encima', 'entonces', 'entre', 'era', 'eramos', 'eran', 'eras', 'eres', 'es', 'esta', 'estaba', 'estado', 'estais', 'estamos', 'estan', 'estoy', 'fin', 'fue', 'fueron', 'fui', 'fuimos', 'gueno', 'ha', 'hace', 'haceis', 'hacemos', 'hacen', 'hacer', 'haces', 'hago', 'incluso', 'intenta', 'intentais', 'intentamos', 'intentan', 'intentar', 'intentas', 'intento', 'ir', 'la', 'largo', 'las', 'lo', 'los', 'mientras', 'mio', 'modo', 'muchos', 'muy', 'nos', 'nosotros', 'otro', 'para', 'pero', 'podeis', 'podemos', 'poder', 'podria', 'podriais', 

### Remove stop words

In [39]:
# Keep only the tokens whose lower-cased form is not a stop word.
# A single pass with a set lookup replaces the repeated list.remove() calls,
# which rescanned the copied list once for every token that was dropped.
stop_set = set(stoplist)
texto_tokens_c = [token for token in texto_token if token.lower() not in stop_set]
print(texto_token)     # tokens before filtering
print(texto_tokens_c)  # tokens after filtering
# NOTE: this rebinds texto_token to the filtered list, so re-running the cell
# prints the already-filtered tokens on both lines.
texto_token = texto_tokens_c

['UTEC', 'cheveres', 'trabajadores']
['UTEC', 'cheveres', 'trabajadores']


### Word reduction (stemming)

In [None]:
from nltk.stem import PorterStemmer

# PorterStemmer implements the English Porter algorithm and has no language
# option; use SnowballStemmer for other languages.
# It is bound to its own name so this demo does not overwrite the global
# `stemmer` (the Spanish stemmer) that procesamiento() and L() rely on.
porter_stemmer = PorterStemmer()
words = ["program", "programs", "programmer", "programmer", "programmer", "programación"]
for w in words:
    print(w, '->', porter_stemmer.stem(w))

In [40]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('spanish')
# Replace every token by its stem; slice assignment keeps the same list object,
# so the list is still updated in place.
texto_token[:] = [stemmer.stem(tok) for tok in texto_token]
print(texto_token)

['utec', 'chever', 'trabaj']


### Inverted index

In [45]:
import nltk
# Download the 'punkt' package (tokenizer data used by nltk.word_tokenize).
nltk.download('punkt')

from nltk.stem.snowball import SnowballStemmer
# Module-level Spanish stemmer shared by procesamiento() and L().
stemmer = SnowballStemmer('spanish')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
def procesamiento(texto):
    """Tokenize a Spanish text, drop stop words and stem the remaining tokens.

    The stop-word list is read from "stop.txt" (one entry per line) on every
    call, and the punctuation tokens ",", ".", "¿" and "?" are added to it.
    Stemming uses the module-level `stemmer`.

    Args:
        texto: Raw text to process.

    Returns:
        A list with the stem of every token whose lower-cased form is not a
        stop word, in order of appearance (duplicates are kept).

    Raises:
        OSError: If "stop.txt" cannot be opened.
    """
    # word_tokenize defaults to the English Punkt model; the corpus is Spanish.
    texto_token = nltk.word_tokenize(texto, language='spanish')

    # A set gives constant-time membership tests while filtering.
    with open("stop.txt", encoding='UTF-8') as file:
        stoplist = {line.lower().strip() for line in file}
    stoplist.update([",", ".", "¿", "?"])

    # Single pass: filter and stem together instead of calling list.remove()
    # for each stop word, which is quadratic on book-length inputs.
    return [
        stemmer.stem(token)
        for token in texto_token
        if token.lower() not in stoplist
    ]

# Build the inverted index: stem -> sorted list of the files that contain it.
indice = {}
textos = ["libro1.txt", "libro2.txt", "libro3.txt", "libro4.txt", "libro5.txt", "libro6.txt"]
for file_name in textos:
    # The context manager closes each document as soon as it has been read.
    with open("docs/" + file_name, encoding='UTF-8') as file:
        texto = file.read().rstrip()
    texto_filtrado = procesamiento(texto)

    # Collect file names in a set so repeated terms do not build long
    # duplicate-laden lists that must be de-duplicated afterwards.
    for w in texto_filtrado:
        indice.setdefault(w, set()).add(file_name)

# Posting lists must be sorted: the AND/OR merge routines rely on that order.
indice = {term: sorted(docs) for term, docs in indice.items()}

import json
# ensure_ascii=False keeps accented characters readable (e.g. "ñ") instead of
# printing them as \uXXXX escape sequences.
print(json.dumps(indice, indent=4, ensure_ascii=False))

{
    "obra": [
        "libro1.txt"
    ],
    "comienz": [
        "libro1.txt",
        "libro4.txt",
        "libro5.txt",
        "libro6.txt"
    ],
    "notici": [
        "libro1.txt",
        "libro3.txt"
    ],
    "celebr": [
        "libro1.txt",
        "libro2.txt"
    ],
    "111\u00ba": [
        "libro1.txt"
    ],
    "cumplea\u00f1": [
        "libro1.txt"
    ],
    "bilb": [
        "libro1.txt",
        "libro6.txt"
    ],
    "bolson": [
        "libro1.txt"
    ],
    "comarc": [
        "libro1.txt",
        "libro6.txt"
    ],
    "fiest": [
        "libro1.txt"
    ],
    "motiv": [
        "libro1.txt",
        "libro4.txt"
    ],
    "principal": [
        "libro1.txt"
    ],
    "part": [
        "libro1.txt",
        "libro2.txt",
        "libro3.txt",
        "libro4.txt",
        "libro5.txt",
        "libro6.txt"
    ],
    "viaj": [
        "libro1.txt",
        "libro2.txt",
        "libro4.txt",
        "libro5.txt",
        "libro6.txt"
    ],
    

### Query methods

In [64]:
def L(word):
    """Look up the posting list of `word` in the inverted index.

    The word is stemmed with the module-level `stemmer` before the lookup.

    Args:
        word: Query term.

    Returns:
        The list stored in `indice` for the stemmed term, or an empty list
        when the stem is not in the index.
    """
    return indice.get(stemmer.stem(word), [])

def AND(L1, L2):
    """Intersect two posting lists with a two-pointer merge.

    Both lists are expected to be sorted in ascending order.

    Args:
        L1: First sorted list.
        L2: Second sorted list.

    Returns:
        A new list with the elements found in both inputs, in order.
    """
    common = []
    i, j = 0, 0
    size1, size2 = len(L1), len(L2)

    while i < size1 and j < size2:
        left, right = L1[i], L2[j]
        if left == right:
            # Present in both lists: keep it and advance both cursors.
            common.append(left)
            i += 1
            j += 1
        elif left < right:
            i += 1
        else:
            j += 1

    return common

def OR(L1, L2):
    """Merge two posting lists into their union with a two-pointer merge.

    Both lists are expected to be sorted in ascending order.

    Args:
        L1: First sorted list.
        L2: Second sorted list.

    Returns:
        A new list with the elements of both inputs in order; an element
        found at both cursors at the same time is emitted once.
    """
    merged = []
    i = j = 0

    while i < len(L1) and j < len(L2):
        left, right = L1[i], L2[j]
        if left == right:
            merged.append(left)
            i += 1
            j += 1
        elif left < right:
            merged.append(left)
            i += 1
        else:
            merged.append(right)
            j += 1

    # At most one of the inputs still has elements left; append that tail.
    merged.extend(L1[i:])
    merged.extend(L2[j:])

    return merged

def AND_NOT(L1, L2):
    """Return the elements of L1 that do not appear in L2.

    Both lists are expected to be sorted in ascending order, like the
    posting lists consumed by AND and OR.

    Args:
        L1: Sorted list to filter.
        L2: Sorted list of elements to exclude.

    Returns:
        A new list with the elements of L1 that are absent from L2, in order.
    """
    result = []
    p1 = p2 = 0

    while p1 < len(L1) and p2 < len(L2):
        if L1[p1] == L2[p2]:
            # Excluded element: skip it. Only p1 advances so that repeated
            # values in L1 are all dropped.
            p1 += 1
        elif L1[p1] < L2[p2]:
            # L2 has already moved past this value, so it is not excluded.
            result.append(L1[p1])
            p1 += 1
        else:
            p2 += 1

    # Whatever remains in L1 is greater than every element of L2.
    result.extend(L1[p1:])
    return result

In [71]:
# Example of a nested boolean query:
#   result = AND_NOT(AND(L("Calpurnia"), L("Brutus")), L("Cesar"))
result = OR(L("gandalf"), L("sam"))
print("Los libros recuperados son:", result, sep="\n")

Los libros recuperados son:
['libro1.txt', 'libro2.txt', 'libro3.txt', 'libro4.txt', 'libro5.txt', 'libro6.txt']
