In [34]:
import fitz
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from langdetect import detect
import langid
nltk.download('cess_esp')
from nltk.corpus import cess_esp
from nltk.tokenize import word_tokenize
import os

spanish_words = set(word for word in cess_esp.words())

[nltk_data] Downloading package cess_esp to
[nltk_data]     C:\Users\perez\AppData\Roaming\nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!


In [19]:
def get_text_from_pdf(name):
    doc = fitz.open(f'data/{name}.pdf')
    full_text=""

    for page in doc.pages():
        this_page_text=page.get_text("text", sort=True).replace("\n", " ")
        this_page_text=re.sub(r'\d+', '', this_page_text)
        full_text+=this_page_text+" "
    
    return full_text

def get_all_text_from_pdf(name):
    doc = fitz.open(f'data/{name}.pdf')
    full_text=" ".join([page.get_text("text", sort=True).replace("\n", " ") for page in doc])
    full_text=re.sub(r'\d+', '', full_text)
    return full_text

In [20]:
def spanish_filter(full_text):
    sentences = sent_tokenize(full_text)
    non_spanish_sentences = []

    for sentence in sentences:
        try:
            languages=[]
            languages.append(detect(sentence))
            languages.append(langid.classify(sentence)[0])
            if not any(lang in languages for lang in ["es", "it", "ca", "pt"]):
                non_spanish_sentences.append(sentence)
        except:
            pass

    sentences = non_spanish_sentences
    

    return sentences

def spanish_sentences_filter(sentence):
    try:
        languages=[]
        languages.append(detect(sentence))
        languages.append(langid.classify(sentence)[0])
        if not any(lang in languages for lang in ["es", "it", "ca", "pt"]):
            return True
    except:
        pass

    return False

In [21]:
quechua_alphabet = ['a', 'aa', 'ch', 'chh', 'ch\'', 'ts', 'tr', 'h', 'i', 'ii', 'k', 'kh', 'k\'', 'l', 'll', 'm', 'n', 'ñ', 'p', 'ph', 'p\'', 'q', 'qh', 'q\'', 'r', 's', 'sh', 't', 'th', 't\'', 'u', 'uu', 'w', 'y']

def grafemas_no_en_alfabet(words):
    for word in words:
        for i, letter in enumerate(word):
            #Continue  if letter is not a letter
            if not letter.isalpha():
                continue
            if letter.lower() not in quechua_alphabet:
                #Chequear siguiente letra
                if i+1 >= len(word):
                    return False
                letter = letter + word[i+1]
                if letter.lower() not in quechua_alphabet:
                    if i+2 >= len(word):
                        return False
                    #Chequear siguiente letra
                    letter = letter + word[i+2]
                    if letter.lower() not in quechua_alphabet:
                        return False
    return True

#No funciona bien esta función, porque el detector de lenguaje no es muy bueno para palabras 
def oraciones_mucho_espaniol(words):
    spanish_words = 0
    for word in words:
        try:
            if detect(word) == 'es':
                spanish_words += 1
        except:
            pass
    return spanish_words/len(words) > 0.5

def oracion_mucho_espaniol_v2(words):
    palabras_encontradas = 0
    #Usar un diccionario de palabras en español
    for word in words:
        if word.lower() in spanish_words:
            palabras_encontradas += 1

    return palabras_encontradas/len(words) < 0.25


def oraciones_muy_cortas(words, min_length=3):
    return len(words) > min_length

def oraciones_muy_repititivas(words, threshold=0.4):
    unique_words = set(words)
    ratio = len(unique_words) / len(words)
    return ratio >= threshold

def palabras_muy_largas(words, threshold=40):
    for word in words:
        if len(word) > threshold:
            return False
    return True

def split_tokens(sentence):
    if re.search(r"(\b\w{1,2}\b\s){3,}", sentence):
        return False
    return True

def oraciones_con_matematica(sentence):
    if re.search(r"[\d+\-*/]+", sentence):
        return False
    return True

In [22]:
def rule_based_heuristic(sentence):
    words = word_tokenize(sentence)
    aux_solo_palabras = [word for word in words if word.isalpha()]
    
    valid = oraciones_muy_cortas(aux_solo_palabras) 
    valid = valid and oraciones_muy_repititivas(aux_solo_palabras)
    valid = valid and palabras_muy_largas(words)
    valid = valid and split_tokens(sentence)
    valid = valid and oraciones_con_matematica(sentence)
    valid = valid and oracion_mucho_espaniol_v2(aux_solo_palabras)

    return valid

In [30]:
def pipeline(documents):
    corpus=pd.DataFrame(columns=["document", "sentence"])
    for document in documents:
        text=get_text_from_pdf(document)
        sentences=sent_tokenize(text)
        #Oraciones antes de filtrar
        print("Oraciones antes de filtrar en documento ", document, ": ", len(sentences))
        filtered_sentences=[]
        for sentence in sentences:
            if rule_based_heuristic(sentence) and spanish_sentences_filter(sentence):
                filtered_sentences.append(sentence)
        
        df=pd.DataFrame(filtered_sentences, columns=["sentence"])
        df["document"]=document
        #Quitar duplicados
        df.drop_duplicates(subset="sentence", inplace=True)
        print("Oraciones después de filtrar en documento ", document, ": ", len(df))
        corpus=pd.concat([corpus, df], ignore_index=True)
    print("Total de oraciones: ", len(corpus))
    corpus.drop_duplicates(subset="sentence", inplace=True)
    print("Total de oraciones únicas: ", len(corpus))
    return corpus

In [36]:
path = "data/"

documents = []

# Get all the PDF documents NAMES from the path
for file in os.listdir(path):
    if file.endswith(".pdf"):
        documents.append(file[:-4])

print(documents)

df_corpus=pipeline(documents)
df_corpus.head()

['4 Rimana - Qillqasqa maytu - Qichwa, Chanka. Texto de Comunicación del 4° de Secundaria - Quechua Chanka', '5 Rimana - Qillqasqa maytu - Qichwa, Chanka. Texto de Comunicación del 5° Secundaria - Quechua Chanka', 'Anqarakunapa kawsakuyninmanta. Literatura 2 - 4° Primaria - Quechua Chanka', 'Antuku hampatuwan. Historias y relatos 2- Inicial - Quechua Chanka', 'Ayllunchikpa Willakuyninkuna. Historias y relatos 1 - Inicial - Quechua Chanka', 'Añañaw. Willakuy 4 - Quechua Chanka', 'cuentos_3-3-1 - Quechua Collao', 'Hanaq pachaman qispiq atuqman. Willakuy 9 - Quechua Chanka', 'Kicharisqa ñawiywanmi ñuqa uyarini. Willakuy 2 - Quechua Chanka', 'Llaqtanchikpa kawsayninkuna - Chanka Saberes de los pueblos 1 - 1° Secundaria - Quechua chanka', 'Llaqtaypa Kawsayninkuna. Saberes de los pueblos - 4° Primaria - Quechua Chanka', 'Maychi hampimanta. Historias y relatos 3- Inicial - Quechua Chanka', 'Muki. Willakuy 11 - Quechua Chanka', 'Pawqar llikllachay- Willakuy 3 - Quechua Chanka', 'Poemas en Quec

Unnamed: 0,document,sentence
0,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Sapa taqam ñawpa llaqtapa huk llamkananmanta r...
1,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Hinaspapas taqapi ruraykunaqa llaqta llamkayni...
2,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Kaymanhinam sasachakuy imayna chinkachiypi ya...
3,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...","Kayna: huk taqam papa tarpuymanta riman, papa..."
4,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...","Chaymanhinam qamta munachisunki, imaynataq pap..."


In [37]:
df_corpus.loc[df_corpus["sentence"].apply(lambda x: len(x))<40]

Unnamed: 0,document,sentence
19,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Sara musqukuyqa hatun kusikuymi.
39,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Nispas kumun masinkunata rimapayasqa.
49,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Mitu Huk rikchaq qillqasqa.
51,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",Runakuna yachankuchu mayman risqanta.
74,"4 Rimana - Qillqasqa maytu - Qichwa, Chanka. T...",¿Pitaq takiq pisqu kanman karqan?
...,...,...
4412,Ñawinchanapaq munay qillqasqakuna 2019 - Quech...,Manas runapas uywapas kanchu.
4437,Ñawinchanapaq munay qillqasqakuna 2019 - Quech...,Kunanqa wachu wachutaña qillqanki.
4440,Ñawinchanapaq munay qillqasqakuna 2019 - Quech...,Kay qillqasqaykiqa chuyachañam kanan.
4442,Ñawinchanapaq munay qillqasqakuna 2019 - Quech...,"Kunanñataq, huk rapiman qillqarquy."


In [38]:
df_corpus.to_csv("data/corpus/avancePaolo.csv", index=False)

In [40]:
# documents name formatted:
for document in documents:
    print(f'- {document}')

- 4 Rimana - Qillqasqa maytu - Qichwa, Chanka. Texto de Comunicación del 4° de Secundaria - Quechua Chanka
- 5 Rimana - Qillqasqa maytu - Qichwa, Chanka. Texto de Comunicación del 5° Secundaria - Quechua Chanka
- Anqarakunapa kawsakuyninmanta. Literatura 2 - 4° Primaria - Quechua Chanka
- Antuku hampatuwan. Historias y relatos 2- Inicial - Quechua Chanka
- Ayllunchikpa Willakuyninkuna. Historias y relatos 1 - Inicial - Quechua Chanka
- Añañaw. Willakuy 4 - Quechua Chanka
- cuentos_3-3-1 - Quechua Collao
- Hanaq pachaman qispiq atuqman. Willakuy 9 - Quechua Chanka
- Kicharisqa ñawiywanmi ñuqa uyarini. Willakuy 2 - Quechua Chanka
- Llaqtanchikpa kawsayninkuna - Chanka Saberes de los pueblos 1 - 1° Secundaria - Quechua chanka
- Llaqtaypa Kawsayninkuna. Saberes de los pueblos - 4° Primaria - Quechua Chanka
- Maychi hampimanta. Historias y relatos 3- Inicial - Quechua Chanka
- Muki. Willakuy 11 - Quechua Chanka
- Pawqar llikllachay- Willakuy 3 - Quechua Chanka
- Poemas en Quechua de Cusco -