# Intento con PyMuPDF (fitz; originalmente Pdfminer.six)

In [37]:
from pdfminer.high_level import extract_text
from pypdf import PdfReader, PdfWriter
import fitz
import pandas as pd

# Maps each PDF base name (the file is read from data/<name>.pdf) to a pair
# (first_page, trailing_pages): extraction starts at page index `first_page`
# and stops `trailing_pages` pages before the end of the document.
documents={
    "Kasarakuy raymimanta": (6, 7),
    "Alelipa munaqusqan waqaychanankuna. Historias y Relatos 5 - Inicial - Quechua Collao": (4, 3),
    "Ayllunchikpa willakuyninkuna. Historias y relatos 2 - Inicial - Quechua Collao": (4, 3),
    "Liqichumanta. Historias y relatos 1 - Inicial - Quechua Collao": (4, 3),
    "Muhu papa rikch’arichiymanta. Historias y relatos 3 - Inicial - Quechua Collao":  (4, 3),
    "Papa allay. Historias y Relatos 4 - Inicial - Quechua Collao":  (4, 3),
    "1 Rimana. Qichwa - Qullawpi llamk'ana mayt'u. Cuaderno de trabajo - Comunicación 1° - Quechua Collao": (6, 65),
    # Currently disabled (reason not recorded in the notebook):
    # "2 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Texto de Comunicación del 2° Secundaria - Quechua Collao": (6, 3),
    "3 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Texto de Comunicación del 3° Secundaria - Quechua Collao": (6, 3),
    "4 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Texto de Comunicación del 4° Secundaria - Quechua Collao": (6, 3),
    "5 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Texto de Comunicación del 5° Secundaria - Quechua Collao": (6, 3),
}

In [2]:
import re


def get_text_from_pdf(key):
    """Extract the text of ``data/{key}.pdf`` as a single string.

    The page range is taken from the module-level ``documents`` mapping:
    reading starts at page index ``documents[key][0]`` and stops
    ``documents[key][1]`` pages before the end of the file.

    Args:
        key: PDF base name (without ``.pdf``); must also be a key of
            ``documents``.

    Returns:
        The text of the selected pages with newlines replaced by spaces and
        every run of digits removed; each page's text is followed by one space.
    """
    page_texts = []
    # The context manager closes the file handle even if extraction fails;
    # the previous version left the document open.
    with fitz.open(f'data/{key}.pdf') as doc:
        first_page = documents[key][0]
        stop_page = doc.page_count - documents[key][1]
        for page in doc.pages(first_page, stop_page):
            page_text = page.get_text("text", sort=True).replace("\n", " ")
            # Removes every digit run, so numbers in the body text are
            # dropped together with page numbers.
            page_texts.append(re.sub(r'\d+', '', page_text))
    return "".join(page_text + " " for page_text in page_texts)

In [26]:
from nltk.tokenize import sent_tokenize
from langdetect import detect
import langid

def filtrar_espaniol(full_text):
    """Split ``full_text`` into sentences and keep the ones not flagged as Spanish-like.

    Sentence splitting uses ``sent_tokenize``; the per-sentence decision is
    delegated to ``filtrar_oracion_espaniol`` so both entry points share one
    rule instead of two copies of the same try/except block.

    Args:
        full_text: Raw text to split and filter.

    Returns:
        List of sentences for which ``filtrar_oracion_espaniol`` returns True,
        in their original order.
    """
    return [
        sentence
        for sentence in sent_tokenize(full_text)
        if filtrar_oracion_espaniol(sentence)
    ]

def filtrar_oracion_espaniol(sentence):
    """Return True when ``sentence`` should be kept, i.e. it is not flagged as Spanish-like.

    Both ``langdetect.detect`` and ``langid.classify`` are consulted; the
    sentence is rejected if either one answers ``es``, ``it``, ``ca`` or ``pt``.

    Args:
        sentence: Sentence to classify.

    Returns:
        True if neither detector returned one of the rejected language codes.
        False otherwise, including when a detector raises (best-effort: a
        sentence that cannot be classified is dropped).
    """
    rejected_languages = ("es", "it", "ca", "pt")
    try:
        detected = {detect(sentence), langid.classify(sentence)[0]}
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate, while detector failures keep dropping the sentence.
        return False
    return detected.isdisjoint(rejected_languages)

In [4]:
#Funciones de filtrado
# Graphemes accepted by grafemas_no_en_alfabet (one to three characters each).
alfabeto_quechua = [
    'a', 'aa', 'ch', 'chh', 'ch\'', 'ts', 'tr', 'h', 'i', 'ii', 'k', 'kh', 'k\'',
    'l', 'll', 'm', 'n', 'ñ', 'p', 'ph', 'p\'', 'q', 'qh', 'q\'', 'r', 's', 'sh',
    't', 'th', 't\'', 'u', 'uu', 'w', 'y',
]


def grafemas_no_en_alfabet(words):
    """Check that every alphabetic character starts a grapheme of ``alfabeto_quechua``.

    For each alphabetic character, the 1-, 2- and 3-character substrings
    starting at that position are lower-cased and looked up in the alphabet.
    Non-alphabetic characters (digits, punctuation, apostrophes) are skipped.

    Args:
        words: Iterable of word strings.

    Returns:
        True if every alphabetic position matches at least one grapheme,
        False as soon as one position matches none.
    """
    def starts_known_grapheme(word, position):
        # Near the end of the word the slice is simply shorter, which
        # repeats an already-failed lookup and therefore still fails.
        return any(
            word[position:position + size].lower() in alfabeto_quechua
            for size in (1, 2, 3)
        )

    return all(
        starts_known_grapheme(word, position)
        for word in words
        for position, char in enumerate(word)
        if char.isalpha()
    )

def oraciones_mucho_espaniol(words):
    """Return True when more than half of ``words`` are detected as Spanish.

    Note the polarity: unlike the other filters in this cell, which return
    False to reject a sentence, True here means the sentence is mostly Spanish.

    Args:
        words: List of tokens of one sentence.

    Returns:
        True if the share of tokens for which ``detect`` returns ``'es'`` is
        strictly greater than 0.5. An empty list returns False instead of
        raising ZeroDivisionError.
    """
    if not words:
        return False

    spanish_words = 0
    for word in words:
        try:
            if detect(word) == 'es':
                spanish_words += 1
        except Exception:
            # Tokens the detector cannot classify count as non-Spanish.
            continue
    return spanish_words / len(words) > 0.5

def oraciones_muy_cortas(words, min_length=3):
    """Return True when the sentence has more than ``min_length`` alphabetic tokens.

    Only tokens for which ``str.isalpha()`` is true are counted, so
    punctuation and tokens containing digits or apostrophes are ignored.

    Args:
        words: List of tokens of one sentence.
        min_length: Number of alphabetic tokens that must be exceeded.

    Returns:
        True if the count of alphabetic tokens is strictly greater than
        ``min_length``.
    """
    alphabetic_count = sum(1 for token in words if token.isalpha())
    return alphabetic_count > min_length

def oraciones_muy_repititivas(words, threshold=0.4):
    """Return True when the sentence is varied enough (not too repetitive).

    Args:
        words: List of tokens of one sentence.
        threshold: Minimum ratio of distinct tokens to total tokens.

    Returns:
        True if ``len(set(words)) / len(words) >= threshold``. An empty token
        list fails the filter (returns False) instead of raising
        ZeroDivisionError.
    """
    if not words:
        return False
    unique_ratio = len(set(words)) / len(words)
    return unique_ratio >= threshold

def palabras_muy_largas(words, threshold=40):
    """Return True when no token is longer than ``threshold`` characters.

    Args:
        words: Iterable of tokens of one sentence.
        threshold: Maximum allowed token length (inclusive).

    Returns:
        False if any token has more than ``threshold`` characters, True
        otherwise (including for an empty input).
    """
    return all(len(token) <= threshold for token in words)

def split_tokens(sentence):
    """Return False when the sentence contains a run of very short tokens.

    A run is three or more consecutive tokens of one or two word characters,
    each followed by a whitespace character.

    Args:
        sentence: Sentence text.

    Returns:
        True if no such run is found, False otherwise.
    """
    short_token_run = re.search(r"(\b\w{1,2}\b\s){3,}", sentence)
    return short_token_run is None

def oraciones_con_matematica(sentence):
    """Return False when the sentence contains a digit or one of ``+ - * /``.

    Note that a plain hyphen or slash is enough to reject the sentence.

    Args:
        sentence: Sentence text.

    Returns:
        True if none of those characters occur, False otherwise.
    """
    math_like = re.search(r"[\d+\-*/]+", sentence)
    return math_like is None

In [5]:
#a, aa, ch, chh, ch', ts, tr, h, i, ii, k, kh, k', l, ll, m, n, ñ, p, ph, p', q, qh, q', r, s, sh, t, th, t', u, uu, w, y
from nltk.tokenize import word_tokenize
# Reglas basadas en https://aclanthology.org/2020.lrec-1.356/
def rule_based_heuristic(sentence):
    """Apply every rule-based filter to ``sentence`` and report whether it passes all.

    The sentence is tokenized with ``word_tokenize``; token-level filters get
    the token list and text-level filters get the raw sentence. Evaluation
    short-circuits in the listed order, so later filters are not called once
    one has failed.

    Args:
        sentence: Sentence text.

    Returns:
        True only if all filters return True.
    """
    tokens = word_tokenize(sentence)
    return (
        oraciones_muy_cortas(tokens)
        and oraciones_muy_repititivas(tokens)
        and palabras_muy_largas(tokens)
        and split_tokens(sentence)
        and oraciones_con_matematica(sentence)
    )

In [33]:
def pipeline(documents):
    """Build the sentence corpus from every PDF listed in ``documents``.

    For each key the PDF text is extracted, split into sentences, filtered
    with ``rule_based_heuristic`` and ``filtrar_oracion_espaniol``, and
    de-duplicated. The per-document frames are concatenated once at the end
    and de-duplicated again across documents.

    NOTE(review): ``get_text_from_pdf`` reads page offsets from the
    module-level ``documents`` mapping, not from this parameter, so passing a
    different mapping here only changes which keys are iterated.

    Args:
        documents: Mapping whose keys are PDF base names.

    Returns:
        DataFrame with columns ``document`` and ``sentence``, one row per
        unique sentence. The index is not reset after the final
        de-duplication.
    """
    columns = ["document", "sentence"]
    frames = []
    for key in documents:
        text = get_text_from_pdf(key)
        sentences = sent_tokenize(text)
        # Sentence count before filtering
        print("Oraciones antes de filtrar en documento ", key, ": ", len(sentences))
        kept_sentences = [
            sentence
            for sentence in sentences
            if rule_based_heuristic(sentence) and filtrar_oracion_espaniol(sentence)
        ]

        document_df = (
            pd.DataFrame(kept_sentences, columns=["sentence"])
            .drop_duplicates(subset="sentence")  # remove duplicates within the document
            .assign(document=key)[columns]
        )
        print("Oraciones después de filtrar en documento ", key, ": ", len(document_df))
        # Empty frames are skipped: they add no rows, and concatenating
        # empty frames is deprecated in recent pandas versions.
        if not document_df.empty:
            frames.append(document_df)

    # One concat at the end instead of growing the frame inside the loop.
    if frames:
        corpus = pd.concat(frames, ignore_index=True)
    else:
        corpus = pd.DataFrame(columns=columns)
    print("Total de oraciones: ", len(corpus))
    corpus = corpus.drop_duplicates(subset="sentence")
    print("Total de oraciones únicas: ", len(corpus))
    return corpus

In [34]:
# Run extraction and filtering over every PDF listed in `documents`.
df_corpus=pipeline(documents)
# Preview the first rows of the resulting corpus.
df_corpus.head()

Oraciones antes de filtrar en documento  Kasarakuy raymimanta :  1115
Oraciones después de filtrar en documento  Kasarakuy raymimanta :  462
Oraciones antes de filtrar en documento  Alelipa munaqusqan waqaychanankuna. Historias y Relatos 5 - Inicial - Quechua Collao :  26
Oraciones después de filtrar en documento  Alelipa munaqusqan waqaychanankuna. Historias y Relatos 5 - Inicial - Quechua Collao :  20
Oraciones antes de filtrar en documento  Ayllunchikpa willakuyninkuna. Historias y relatos 2 - Inicial - Quechua Collao :  58
Oraciones después de filtrar en documento  Ayllunchikpa willakuyninkuna. Historias y relatos 2 - Inicial - Quechua Collao :  47
Oraciones antes de filtrar en documento  Liqichumanta. Historias y relatos 1 - Inicial - Quechua Collao :  14
Oraciones después de filtrar en documento  Liqichumanta. Historias y relatos 1 - Inicial - Quechua Collao :  10
Oraciones antes de filtrar en documento  Muhu papa rikch’arichiymanta. Historias y relatos 3 - Inicial - Quechua Coll

Unnamed: 0,document,sentence
0,Kasarakuy raymimanta,Kasarakuy runachakuymanta Kasarakuy runachaku...
1,Kasarakuy raymimanta,Imaraykuchus kawsaynin ukhupi uywakuna chharq...
2,Kasarakuy raymimanta,Uywasqam uywakuna imayna mirasqanta ima allin...
3,Kasarakuy raymimanta,"Chaymanhina, ayllu irqikunap yuyayninpiqa, im..."
4,Kasarakuy raymimanta,"Chaymantapas ayllu irqiqa munasqantam ruran, i..."


In [35]:
# Show the sentences that are shorter than 40 characters.
df_corpus.loc[df_corpus["sentence"].map(len) < 40]

Unnamed: 0,document,sentence
17,Kasarakuy raymimanta,Sipaspa uywan michinan patapi.
36,Kasarakuy raymimanta,Sumaqta tapurispa makiwan chaskichin.
45,Kasarakuy raymimanta,Huk chakrayuq runakuna qhawananpaq.
53,Kasarakuy raymimanta,Chaypi tuta paqariqta tusumunku.
86,Kasarakuy raymimanta,Iskay ñiqin watukuq puriy.
...,...,...
4840,5 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Tex...,¿Ch’askamanta takiy ima nichkantaq?
4877,5 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Tex...,Chayta qillqana mayt’upi qillqaykunki.
4883,5 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Tex...,T’iktu: Dibujo Usnu: Altar.
4885,5 Rimana - Qillqasqa Mayt’u Qichwa Qullaw. Tex...,Chumpillikuy: Envuelve con faja.


In [42]:
# Write the corpus to CSV without the DataFrame index column.
df_corpus.to_csv("data/corpus/avanceHarvy.csv", index=False)

# Intento fallido con pypdf

In [9]:
from pypdf import PdfReader, PdfWriter

In [10]:
#Las páginas iniciales y finales de los documentos que se quieren eliminar
# NOTE(review): this rebinds the `documents` name defined in the first cell;
# re-running pipeline(documents) after this cell would only process this PDF.
# Values are (first_page, trailing_pages) used by the slicing loop below.
documents={
    "Kasarakuy raymimanta": (6, 7)
}

In [11]:

for key in documents:
    reader = PdfReader(f'data/{key}.pdf')
    number_of_pages = len(reader.pages)
    print(f'Number of pages: {number_of_pages}')

    # Drop the leading and trailing pages.
    # The end index is computed explicitly: with a negative slice bound, a
    # trailing count of 0 would become `-0` and select no pages at all.
    # `max(..., 0)` keeps the result empty when more pages are dropped than exist.
    writer = PdfWriter()

    for page in reader.pages[documents[key][0]:max(number_of_pages - documents[key][1], 0)]:
        writer.add_page(page)


Number of pages: 92


In [12]:
# Take the first page held by the writer left over from the loop above.
page=writer.pages[0]

In [13]:
def visitor_body(text, cm, tm, font_dict, font_size):
    """Print a text fragment followed by its font size, which is printed twice.

    NOTE(review): presumably meant as a pypdf text-extraction visitor
    callback; it is not passed to ``extract_text`` in the visible cells.

    Args:
        text: Text fragment to print.
        cm: Unused.
        tm: Unused.
        font_dict: Unused.
        font_size: Font size printed after the text.
    """
    print(f'{text} {font_size} {font_size}')

In [14]:
# Extract the text of the selected page; the cell displays the returned string.
page.extract_text()

'Kasarakuy runachakuymanta \nmunanakuy willakuy\nAyllu irqikunaqa huch’uyninmanta rikunku imaynatas kay pachapi tukuy \nuywapas masachakunku chayta. Imaraykuchus kawsaynin ukhupi uywakuna \nchharqukusqanta, wachasqanta ima rikunku. Uywasqam uywakuna imayna \nmirasqanta ima allinta yachanku. Chaymanhina, ayllu irqikunap yuyayninpiqa, \nimaynatam warmi qharipas masachakun chaytapas yachankum.\n5\nKasarakuy runachakuymanta \nKasarakuy runachakuymanta \nAprendizajes del buen vivir para el matrimonio.\nLos niños y niñas en los andes, en su vida diaria y desde pequeños, ven cómo los animales que crian \nen la familia nacen o se reproducen. Igualmente, saben cómo un hombre y una mujer joven llegan al \nmatrimonio.\n'