# Intento con Pdfminer.six

In [33]:
from pdfminer.high_level import extract_text
from pypdf import PdfReader, PdfWriter
import fitz
import pandas as pd

# Maps each PDF name (the file stem under data/) to a pair:
# (index of the first page to read, number of pages to drop at the end).
documents = {"Kasarakuy raymimanta": (6, 7)}

In [32]:
import re


def get_text_from_pdf(key):
    """Return the text of ``data/<key>.pdf`` as a single flat string.

    The page range comes from the module-level ``documents`` mapping:
    ``documents[key][0]`` is the index of the first page to read and
    ``documents[key][1]`` is the number of pages dropped at the end.

    For every page read, newlines are replaced by spaces and every run of
    digits is removed; each page's text is followed by one space.

    Args:
        key: Name of the PDF (without extension) and key into ``documents``.

    Returns:
        The concatenated, cleaned text of the selected pages.
    """
    page_texts = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(f'data/{key}.pdf') as doc:
        first_page = documents[key][0]
        stop_page = doc.page_count - documents[key][1]
        for page in doc.pages(first_page, stop_page):
            flat_text = page.get_text("text", sort=True).replace("\n", " ")
            # Remove all digit sequences from the page text.
            page_texts.append(re.sub(r'\d+', '', flat_text))
    # Keep the original layout: every page is followed by a single space.
    return "".join(text + " " for text in page_texts)

In [44]:
from nltk.tokenize import sent_tokenize
from langdetect import detect
import langid

def filtrar_espaniol(full_text):
    """Split ``full_text`` into sentences and drop the Spanish-looking ones.

    A sentence is kept only when neither ``langdetect.detect`` nor
    ``langid.classify`` labels it as ``"es"``, ``"it"`` or ``"ca"``.
    Sentences on which a detector fails are dropped as well.

    The sentence counts before and after filtering are printed.

    Args:
        full_text: Text to split with ``sent_tokenize`` and filter.

    Returns:
        List of the sentences that passed the language filter, in order.
    """
    sentences = sent_tokenize(full_text)

    # Sentence count before filtering
    print("Oraciones antes de filtrar: ", len(sentences))

    rejected_codes = {"es", "it", "ca"}
    kept_sentences = []
    for sentence in sentences:
        try:
            guesses = {detect(sentence), langid.classify(sentence)[0]}
        except Exception:
            # Best-effort: a sentence the detectors cannot handle is skipped.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            continue
        if rejected_codes.isdisjoint(guesses):
            kept_sentences.append(sentence)

    print("Oraciones después de filtrar: ", len(kept_sentences))

    return kept_sentences

In [24]:
#Funciones de filtrado
# Graphemes (one to three characters long) accepted by the alphabet check.
alfabeto_quechua = [
    "a", "aa", "ch", "chh", "ch'", "ts", "tr", "h", "i", "ii",
    "k", "kh", "k'", "l", "ll", "m", "n", "ñ", "p", "ph", "p'",
    "q", "qh", "q'", "r", "s", "sh", "t", "th", "t'", "u", "uu",
    "w", "y",
]


def grafemas_no_en_alfabet(words):
    """Return True when every letter of every word fits ``alfabeto_quechua``.

    Non-alphabetic characters are ignored. A letter that is not itself in the
    alphabet is accepted only if it starts a two- or three-character grapheme
    from the alphabet at that position (compared case-insensitively).

    Note: despite the name, True means the words passed the check.

    Args:
        words: Iterable of word strings.

    Returns:
        False as soon as one letter cannot be matched, True otherwise.
    """
    for word in words:
        word_length = len(word)
        for position, char in enumerate(word):
            if not char.isalpha() or char.lower() in alfabeto_quechua:
                continue
            # Try the digraph, then the trigraph, starting at this position.
            matched = False
            for width in (2, 3):
                if position + width > word_length:
                    # Not enough characters left to build a longer grapheme.
                    break
                if word[position:position + width].lower() in alfabeto_quechua:
                    matched = True
                    break
            if not matched:
                return False
    return True

def oraciones_mucho_espaniol(words):
    """Return True when more than half of ``words`` are detected as Spanish.

    Each word is classified on its own with ``langdetect.detect``; words the
    detector cannot handle count as non-Spanish.

    Note: unlike the other filters in this file, True here means the
    sentence is mostly Spanish.

    Args:
        words: List of word tokens of one sentence.

    Returns:
        True if the share of words labelled ``'es'`` is above 0.5. An empty
        list returns False instead of raising ZeroDivisionError.
    """
    if not words:
        return False

    spanish_words = 0
    for word in words:
        try:
            if detect(word) == 'es':
                spanish_words += 1
        except Exception:
            # Best-effort: undetectable tokens are simply not counted.
            # Exception (not a bare except) lets Ctrl-C through.
            continue
    return spanish_words / len(words) > 0.5

def oraciones_muy_cortas(words, min_length=3):
    """Return True when the sentence has more than ``min_length`` tokens.

    Note: despite the name, True means the sentence is long enough.
    """
    token_count = len(words)
    return min_length < token_count

def oraciones_muy_repititivas(words, threshold=0.4):
    """Return True when the sentence is varied enough (not too repetitive).

    The ratio of distinct tokens to total tokens must be at least
    ``threshold``.

    Note: despite the name, True means the sentence passed the check.

    Args:
        words: List of tokens of one sentence.
        threshold: Minimum accepted ratio of unique tokens to all tokens.

    Returns:
        True if ``len(set(words)) / len(words) >= threshold``. An empty list
        returns False instead of raising ZeroDivisionError.
    """
    if not words:
        # No tokens: the ratio is undefined, so the sentence is rejected.
        return False
    unique_ratio = len(set(words)) / len(words)
    return unique_ratio >= threshold

def palabras_muy_largas(words, threshold=40):
    """Return True when no token is longer than ``threshold`` characters.

    Note: despite the name, True means the sentence passed the check.
    """
    return all(len(token) <= threshold for token in words)

def split_tokens(sentence):
    """Return False when the sentence looks like it was split into fragments.

    A sentence is rejected if it contains three or more consecutive tokens of
    one or two word characters, each followed by a whitespace character.
    """
    fragment_run = re.search(r"(\b\w{1,2}\b\s){3,}", sentence)
    return fragment_run is None

def oraciones_con_matematica(sentence):
    """Return False when the sentence contains a digit or one of ``+ - * /``.

    Note: despite the name, True means no such character was found.
    """
    return not re.search(r"[\d+\-*/]+", sentence)

In [45]:
#a, aa, ch, chh, ch', ts, tr, h, i, ii, k, kh, k', l, ll, m, n, ñ, p, ph, p', q, qh, q', r, s, sh, t, th, t', u, uu, w, y
from nltk.tokenize import word_tokenize

def rule_based_heuristic(sentence):
    """Return True when ``sentence`` passes every rule-based filter.

    The sentence is tokenized with ``word_tokenize``; the token-level checks
    (length, repetitiveness, word length) run on the tokens and the
    regex-based checks (fragmented tokens, math characters) run on the raw
    string. Evaluation stops at the first failing check.
    """
    tokens = word_tokenize(sentence)

    return (
        oraciones_muy_cortas(tokens)
        and oraciones_muy_repititivas(tokens)
        and palabras_muy_largas(tokens)
        and split_tokens(sentence)
        and oraciones_con_matematica(sentence)
    )

In [46]:
def pipeline(documents):
    """Build the sentence corpus for every document in ``documents``.

    For each key the PDF text is extracted with ``get_text_from_pdf``,
    filtered by language with ``filtrar_espaniol`` and then by
    ``rule_based_heuristic``.

    Args:
        documents: Mapping whose keys are document names; only the keys are
            used here.

    Returns:
        A DataFrame with columns ``document`` and ``sentence``, one row per
        accepted sentence, indexed 0..n-1. It is empty (but has both columns)
        when nothing was accepted.
    """
    # Collect plain rows and build the frame once: concatenating inside the
    # loop re-copies the corpus each time and starts from an empty frame,
    # which newer pandas versions warn about.
    rows = []
    for key in documents:
        text = get_text_from_pdf(key)
        for sentence in filtrar_espaniol(text):
            if rule_based_heuristic(sentence):
                rows.append({"document": key, "sentence": sentence})
    return pd.DataFrame(rows, columns=["document", "sentence"])

In [47]:
# Run the whole pipeline over the configured documents and preview the result.
df_corpus=pipeline(documents)
df_corpus.head()

Oraciones antes de filtrar:  1115
Oraciones después de filtrar:  509


Unnamed: 0,document,sentence
0,Kasarakuy raymimanta,Kasarakuy runachakuymanta Kasarakuy runachaku...
1,Kasarakuy raymimanta,Imaraykuchus kawsaynin ukhupi uywakuna chharq...
2,Kasarakuy raymimanta,Uywasqam uywakuna imayna mirasqanta ima allin...
3,Kasarakuy raymimanta,"Chaymanhina, ayllu irqikunap yuyayninpiqa, im..."
4,Kasarakuy raymimanta,"Chaymantapas ayllu irqiqa munasqantam ruran, i..."


In [49]:
# Check how word_tokenize handles a word containing the typographic apostrophe (’).
word_tokenize("Much’anakuy.")

['Much', '’', 'anakuy', '.']

In [48]:
# Show the shortest sentences (fewer than 40 characters).
df_corpus.loc[df_corpus["sentence"].apply(len) < 40]

Unnamed: 0,document,sentence
17,Kasarakuy raymimanta,Sipaspa uywan michinan patapi.
27,Kasarakuy raymimanta,Sipaspa uywan michinan patapi.
37,Kasarakuy raymimanta,Sumaqta tapurispa makiwan chaskichin.
46,Kasarakuy raymimanta,Huk chakrayuq runakuna qhawananpaq.
54,Kasarakuy raymimanta,Chaypi tuta paqariqta tusumunku.
60,Kasarakuy raymimanta,Much’anakuy.
88,Kasarakuy raymimanta,Iskay ñiqin watukuq puriy.
93,Kasarakuy raymimanta,Sipastaq kuchichikun: —Arí taytáy.
100,Kasarakuy raymimanta,Warmi hurquy p’unchaw chayamuptin.
107,Kasarakuy raymimanta,—Qué bueno que me visites.


# Intento fallido con pypdf

In [None]:
from pypdf import PdfReader, PdfWriter

In [None]:
# Leading and trailing pages of each document that should be removed
documents = {"Kasarakuy raymimanta": (6, 7)}

In [None]:

for key in documents:
    reader = PdfReader(f'data/{key}.pdf')
    number_of_pages = len(reader.pages)
    print(f'Number of pages: {number_of_pages}')

    # Drop the leading and trailing pages
    skip_front = documents[key][0]
    skip_back = documents[key][1]
    # Use an explicit stop index: a negative slice bound of -0 equals 0 and
    # would select no pages at all when nothing is trimmed from the end.
    # Clamping keeps the range empty if the trims exceed the page count.
    stop = max(number_of_pages - skip_back, skip_front)

    # NOTE: a new writer is created per document, so after the loop `writer`
    # holds only the pages of the last document processed.
    writer = PdfWriter()
    for page in reader.pages[skip_front:stop]:
        writer.add_page(page)


In [None]:
# Take the first page held by the writer for inspection.
page=writer.pages[0]

In [None]:
def visitor_body(text, cm, tm, font_dict, font_size):
    """Print ``text`` followed by its font size, which is shown twice.

    ``cm``, ``tm`` and ``font_dict`` are accepted but not used.
    NOTE(review): the signature looks like a pypdf text-visitor callback,
    but nothing in the visible code passes it to ``extract_text`` — confirm.
    """
    curr_font_size = font_size
    message = f'{text} {curr_font_size} {font_size}'
    print(message)

In [None]:
# Extract the text of the selected page (called without any visitor callback).
page.extract_text()