In [1]:
from io import StringIO

import os
import re
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure, LAParams
from ftlangdetect import detect

In [2]:
def collect_clear_text(text_list):
    """Merge raw text fragments into one single-spaced string.

    Every fragment is trimmed and each run of whitespace inside it (spaces,
    tabs, newlines) is collapsed to one space. The cleaned fragments are then
    joined with a single space and the result is trimmed once more.

    Args:
        text_list: Iterable of strings to clean and join.

    Returns:
        The joined, whitespace-normalised string.
    """
    whitespace_run = re.compile(r'\s+')
    cleaned_fragments = []
    for fragment in text_list:
        collapsed = whitespace_run.sub(' ', fragment.strip())
        cleaned_fragments.append(collapsed.strip())
    merged = ' '.join(cleaned_fragments)
    return merged.strip()

def contents_references_rmv(text_per_page: dict, pages=30, char_num=100, years_num=10):
    """Drop front-matter and bibliography-like pages near the document edges.

    Only pages close to the start or the end of the document are inspected:

    * each of the first ``pages`` pages is dropped when it is shorter than
      ``char_num`` characters or looks like a table of contents (more than
      40 dots, or more than 10 ellipses, underscores or hyphens);
    * each of the last ``pages`` pages is dropped when it is shorter than
      ``char_num`` characters or contains more than ``years_num`` distinct
      years from 1900 to 2024 (a year counts when its digits appear anywhere
      in the text).

    Both windows are clamped to the document length. In a document shorter
    than ``2 * pages`` the windows overlap, and a page inside the overlap is
    subject to both checks.

    Args:
        text_per_page: Mapping from ``"Page_<n>"`` (0-based) to page text.
            It is modified in place.
        pages: Size of the leading and of the trailing inspection window.
        char_num: Minimum length a page needs in order to be kept.
        years_num: Maximum number of distinct years tolerated on a trailing
            page.

    Returns:
        The same ``text_per_page`` object with the rejected pages deleted.
    """

    def less_char_drop(text):
        return len(text) < char_num

    def dot_drop(text):
        return len(text) > char_num and (
            text.count(".") > 40
            or text.count("…") > 10
            or text.count("_") > 10
            or text.count("-") > 10
        )

    def referencesYear_drop(text):
        year_count = sum(1 for year in range(1900, 2025) if str(year) in text)
        return year_count > years_num

    total_pages = len(text_per_page)
    # Clamp both windows so short documents never yield negative or
    # out-of-range page numbers.
    head_window = range(0, min(pages, total_pages))
    tail_window = range(max(total_pages - pages, 0), total_pages)

    for page_num in head_window:
        page_word = f'Page_{page_num}'
        if page_word not in text_per_page:
            continue
        page_content = text_per_page[page_word]
        if less_char_drop(page_content) or dot_drop(page_content):
            del text_per_page[page_word]

    for page_num in tail_window:
        page_word = f'Page_{page_num}'
        # The page may already be gone (head pass) or absent from the mapping.
        if page_word not in text_per_page:
            continue
        page_content = text_per_page[page_word]
        if less_char_drop(page_content) or referencesYear_drop(page_content):
            del text_per_page[page_word]

    return text_per_page

def remove_eng(text_per_page: dict, target_lang: str = "tr", min_score: float = 0.75):
    """Keep only the pages confidently detected as ``target_lang``.

    Despite the name, every page whose detected language differs from
    ``target_lang`` is removed, not only English ones. Pages detected as
    ``target_lang`` with a score below ``min_score`` are removed as well.

    Detection is delegated to the ``detect`` function imported at the top of
    the file; its result is read through the ``"lang"`` and ``"score"`` keys.

    Args:
        text_per_page: Mapping from page key to page text. Modified in place.
        target_lang: Language code that pages must be detected as to be kept.
        min_score: Minimum detection score required to keep a page.

    Returns:
        The same ``text_per_page`` object with the rejected pages deleted.
    """
    # Iterate over a snapshot of the keys because entries are deleted on the fly.
    for page_key in list(text_per_page):
        detection = detect(text_per_page[page_key])
        #print(page_key, detection)
        if detection["lang"] != target_lang or detection["score"] < min_score:
            del text_per_page[page_key]
    return text_per_page

def remove_pageNum(text: str):
    """Strip a leading and/or trailing page number from a page's text.

    The text is split on single spaces. If the first token is numeric it is
    dropped, and likewise for the last token. Numbers elsewhere in the text
    are left alone.

    Args:
        text: Text of a single page.

    Returns:
        The text without the leading/trailing numeric token, trimmed. An
        empty string is returned when nothing but page numbers was present.
    """
    tokens = text.strip(" ").split(" ")
    # Remove by position, not by value: list.remove() deletes the first equal
    # token, which can be a number inside the body rather than the page number.
    if tokens and tokens[0].strip().isnumeric():
        tokens.pop(0)
    # Guard against the list having been emptied by the removal above.
    if tokens and tokens[-1].strip().isnumeric():
        tokens.pop()

    return " ".join(tokens).strip()


In [3]:
# Convert every PDF under PDFs/ into a cleaned text file (one page per line)
# under txt_folder/, then move the processed PDF to PDFs_fin/ so that a
# re-run only picks up files that have not been handled yet.
os.makedirs("txt_folder", exist_ok=True)
os.makedirs("PDFs_fin", exist_ok=True)

pdf_listDir = sorted(os.listdir("PDFs"))
for pdf in pdf_listDir:
    pdf_stem, pdf_ext = os.path.splitext(pdf)
    # Skip directory entries that are not PDF files instead of handing them
    # to the PDF parser.
    if pdf_ext.lower() != ".pdf":
        continue
    print(pdf)
    text_per_page = {}
    pdf_path = f'PDFs/{pdf}'
    for pagenum, page in enumerate(extract_pages(pdf_path, laparams = LAParams(line_margin = 2))):
        # Visit layout elements by descending y1 so text is collected from
        # the top of the page down (assumes pdfminer's bottom-left origin).
        page_elements = sorted(page, key=lambda element: element.y1, reverse=True)
        page_text = [
            element.get_text()
            for element in page_elements
            if isinstance(element, LTTextContainer)
        ]
        dctkey = 'Page_'+str(pagenum)
        text_per_page[dctkey]= collect_clear_text(page_text)
    rawTotal = len(text_per_page)
    text_per_page = contents_references_rmv(text_per_page=text_per_page)
    text_per_page = remove_eng(text_per_page=text_per_page)

    clearedTotal = len(text_per_page)
    if clearedTotal < 2:
        # "Bozuk pdf" is Turkish for "broken PDF": almost nothing survived cleaning.
        print("Bozuk pdf:", pdf)
    print("PDF:", pdf, "Raw_Pages:", rawTotal, "Cleared_Total:", clearedTotal)
    # Use the real file stem rather than assuming a four-character extension.
    with open(f"txt_folder/{pdf_stem}.txt" , 'w', encoding="utf-8") as txt:
        for text in text_per_page.values():
            txt.write(remove_pageNum(text))
            txt.write('\n')
    os.rename( f"PDFs/{pdf}", f"PDFs_fin/finished_{pdf}")

02_12_2019_103529.pdf




PDF: 02_12_2019_103529.pdf Raw_Pages: 466 Cleared_Total: 428
15_03_2019_015742.pdf
PDF: 15_03_2019_015742.pdf Raw_Pages: 522 Cleared_Total: 489
15_05_2018_114237.pdf
PDF: 15_05_2018_114237.pdf Raw_Pages: 380 Cleared_Total: 350
15_05_2018_114253.pdf
PDF: 15_05_2018_114253.pdf Raw_Pages: 418 Cleared_Total: 397
2022-07-06-02-36-7094347.pdf
PDF: 2022-07-06-02-36-7094347.pdf Raw_Pages: 315 Cleared_Total: 292
2022-07-06-02-44-9821744.pdf
PDF: 2022-07-06-02-44-9821744.pdf Raw_Pages: 251 Cleared_Total: 228
2023-02-22-01-07-1602301.pdf
PDF: 2023-02-22-01-07-1602301.pdf Raw_Pages: 310 Cleared_Total: 284
20_05_2019_042220.pdf
PDF: 20_05_2019_042220.pdf Raw_Pages: 490 Cleared_Total: 459
20_05_2019_042238.pdf
PDF: 20_05_2019_042238.pdf Raw_Pages: 594 Cleared_Total: 550
20_05_2019_042254.pdf
PDF: 20_05_2019_042254.pdf Raw_Pages: 442 Cleared_Total: 416
20_05_2019_042307.pdf
PDF: 20_05_2019_042307.pdf Raw_Pages: 564 Cleared_Total: 516
20_05_2019_042326.pdf
PDF: 20_05_2019_042326.pdf Raw_Pages: 578 Cle