## [Extract Text](https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517)

## [Language Detect](https://github.com/zafercavdar/fasttext-langdetect) 

In [1]:
from io import StringIO

import os
import re
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure, LAParams
from ftlangdetect import detect
import pycld2 as cld2

In [2]:
def collect_clear_text(text_list):
    """Merge text fragments into a single whitespace-normalised string.

    Each fragment is trimmed and every run of whitespace inside it is
    collapsed to one space; the cleaned fragments are then joined with a
    single space and the result is trimmed once more.

    Args:
        text_list: Iterable of strings (e.g. the text blocks of one page).

    Returns:
        The joined, cleaned text. An empty input yields an empty string.
    """
    def _squash(fragment):
        # Collapse any whitespace run (newlines, tabs, ...) to one space.
        trimmed = fragment.strip()
        return re.sub(r'\s+', ' ', trimmed).strip()

    cleaned_parts = []
    for fragment in text_list:
        cleaned_parts.append(_squash(fragment))

    joined = ' '.join(cleaned_parts)
    return joined.strip()

def contents_references_rmv(text_per_page: dict, pages=30, char_num=1500, years_num=6):
    """Drop front-matter and reference pages from a ``Page_<n>`` -> text mapping.

    Two windows of the document are inspected:

    * Head window: page numbers ``0 .. pages-1``. A page is removed when it
      has fewer than ``char_num`` characters, or when it is longer than
      ``char_num`` and contains many filler characters (more than 40 ``.``
      or more than 10 of ``…``, ``_`` or ``-``), which is typical of a
      table of contents.
    * Tail window: the last ``pages`` page numbers that are not already in
      the head window. A page is removed when it has fewer than
      ``char_num`` characters, or when more than ``years_num`` distinct
      years between 1900 and 2024 occur in it (typical of a bibliography).

    Page numbers that have no entry in ``text_per_page`` are skipped in
    both windows.

    Args:
        text_per_page: Mapping with keys of the form ``"Page_<n>"``. It is
            modified in place.
        pages: Size of the head and tail windows, in pages.
        char_num: Character-count threshold used by both windows.
        years_num: Maximum number of distinct years tolerated on a tail page.

    Returns:
        The same ``text_per_page`` object, with the rejected pages deleted.
    """
    # Built once instead of on every page check.
    year_tokens = [str(year) for year in range(1900, 2025)]

    def is_too_short(text):
        return len(text) < char_num

    def looks_like_contents(text):
        if len(text) <= char_num:
            return False
        return (
            text.count(".") > 40
            or text.count("…") > 10
            or text.count("_") > 10
            or text.count("-") > 10
        )

    def looks_like_references(text):
        # Counts distinct years present, not the number of occurrences.
        distinct_years = sum(1 for token in year_tokens if token in text)
        return distinct_years > years_num

    def prune(page_numbers, should_drop):
        for page_num in page_numbers:
            page_key = f'Page_{page_num}'
            if page_key not in text_per_page:
                continue
            if should_drop(text_per_page[page_key]):
                del text_per_page[page_key]

    # Snapshot the size before deleting anything: the tail window is defined
    # relative to the original page count.
    total_pages = len(text_per_page)

    prune(
        range(0, pages),
        lambda text: is_too_short(text) or looks_like_contents(text),
    )
    # Page numbers below `pages` belong to the head window only, so the tail
    # window starts at whichever is larger: `pages` or `total_pages - pages`.
    prune(
        range(max(total_pages - pages, pages), total_pages),
        lambda text: is_too_short(text) or looks_like_references(text),
    )
    return text_per_page
    
def remove_eng(text_per_page: dict, *, min_percent=91):
    """Keep only the pages that pycld2 identifies as predominantly Turkish.

    For every page the text is passed to ``cld2.detect``. The page is
    deleted when detection raises, when the top-ranked language is not
    Turkish (``'tr'``), or when Turkish covers less than ``min_percent``
    percent of the page.

    The third field of each pycld2 detail tuple is an integer percentage
    in the range 0-100, so the threshold is expressed on that scale. (The
    earlier comparison against ``0.91`` could only ever reject pages that
    reported 0 %.)

    Args:
        text_per_page: Mapping of page key -> page text. Modified in place.
        min_percent: Minimum share of the page (0-100) that must be
            detected as Turkish for the page to be kept.

    Returns:
        The same ``text_per_page`` object, with rejected pages deleted.
    """
    # Iterate over a snapshot of the keys because entries are deleted
    # while looping.
    for page_key in list(text_per_page):
        try:
            _, _, details = cld2.detect(text_per_page[page_key])
        except Exception:
            # Best effort: text the detector cannot process is treated as
            # unusable and dropped rather than aborting the whole document.
            del text_per_page[page_key]
            continue

        # details[0] describes the top-ranked language as
        # (name, code, percent, score).
        top_language = details[0]
        lang_code = top_language[1]
        percent = top_language[2]

        if lang_code != 'tr' or percent < min_percent:
            del text_per_page[page_key]
    return text_per_page

In [3]:
# Batch-convert every PDF in PKF_PDF to a cleaned text file in PKF_TXT.
#
# Per PDF: extract the text of each page (text blocks ordered top to bottom),
# strip contents/reference pages and non-Turkish pages, write the surviving
# pages one per line, then move the PDF to PKF_PDF_fin with a "finished_"
# prefix.
PDF_DIR = "PKF_PDF"
TXT_DIR = "PKF_TXT"
DONE_DIR = "PKF_PDF_fin"

# Make sure the output locations exist before any work is done.
os.makedirs(TXT_DIR, exist_ok=True)
os.makedirs(DONE_DIR, exist_ok=True)

layout_params = LAParams(line_margin=2)

pdf_listDir = os.listdir(PDF_DIR)
for pdf in pdf_listDir:
    pdf_path = f'{PDF_DIR}/{pdf}'
    # Skip sub-directories and other non-file entries: they cannot be
    # parsed, and os.remove() below would fail on a directory.
    if not os.path.isfile(pdf_path):
        continue

    text_per_page = {}
    try:
        for pagenum, page in enumerate(extract_pages(pdf_path, laparams=layout_params)):
            # Order layout elements from the top of the page to the bottom
            # (larger y1 first).
            elements = sorted(page, key=lambda element: element.y1, reverse=True)
            page_text = [
                element.get_text()
                for element in elements
                if isinstance(element, LTTextContainer)
            ]
            text_per_page[f'Page_{pagenum}'] = collect_clear_text(page_text)
    except (TypeError, ValueError) as e:
        print(f"Error processing {pdf}: {e}")
        # NOTE(review): the source PDF is deleted on failure. This is kept
        # from the original workflow but is destructive - confirm intended.
        os.remove(pdf_path)
        continue
    except Exception as e:
        print(f"Error {e}")
        # NOTE(review): any other error also deletes the source PDF,
        # including transient ones - confirm this is intended.
        os.remove(pdf_path)
        continue

    rawTotal = len(text_per_page)
    text_per_page = contents_references_rmv(text_per_page=text_per_page)
    text_per_page = remove_eng(text_per_page=text_per_page)
    clearedTotal = len(text_per_page)

    if clearedTotal < 10:
        print("Bozuk pdf:", pdf)
    print("PDF:", pdf, "Raw_Pages:", rawTotal, "Cleared_Total:", clearedTotal)

    # splitext keeps dots inside the file name ("a.b.pdf" -> "a.b"), so two
    # different PDFs cannot collapse onto the same output file.
    stem = os.path.splitext(pdf)[0]
    with open(f"{TXT_DIR}/{stem}.txt", 'w', encoding="utf-8") as txt:
        for text in text_per_page.values():
            txt.write(text)
            txt.write('\n')

    os.rename(pdf_path, f"{DONE_DIR}/finished_{pdf}")

Bozuk pdf: 1563280481.pdf
PDF: 1563280481.pdf Raw_Pages: 2 Cleared_Total: 2
Bozuk pdf: 1563281963.pdf
PDF: 1563281963.pdf Raw_Pages: 4 Cleared_Total: 0
Bozuk pdf: 1563282162.pdf
PDF: 1563282162.pdf Raw_Pages: 7 Cleared_Total: 1
Bozuk pdf: 1563282201.pdf
PDF: 1563282201.pdf Raw_Pages: 2 Cleared_Total: 2
Bozuk pdf: 1563283693.pdf
PDF: 1563283693.pdf Raw_Pages: 7 Cleared_Total: 1
Bozuk pdf: 1563283913.pdf
PDF: 1563283913.pdf Raw_Pages: 2 Cleared_Total: 2
Bozuk pdf: 1563284102.pdf
PDF: 1563284102.pdf Raw_Pages: 5 Cleared_Total: 1
Bozuk pdf: 1563284844.pdf
PDF: 1563284844.pdf Raw_Pages: 2 Cleared_Total: 2
Bozuk pdf: 1563285088.pdf
PDF: 1563285088.pdf Raw_Pages: 5 Cleared_Total: 2
Bozuk pdf: 1563287974.pdf
PDF: 1563287974.pdf Raw_Pages: 3 Cleared_Total: 2
Bozuk pdf: 1563288049.pdf
PDF: 1563288049.pdf Raw_Pages: 6 Cleared_Total: 1
Bozuk pdf: 1566308657.pdf
PDF: 1566308657.pdf Raw_Pages: 2 Cleared_Total: 2
Bozuk pdf: 1566309162.pdf
PDF: 1566309162.pdf Raw_Pages: 6 Cleared_Total: 2
Bozuk pdf: 1