## [Extract Text](https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517)

## [Language Detect](https://github.com/zafercavdar/fasttext-langdetect) 

In [1]:
from io import StringIO

import os
import re
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure, LAParams
from ftlangdetect import detect

In [2]:
def collect_clear_text(text_list):
    """Merge text fragments into one whitespace-normalised string.

    Each fragment is stripped, every run of whitespace inside it is
    collapsed to a single space, and the cleaned fragments are joined with
    a single space. The joined result is stripped once more before it is
    returned.

    Fragments that are empty after cleaning are still joined, so they
    leave consecutive spaces in the interior of the result.

    Args:
        text_list: Iterable of strings (e.g. the text blocks of one page).

    Returns:
        The combined, cleaned string; ``""`` for an empty iterable.
    """
    whitespace_run = re.compile(r'\s+')
    cleaned_fragments = []
    for fragment in text_list:
        collapsed = whitespace_run.sub(' ', fragment.strip())
        cleaned_fragments.append(collapsed.strip())
    return ' '.join(cleaned_fragments).strip()

def contents_references_rmv(text_per_page: dict, pages=30, char_num=1500, years_num=6):
    """Drop front-matter and reference-like pages from a page -> text mapping.

    Only the first ``pages`` and the last ``pages`` pages are inspected;
    pages in between are never touched.

    * A page in either window is dropped when its text is shorter than
      ``char_num`` characters.
    * A page in the leading window is also dropped when it is longer than
      ``char_num`` and contains more than 40 ``.``, or more than 10 of
      ``…``, ``_`` or ``-`` (dot leaders, as in a table of contents).
    * A page in the trailing window is also dropped when more than
      ``years_num`` distinct years from 1900 to 2024 occur in it (as in a
      bibliography).

    When the document has fewer than ``2 * pages`` pages the two windows
    overlap. A page that lies in both windows is tested against both
    rules, so short documents still get their reference pages filtered.

    Args:
        text_per_page: Mapping with keys ``'Page_<n>'`` (``n`` counted from
            0) and the page text as values. It is modified in place.
        pages: Size of the leading and of the trailing window.
        char_num: Minimum number of characters a page needs to be kept.
        years_num: Largest number of distinct years a trailing page may
            mention and still be kept.

    Returns:
        The same ``text_per_page`` object, with the dropped pages removed.
    """
    # Built once per call instead of once per inspected page.
    year_tokens = [str(year) for year in range(1900, 2025)]

    def too_short(text):
        return len(text) < char_num

    def looks_like_contents(text):
        if len(text) <= char_num:
            return False
        return (
            text.count(".") > 40
            or text.count("…") > 10
            or text.count("_") > 10
            or text.count("-") > 10
        )

    def looks_like_references(text):
        distinct_years = sum(1 for token in year_tokens if token in text)
        return distinct_years > years_num

    total_pages = len(text_per_page)
    # Clamp both windows to real page indices so that short documents do not
    # produce negative or out-of-range page numbers.
    leading = range(0, min(pages, total_pages))
    trailing = range(max(total_pages - pages, 0), total_pages)

    for page_num in sorted(set(leading) | set(trailing)):
        page_word = f'Page_{page_num}'
        if page_word not in text_per_page:
            # Tolerate gaps in the numbering instead of raising KeyError.
            continue
        text = text_per_page[page_word]
        drop = too_short(text)
        if not drop and page_num in leading:
            drop = looks_like_contents(text)
        if not drop and page_num in trailing:
            drop = looks_like_references(text)
        if drop:
            del text_per_page[page_word]
    return text_per_page
    
def remove_eng(text_per_page: dict):
    """Keep only the pages confidently detected as Turkish.

    Every page text is passed to ``detect``, whose result is read as a
    mapping with a ``"lang"`` code and a ``"score"``. A page is removed
    when the detected language is not ``'tr'``, or when it is ``'tr'`` but
    the score is below 0.89. Despite the name, pages in any non-Turkish
    language are removed, not only English ones.

    Args:
        text_per_page: Mapping of page key -> page text; modified in place.

    Returns:
        The same ``text_per_page`` object with the rejected pages removed.
    """
    # Iterate over a snapshot of the keys because entries are deleted
    # while walking the mapping.
    for page_key in tuple(text_per_page):
        detection = detect(text_per_page[page_key])
        # The score is only consulted for Turkish pages (short-circuit).
        if detection["lang"] != "tr" or detection["score"] < 0.89:
            del text_per_page[page_key]
    return text_per_page

In [3]:
# Batch job: for every PDF in YOK_download_pdf, extract the text page by
# page, filter out front matter / reference pages and non-Turkish pages,
# write the remaining text to YOK_TEXT/<name>.txt and move the processed
# PDF to YOK_download_pdf_fin/finished_<name>.pdf.

# Create the output folders up front so a fresh checkout does not fail at
# the first write or rename.
os.makedirs("YOK_TEXT", exist_ok=True)
os.makedirs("YOK_download_pdf_fin", exist_ok=True)

# Sorted for a reproducible processing order; os.listdir order is arbitrary.
pdf_listDir = sorted(os.listdir("YOK_download_pdf"))
for pdf in pdf_listDir:
    if not pdf.lower().endswith(".pdf"):
        # Skip stray files and sub-folders instead of handing them to pdfminer.
        continue
    text_per_page = {}
    pdf_path = f'YOK_download_pdf/{pdf}'
    for pagenum, page in enumerate(extract_pages(pdf_path, laparams=LAParams(line_margin=2))):
        # Keep only the text containers of the page, ordered by descending
        # y1 (presumably top of the page first in pdfminer's coordinates).
        # The sort is stable, so elements with equal y1 keep layout order.
        text_elements = [element for element in page if isinstance(element, LTTextContainer)]
        text_elements.sort(key=lambda element: element.y1, reverse=True)
        page_text = [element.get_text() for element in text_elements]
        dctkey = 'Page_' + str(pagenum)
        text_per_page[dctkey] = collect_clear_text(page_text)
    rawTotal = len(text_per_page)
    text_per_page = contents_references_rmv(text_per_page=text_per_page)
    text_per_page = remove_eng(text_per_page=text_per_page)

    clearedTotal = len(text_per_page)
    if clearedTotal < 10:
        # "Bozuk pdf" = "broken pdf": almost nothing survived the filters.
        # The (possibly empty) text file is still written below.
        print("Bozuk pdf:", pdf)
    print("PDF:", pdf, "Raw_Pages:", rawTotal, "Cleared_Total:", clearedTotal)

    # splitext removes only the final extension, so a name such as
    # "a.b.pdf" maps to "a.b.txt" rather than being truncated to "a.txt".
    txt_name = os.path.splitext(pdf)[0]
    with open(f"YOK_TEXT/{txt_name}.txt", 'w', encoding="utf-8") as txt:
        for text in text_per_page.values():
            txt.write(text)
            txt.write('\n')
    os.rename(f"YOK_download_pdf/{pdf}", f"YOK_download_pdf_fin/finished_{pdf}")



PDF: 812461.pdf Raw_Pages: 512 Cleared_Total: 455
PDF: 812462.pdf Raw_Pages: 415 Cleared_Total: 389
PDF: 812466.pdf Raw_Pages: 141 Cleared_Total: 104
PDF: 812472.pdf Raw_Pages: 199 Cleared_Total: 155
PDF: 812473.pdf Raw_Pages: 170 Cleared_Total: 146
PDF: 812475.pdf Raw_Pages: 127 Cleared_Total: 98
PDF: 812480.pdf Raw_Pages: 253 Cleared_Total: 214
PDF: 812486.pdf Raw_Pages: 419 Cleared_Total: 369
PDF: 812487.pdf Raw_Pages: 210 Cleared_Total: 171
PDF: 812489.pdf Raw_Pages: 137 Cleared_Total: 104
PDF: 812497.pdf Raw_Pages: 226 Cleared_Total: 184
PDF: 812499.pdf Raw_Pages: 152 Cleared_Total: 108
PDF: 812916.pdf Raw_Pages: 262 Cleared_Total: 216
PDF: 812927.pdf Raw_Pages: 104 Cleared_Total: 79
PDF: 813009.pdf Raw_Pages: 117 Cleared_Total: 90
PDF: 813016.pdf Raw_Pages: 145 Cleared_Total: 125
PDF: 813039.pdf Raw_Pages: 172 Cleared_Total: 148
PDF: 813141.pdf Raw_Pages: 339 Cleared_Total: 305
PDF: 813268.pdf Raw_Pages: 165 Cleared_Total: 136
PDF: 813335.pdf Raw_Pages: 240 Cleared_Total: 196
PDF