In [1]:
from io import StringIO

import os
import re
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure, LAParams
from ftlangdetect import detect

In [2]:
def collect_clear_text(text_list):
    """Join text fragments into one single-spaced string.

    Every fragment has its runs of whitespace (spaces, tabs, newlines, ...)
    collapsed to a single space and its ends trimmed. Fragments that end up
    empty are skipped so that joining never produces a double space.

    Args:
        text_list: iterable of strings (e.g. the text of each layout box on a page).

    Returns:
        The cleaned fragments joined by a single space; "" when nothing remains.
    """
    cleaned = (re.sub(r'\s+', ' ', fragment.strip()).strip() for fragment in text_list)
    # Dropping empty pieces is what keeps the output free of "  " sequences
    # when a fragment consisted only of whitespace.
    return ' '.join(piece for piece in cleaned if piece)

def contents_references_rmv(text_per_page: dict, pages=30, char_num=100, years_num=10):
    """Drop front-matter and back-matter pages from a ``{'Page_<n>': text}`` dict.

    Only the first ``pages`` and the last ``pages`` page indices are inspected;
    everything in between is left alone.

    * Indices below ``pages`` are removed when the text is shorter than
      ``char_num`` characters, or when it is longer than ``char_num`` and
      contains more than 40 ".", more than 10 "…", more than 10 "_" or more
      than 10 "-" (intended to catch table-of-contents style leader lines).
    * Indices at or above ``pages`` are removed when the text is shorter than
      ``char_num`` characters, or when it mentions more than ``years_num``
      distinct years between 1900 and 2024 (intended to catch reference lists).

    Page keys that are not present in the dict are skipped in both cases.

    Args:
        text_per_page: mapping of 'Page_<n>' keys to page text. Modified in place.
        pages: size of the inspected window at each end of the document.
        char_num: minimum length a page must have to be kept.
        years_num: maximum number of distinct years tolerated on a tail page.

    Returns:
        The same ``text_per_page`` object, with the rejected pages deleted.
    """
    # Built once per call instead of once per inspected page.
    year_tokens = [str(year) for year in range(1900, 2025)]

    def less_char_drop(text):
        return len(text) < char_num

    def dot_drop(text):
        if len(text) <= char_num:
            return False
        return (
            text.count(".") > 40
            or text.count("…") > 10
            or text.count("_") > 10
            or text.count("-") > 10
        )

    def referencesYear_drop(text):
        # Counts distinct years that occur as substrings, not total mentions.
        return sum(token in text for token in year_tokens) > years_num

    total_pages = len(text_per_page)
    obs_numbers = list(range(0, pages)) + list(range(total_pages - pages, total_pages))
    for page_num in obs_numbers:
        page_word = f'Page_{page_num}'
        # The two windows overlap (or run past the document) when it has fewer
        # than 2 * pages pages, and the dict may be sparse; a missing or
        # already-deleted key is simply skipped.
        if page_word not in text_per_page:
            continue
        text = text_per_page[page_word]
        if page_num < pages:
            should_drop = less_char_drop(text) or dot_drop(text)
        else:
            should_drop = less_char_drop(text) or referencesYear_drop(text)
        if should_drop:
            del text_per_page[page_word]
    return text_per_page

def remove_eng(text_per_page: dict):
    """Keep only pages that the language detector confidently labels Turkish.

    Each page text is passed to ``detect``; the page is deleted unless the
    reported language is 'tr' and the reported score is not below 0.75.
    Despite the name, pages in any non-Turkish language are removed, not
    only English ones.

    Args:
        text_per_page: mapping of page key to page text. Modified in place.

    Returns:
        The same ``text_per_page`` object after the deletions.
    """
    # Iterate over a snapshot of the keys because entries are deleted while looping.
    for page_key in list(text_per_page):
        detection = detect(text_per_page[page_key])
        confident_turkish = detection["lang"] == 'tr' and not (detection["score"] < 0.75)
        if not confident_turkish:
            del text_per_page[page_key]
    return text_per_page

def remove_pageNum(text: str):
    """Strip a leading and/or trailing page number from a page's text.

    The text is split on single spaces; if the first token is numeric it is
    dropped, and likewise for the last token. Tokens are removed by position,
    so an identical number elsewhere in the text is never touched, and a page
    that consists of nothing but a number yields an empty string.

    Args:
        text: page text, expected to be single-spaced.

    Returns:
        The text without the leading/trailing numeric token, trimmed.
    """
    tokens = text.strip(" ").split(" ")
    if tokens and tokens[0].strip().isnumeric():
        tokens = tokens[1:]
    # Re-check for emptiness: the first removal may have consumed the only token.
    if tokens and tokens[-1].strip().isnumeric():
        tokens = tokens[:-1]
    return " ".join(tokens).strip()


In [3]:
# Convert every PDF in the source folders to a cleaned, one-page-per-line text
# file under miscel_txt/, then move the processed PDF to miscel_pdfFin/.

# Create the output locations up front so the first write/move cannot fail
# just because a directory is missing.
os.makedirs("miscel_txt", exist_ok=True)
os.makedirs("miscel_pdfFin", exist_ok=True)

for folder in ["Ankara Barosu Fikri Mülkiyet ve Rekabet Hukuku Dergisi", "KARIŞIK"]:
    # os.listdir returns entries in arbitrary order; sorting keeps the log stable.
    for pdf in sorted(os.listdir(folder)):
        pdf_path = f'{folder}/{pdf}'
        # Skip sub-directories and stray non-PDF files instead of handing them to pdfminer.
        if not (pdf.lower().endswith(".pdf") and os.path.isfile(pdf_path)):
            continue
        print(pdf)
        text_per_page = {}
        for pagenum, page in enumerate(extract_pages(pdf_path, laparams=LAParams(line_margin=2))):
            # Visit layout elements by descending y1 so text is collected from
            # the top of the page downwards (pdfminer's origin is bottom-left).
            page_elements = sorted(page, key=lambda element: element.y1, reverse=True)
            page_text = [
                element.get_text()
                for element in page_elements
                if isinstance(element, LTTextContainer)
            ]
            dctkey = 'Page_' + str(pagenum)
            text_per_page[dctkey] = collect_clear_text(page_text)
        rawTotal = len(text_per_page)
        text_per_page = contents_references_rmv(text_per_page=text_per_page)
        text_per_page = remove_eng(text_per_page=text_per_page)

        clearedTotal = len(text_per_page)
        if clearedTotal < 2:
            # Almost nothing survived the filters: flag the file as a broken PDF.
            print("Bozuk pdf:", pdf)
        print("PDF:", pdf, "Raw_Pages:", rawTotal, "Cleared_Total:", clearedTotal)
        # splitext removes the real extension whatever its length or case.
        with open(f"miscel_txt/{os.path.splitext(pdf)[0]}.txt", 'w', encoding="utf-8") as txt:
            for text in text_per_page.values():
                txt.write(remove_pageNum(text))
                txt.write('\n')
        # Moving the source marks it as done, so a re-run only picks up unprocessed files.
        os.rename(f"{folder}/{pdf}", f"miscel_pdfFin/finished_{pdf}")

2015.pdf




PDF: 2015.pdf Raw_Pages: 1346 Cleared_Total: 1217
2016_cilt1.pdf
PDF: 2016_cilt1.pdf Raw_Pages: 882 Cleared_Total: 800
2016_cilt2.pdf
PDF: 2016_cilt2.pdf Raw_Pages: 849 Cleared_Total: 765
2017_cilt1.pdf
PDF: 2017_cilt1.pdf Raw_Pages: 912 Cleared_Total: 845
2017_cilt2.pdf
PDF: 2017_cilt2.pdf Raw_Pages: 816 Cleared_Total: 740
2018_cilt_01.pdf
PDF: 2018_cilt_01.pdf Raw_Pages: 848 Cleared_Total: 801
2018_cilt_02.pdf
PDF: 2018_cilt_02.pdf Raw_Pages: 856 Cleared_Total: 776
2019_1cilt.pdf
PDF: 2019_1cilt.pdf Raw_Pages: 868 Cleared_Total: 818
2019_2cilt.pdf
PDF: 2019_2cilt.pdf Raw_Pages: 886 Cleared_Total: 823
2020_cilt_1.pdf
PDF: 2020_cilt_1.pdf Raw_Pages: 1013 Cleared_Total: 953
2020_cilt_2.pdf
PDF: 2020_cilt_2.pdf Raw_Pages: 944 Cleared_Total: 878
2021_cilt_1.pdf
PDF: 2021_cilt_1.pdf Raw_Pages: 862 Cleared_Total: 805
2021_cilt_2.pdf
PDF: 2021_cilt_2.pdf Raw_Pages: 846 Cleared_Total: 790
2022_cilt_1.pdf
PDF: 2022_cilt_1.pdf Raw_Pages: 904 Cleared_Total: 849
2022_cilt_2.pdf
PDF: 2022_cilt_2.p