In [None]:
!pip install pdfminer.six

In [None]:
!pip install transliterate

In [None]:
!pip install autocorrect

In [None]:
# Imports
import pandas as pd
import docx2txt
import io
import re
import os

from transliterate import translit
from autocorrect import Speller

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [11]:
# installations: pdfminer.six, transliterate, autocorrect (docx2txt and pandas are imported as well and must also be installed)
# based on "to_pdf(1).ipynb" by Terekhina Maria and Maria Fedorova

def remove_references(text):
    """Cut the bibliography section off the end of an article.

    Everything from the first occurrence of a known reference-list heading
    ("Литература", "Список литературы", "Библиография", ...) up to the end
    of ``text`` is dropped, the heading itself included.

    NOTE: the heading is matched anywhere in the text, not only at the start
    of a line, so an earlier mention of one of these words in running prose
    truncates the text at that point as well.

    Args:
        text: Article text as a single string.

    Returns:
        The part of ``text`` that precedes the first heading, or ``text``
        unchanged when no heading is present.
    """
    # Raw string literal: the previous non-raw pattern relied on the invalid
    # escape sequence "\." which current Python versions warn about.
    heading = re.search(
        r'Литература|Список( использованной)? литературы|Библиография'
        r'|Библиографический список|БИБЛИОГРАФИЧЕСКИЙ СПИСОК|СПИСОК ЛИТЕРАТУРЫ',
        text,
    )
    if heading is None:
        return text
    # Slicing at the heading is equivalent to substituting "heading + rest
    # of the document" with an empty string.
    return text[:heading.start()]

def get_title(text):
    """Build an article title from a document file name.

    Hyphens are turned into spaces, a trailing ".pdf", ".doc" or ".docx"
    extension (any letter case) is removed, and the result is passed through
    ``translit(..., 'ru')``.

    Args:
        text: File name, e.g. as returned by ``os.listdir``.

    Returns:
        Whatever ``translit`` returns for the cleaned-up name.
    """
    spaced = text.replace('-', ' ')
    # "docx?" instead of the alternation "doc|docx": with the old ordering
    # ".doc" matched first and "name.docx" was left as "namex". The pattern is
    # anchored so that only the real extension at the end is stripped.
    stem = re.sub(r'\.(?:pdf|docx?)$', '', spaced, flags=re.IGNORECASE)
    return translit(stem, 'ru')

def get_abstract(text):
    """Return the leading part of an article that precedes its body.

    The abstract is taken to be everything before the first occurrence of
    "Ключевые слова" (any amount of whitespace between the two words) or
    "Введение".

    Args:
        text: Article text as a single string.

    Returns:
        The text before the first marker. When neither marker occurs the
        whole ``text`` is returned unchanged.
    """
    # Raw string literal: the previous non-raw pattern contained the invalid
    # escape sequences "\s" and "\.".
    marker = re.search(r'Ключевые\s+слова|Введение', text)
    if marker is None:
        return text
    return text[:marker.start()]
    
def get_full_text(text, abstract):
    """Return the article body: ``text`` without its abstract and references.

    Args:
        text: Full article text.
        abstract: The abstract previously extracted from ``text``.

    Returns:
        ``text`` with every literal occurrence of ``abstract`` removed and
        with the bibliography cut off by ``remove_references``.
    """
    # The abstract is arbitrary document text, not a pattern: remove it
    # literally. Feeding it to re.sub treated characters such as "(", "[",
    # "?" or "+" as regex syntax, which either raised re.error or silently
    # failed to remove the abstract.
    body = text.replace(abstract, '') if abstract else text
    return remove_references(body)

def remove_misc_symbols(text):
    """Clean up layout artefacts in text extracted from a PDF.

    Steps, in order:
      1. drop the line that directly follows a form feed character
         (presumably a running page header after a page break - TODO confirm);
      2. re-join words that were hyphenated across a line break;
      3. collapse runs of newlines into a single newline;
      4. drop lines that consist only of digits and spaces (page numbers);
      5. cut off the bibliography via ``remove_references``.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text.
    """
    # Form feed followed by a non-empty line: replace both with one newline.
    text = re.sub(r'\f.+\n', '\n', text)
    text = text.replace('-\n', '')
    text = re.sub(r'\n+', '\n', text)
    # The closing newline is only looked at, not consumed. Previously the
    # number line was removed together with BOTH surrounding newlines, which
    # glued the neighbouring lines into one word ("word\n12\nnext" ->
    # "wordnext") and skipped every second line of consecutive number lines.
    text = re.sub(r'\n *[0-9]+ *(?=\n)', '', text)
    return remove_references(text)


def convert_to_df_entry(path):
    """Extract the text of a PDF file and clean it with ``remove_misc_symbols``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages as one string.

    Raises:
        OSError: If ``path`` cannot be opened. Errors raised by pdfminer
            while parsing propagate unchanged.
    """
    # One resource manager shared by the converter and the interpreter
    # (the previous code created two independent instances).
    resource_manager = PDFResourceManager()

    # Context managers / finally make sure the file, the buffer and the
    # device are released even when a page fails to parse.
    with io.StringIO() as buffer, open(path, 'rb') as pdf_file:
        device = TextConverter(resource_manager, buffer, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            pages = PDFPage.get_pages(pdf_file, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the with-block.
            text = buffer.getvalue()
        finally:
            device.close()

    return remove_misc_symbols(text)



In [12]:
# Folder with the PDF files to parse. Machine-specific: adjust before running.
PDF_DIR = 'C://Users//tohet//NIS_parse//test_pdf'

# Id given to the first successfully parsed document; later ones count up.
FIRST_ID = 365

# Column layout of the resulting dataset.
COLUMNS = ['id', 'titles', 'abstracts', 'full_texts', 'question_id',
           'unanswerebles', 'questions', 'answers', 'evidences',
           'figures_and_tables', 'split']

if __name__ == '__main__':

    spell = Speller('ru')
    next_id = FIRST_ID  # was named `id`, which shadowed the builtin

    rows = []           # one list of column values per parsed document
    row_labels = []     # position of the file in the listing, used as row label
    failed_titles = []  # titles of documents that could not be split

    # os.listdir returns entries in arbitrary order; sorting makes the
    # id assignment reproducible between runs.
    for i, file in enumerate(sorted(os.listdir(PDF_DIR))):
        text = convert_to_df_entry(os.path.join(PDF_DIR, file))

        title = spell(get_title(file))
        abstract = get_abstract(text)
        full_text = text.replace(abstract, '')

        # get_abstract returns the whole text when it finds no marker, so an
        # empty remainder means the abstract/body split failed.
        if full_text == '':
            failed_titles.append(title)
            continue

        rows.append([next_id, title, abstract, full_text, 0, None, None, None, None, None, None])
        row_labels.append(i)
        next_id += 1

    # Build both frames once instead of growing them row by row with .loc.
    df = pd.DataFrame(rows, columns=COLUMNS, index=row_labels)
    df_failed = pd.DataFrame({'title': failed_titles})

    # Written inside the guard: `df` and `df_failed` only exist here.
    df.to_csv('df_cl.csv', sep='\t')
    df_failed.to_csv('df_failed.csv', sep='\t')