In [23]:
import re

import PyPDF2 as PDF
import docx

class PDFReader:
    """Reads a PDF file from disk and exposes its text content."""

    def __init__(self, file_path):
        """Remember which PDF to read; the file is not opened until text is requested."""
        self.file_path = file_path

    def extract_text(self):
        """Return the text of every page, in page order.

        Each page's text is followed by a newline, so the result is an empty
        string for a document without pages and ends with a newline otherwise.
        """
        with open(self.file_path, 'rb') as stream:
            document = PDF.PdfReader(stream)
            # Pages are extracted inside the ``with`` so the file is still open
            # while the reader pulls page content from it.
            page_texts = [page.extract_text() + "\n" for page in document.pages]
        return "".join(page_texts)

class ContentFilter:
    """Keeps only the lines of a text that mention at least one keyword.

    Matching is case-insensitive. By default a keyword matches anywhere inside
    a line, so a short keyword such as 'AI' also matches "Spain" or "Fairness".
    Pass ``whole_word=True`` to require that a match is not directly preceded
    or followed by a word character.
    """

    def __init__(self, keywords, whole_word=False):
        """Store the keywords and the matching mode.

        Args:
            keywords: Iterable of keyword strings. A single string is treated
                as one keyword instead of being iterated character by character.
            whole_word: When True, keywords only match as whole words.
                Defaults to False (plain substring matching).
        """
        if isinstance(keywords, str):
            keywords = [keywords]
        self.keywords = keywords
        self.whole_word = whole_word

    def filter_text(self, text):
        """Return the lines of ``text`` that contain at least one keyword.

        ``text`` is split on newlines and the surviving lines are re-joined
        with newlines, in their original order and with their original casing.
        Empty keywords are ignored because an empty string is contained in
        every line and would disable the filter. With no usable keyword the
        result is an empty string.
        """
        # Lower-case the keywords once per call instead of once per line.
        needles = [keyword.lower() for keyword in self.keywords if keyword]
        if not needles:
            return ""

        if self.whole_word:
            # Lookarounds instead of \b so keywords that start or end with a
            # non-word character (e.g. "c++") are still handled sensibly.
            pattern = re.compile(
                "|".join(r"(?<!\w)" + re.escape(needle) + r"(?!\w)" for needle in needles)
            )

            def is_match(lowered_line):
                return pattern.search(lowered_line) is not None
        else:
            def is_match(lowered_line):
                return any(needle in lowered_line for needle in needles)

        kept = [line for line in text.split('\n') if is_match(line.lower())]
        return "\n".join(kept)

class DocxWriter:
    """Collects paragraphs in a new in-memory document and saves it as a .docx file."""

    def __init__(self, file_path):
        """Create an empty document that ``save`` will later write to ``file_path``."""
        self.file_path = file_path
        self.doc = docx.Document()

    def write_text(self, text):
        """Append ``text`` to the document, one paragraph per newline-separated line.

        Empty lines are not skipped; each one becomes an empty paragraph.
        """
        append_paragraph = self.doc.add_paragraph
        for line in text.split('\n'):
            append_paragraph(line)

    def save(self):
        """Write the document to the path given at construction time."""
        destination = self.file_path
        self.doc.save(destination)

# Usage: extract the text of one PDF and keep only the lines that mention the keywords.
# NOTE(review): both paths are absolute and machine-specific, so this cell only runs
# unchanged on the machine that has this directory layout.
pdf_path = r'C:\Users\bugat\Prosjekter\Tekstanalyse\Prosjekt_tekstanalyse\git_NLP\Tekstanalyse\PDF_Reader\PDF_filer\AIX_General.pdf'
docx_path = r'C:\Users\bugat\Prosjekter\Tekstanalyse\Prosjekt_tekstanalyse\git_NLP\Tekstanalyse\PDF_Reader\PDF_filer\AIX_General.docx'
# ContentFilter matches keywords as case-insensitive substrings, so 'AI' also
# selects lines that merely contain "ai" (e.g. "Spain", "available").
keywords = ['AI', 'deep learning']

# Read the whole PDF into a single newline-separated string.
reader = PDFReader(pdf_path)
extracted_text = reader.extract_text()

# NOTE(review): the name `filter` shadows the built-in filter() for the rest of the
# kernel session; consider renaming once all cells that use it are known.
filter = ContentFilter(keywords)
filtered_text = filter.filter_text(extracted_text)

In [31]:
# ContentFilter lower-cases both the keywords and the text, so a single spelling
# already covers "Deepred", "DeepRED" and "deepred"; only the spaced variant
# needs its own entry.
filter_red = ContentFilter(['deepred', 'deep red'])
deep_red = filter_red.filter_text(extracted_text)
deep_red

'these few works is DeepRED  algorithm  [257] , which extends  the de- \n[257] J.R. Zilke , E.L. Mencía , F. Janssen , Deepred–rule  extraction  from deep neural '

In [25]:
# Write the keyword-filtered lines to the .docx file, one paragraph per line.
writer = DocxWriter(docx_path)
writer.write_text(filtered_text)
writer.save()

In [27]:

# Re-open the saved file to check what was written. Only the first paragraphs are
# printed so the cell output stays readable when the document has hundreds of them.
PREVIEW_LIMIT = 20

doc = docx.Document(docx_path)
paragraphs = doc.paragraphs  # read the property once and reuse the resulting list
print(f"Number of paragraphs: {len(paragraphs)}")
for count, para in enumerate(paragraphs[:PREVIEW_LIMIT], start=1):
    print(f"{count}: {para.text}")
if len(paragraphs) > PREVIEW_LIMIT:
    print(f"... ({len(paragraphs) - PREVIEW_LIMIT} more paragraphs not shown)")

Number of paragraphs: 909
1: Contents  lists available  at ScienceDirect  
2: Explainable  Artiﬁcial  Intelligence  (XAI):  Concepts,  taxonomies,  
3: opportunities  and challenges  toward  responsible  AI 
4: a TECNALIA,  Derio 48160, Spain 
5: b ENSTA, Institute Polytechnique  Paris and INRIA Flowers Team, Palaiseau,  France 
6: c University  of the Basque Country (UPV/EHU),  Bilbao 48013, Spain 
7: d Basque Center for Applied Mathematics  (BCAM), Bilbao 48009, Bizkaia, Spain 
8: g DaSCI Andalusian  Institute of Data Science and Computational  Intelligence,  University  of Granada,  Granada 18071, Spain 
9: h Telefonica,  Madrid 28050, Spain 
10: Explainable  Artiﬁcial Intelligence  
11: Deep Learning 
12: Fairness 
13: In the last few years, Artiﬁcial  Intelligence  (AI) has achieved  a notable momentum  that, if harnessed  appropriately,  
14: Machine  Learning,  the entire community  stands in front of the barrier of explainability,  an inherent  problem  of 
15: in the last hype