### Extracting text from the PDF reports (DATA MINING)

In [None]:
import os
import re

import nltk
import pdfminer
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from spacy import displacy

# Mine every PDF report in `reports_dir`: extract its text, run basic NLP
# (tokens, named entities, keywords) and collect each sentence that mentions
# one of the phrases of interest.

# Load the spaCy pipeline once; it is reused for every report.
nlp = spacy.load('en_core_web_sm')

# Directory that holds the PDF reports.
reports_dir = './Reports'

# Build the stop-word set once, outside the loops: evaluating
# stopwords.words('english') for every token repeats the call and makes each
# membership test a linear scan of a list.
english_stopwords = set(stopwords.words('english'))

# Phrases to look for. Matching is case-insensitive so that capitalised
# occurrences (sentence starts, titles such as "Impact Report") are found too.
pattern = r'sustainability|social responsibility|impact report'
pattern_re = re.compile(pattern, re.IGNORECASE)

# sorted() makes the processing order deterministic (os.listdir order is
# arbitrary); lower() lets files named "*.PDF" be picked up as well.
for filename in sorted(os.listdir(reports_dir)):
    if not filename.lower().endswith('.pdf'):
        continue

    # Extract the text of all pages with pdfminer's high-level API.
    text = extract_text(os.path.join(reports_dir, filename))

    # Preprocess the text: lower-cased word tokens without stop words.
    tokens = [
        t for t in word_tokenize(text.lower()) if t not in english_stopwords
    ]

    # Process the text using spaCy.
    doc = nlp(text)

    # Named entities as (text, label) pairs.
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Keywords: alphabetic tokens that are NOT stop words.
    keywords = [
        token.text for token in doc if token.is_alpha and not token.is_stop
    ]

    # Pair every pattern hit with the sentence it actually occurs in.
    # Sentences are tokenized once per document and scanned in order, so a
    # phrase that appears in several sentences is reported with each of them
    # rather than always with the first sentence that contains it.
    information = []
    for sent in nltk.sent_tokenize(text):
        for match in pattern_re.findall(sent):
            information.append((match, sent))

    # Matched phrases in document order, as spelled in the text.
    matches = [match for match, _ in information]

    print(f"Processed file: {filename}")
    print(information)
