# 1- Text Extraction 


In [1]:
import os
from pdfminer.high_level import extract_text
import codecs

def extract_text_from_pdf(pdf_filename, txt_filename):
    try:
        # Extract raw text from the PDF
        raw_text = extract_text(pdf_filename)

        # Use UTF-8 encoding to decode the text (assuming UTF-8 is common)
        decoded_text = raw_text.encode('utf-8', 'ignore').decode('utf-8')

        # Clean the text by removing extra spaces and non-breaking spaces
        cleaned_text = ' '.join(decoded_text.split())

        # Save the cleaned text to the specified text file
        with codecs.open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(cleaned_text)

        print(f"Text extracted, cleaned, and saved as '{txt_filename}'.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Get the current working directory
current_directory = os.getcwd()

# Create the 'alltext' folder if it doesn't exist
alltext_directory = os.path.join(current_directory, 'alltext')
os.makedirs(alltext_directory, exist_ok=True)

# Specify the directory containing the PDF files
pdf_directory = os.path.join(current_directory, 'pdfs')

# Iterate through the PDF files in the 'pdfs' directory
for pdf_file in os.listdir(pdf_directory):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_directory, pdf_file)

        # Create a text file with the same name as the PDF (minus the ".pdf" extension) in 'alltext' folder
        text_filename = os.path.splitext(pdf_file)[0] + '.txt'
        text_file_path = os.path.join(alltext_directory, text_filename)

        # Extract and clean text from the PDF file using the defined function
        extract_text_from_pdf(pdf_path, text_file_path)

print("Text extraction and saving complete.")


Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/The Use of Artificial Intelligence with Students with Identified Disabilities  A Systematic Review with Critique.txt'.
Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/WilsonFutureEng.txt'.
Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/literature-review-on-disability-participation-in-the-engineering-field.txt'.
Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/missing-from-the-classroom-current-representations-of-disability-in-engineering-education.txt'.
Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/Intelligent Tutoring System in Education for Disabled Learners Using Human Computer Interaction and Augmented Reality .txt'.
Text extracted, cleaned, and saved as '/Users/paria/Desktop/equity/LitExtract/alltext/ChatBot4AgingCogDis.txt'.
Text extracted, cle

# 2-Method Section Extreaction 

In [4]:
import nltk
import os
import re
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Define the input folder and output folder
input_folder = 'alltext'
output_folder = 'methods'

def extract_and_generate_sentences(input_filename, output_filename):
    try:
        with open(input_filename, "r", encoding="utf-8") as file:
            text = file.read()

        # Tokenize the text into sentences
        sentences = sent_tokenize(text)

        # Initialize a list to store cleaned and relevant sentences
        relevant_sentences = []

        # Define keywords to search for
        keywords = ["experiment", "survey", "interview"]

        # Define a function to clean up a sentence
        def clean_sentence(sentence):
            # Remove leading/trailing whitespace and extra spaces
            cleaned_sentence = ' '.join(sentence.split())
            return cleaned_sentence

        # Iterate through the sentences to find relevant sentences
        for sentence in sentences:
            for keyword in keywords:
                if keyword in sentence.lower():
                    cleaned_sentence = clean_sentence(re.sub(r'\b' + re.escape(keyword) + r'\b', '', sentence, flags=re.I))
                    if cleaned_sentence:  # Check if the cleaned sentence is not empty
                        relevant_sentences.append(f"{keyword.capitalize()}: {cleaned_sentence}")

        # Generate the output text
        output_text = "\n".join(relevant_sentences)

        # Construct the output file path in the "methods" folder with the same filename
        output_file_path = os.path.join(output_folder, os.path.basename(output_filename))

        # Save the output text to the specified file
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(output_text)

        print(f"Relevant sentences from '{input_filename}' saved in '{output_file_path}'.")

    except FileNotFoundError:
        print(f"File not found: {input_filename}")

if __name__ == "__main__":
    # Create the "methods" folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get a list of all text files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    for input_file in input_files:
        input_filename = os.path.join(input_folder, input_file)
        output_filename = os.path.join(output_folder, input_file)
        extract_and_generate_sentences(input_filename, output_filename)


[nltk_data] Downloading package punkt to /Users/paria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Relevant sentences from 'alltext/literature-review-on-disability-participation-in-the-engineering-field.txt' saved in 'methods/literature-review-on-disability-participation-in-the-engineering-field.txt'.
Relevant sentences from 'alltext/missing-from-the-classroom-current-representations-of-disability-in-engineering-education.txt' saved in 'methods/missing-from-the-classroom-current-representations-of-disability-in-engineering-education.txt'.
Relevant sentences from 'alltext/The Use of Artificial Intelligence with Students with Identified Disabilities  A Systematic Review with Critique.txt' saved in 'methods/The Use of Artificial Intelligence with Students with Identified Disabilities  A Systematic Review with Critique.txt'.
Relevant sentences from 'alltext/WilsonFutureEng.txt' saved in 'methods/WilsonFutureEng.txt'.
Relevant sentences from 'alltext/004_Alexa.txt' saved in 'methods/004_Alexa.txt'.
Relevant sentences from 'alltext/Exploring student disability and professional identity  n