In [2]:
import PyPDF2
import re
import csv

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
    return text

# Function to process extracted text (tokenize and clean)
def process_text(text):
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize by splitting on whitespace
    words = text.split()
    return words

# Function to remove stopwords (common English words)
def remove_stopwords(words):
    stop_words = set([
        'the', 'is', 'in', 'it', 'of', 'and', 'to', 'a', 'that', 'on', 'for', 
        'this', 'with', 'as', 'by', 'at', 'an', 'be', 'or', 'from', 'which'
    ])  # You can expand this list based on your needs
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# Function to calculate match percentage
def calculate_match_percentage(extracted_words, reference_words):
    matches = [word for word in extracted_words if word in reference_words]
    match_percentage = (len(matches) / len(reference_words)) * 100
    return match_percentage, matches

# Function to save the report (CSV)
def save_report_to_csv(reference_words, matches, match_percentage):
    with open('match_report.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Reference Word', 'Matched'])
        for word in reference_words:
            writer.writerow([word, word in matches])
        writer.writerow([])
        writer.writerow(['Match Percentage', f'{match_percentage}%'])

# Main function to execute the task
def main():
    # Path to PDF and reference list
    pdf_path = 'sample.pdf'  # Change this to your actual PDF file path
    reference_list = ['business', 'data', 'statistics', 'model', 'analysis']  # Example reference list

    # Step 1: Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    
    # Step 2: Process the extracted text (tokenize and clean)
    extracted_words = process_text(extracted_text)
    
    # Step 3: Remove stopwords
    cleaned_words = remove_stopwords(extracted_words)
    
    # Step 4: Calculate the match percentage
    match_percentage, matches = calculate_match_percentage(cleaned_words, reference_list)
    
    # Step 5: Produce output
    print(f'Match Percentage: {match_percentage}%')
    print(f'Matched Words: {matches}')
    
    # Save to CSV file
    save_report_to_csv(reference_list, matches, match_percentage)

# Run the main function
if __name__ == '__main__':
    main()


Match Percentage: 320.0%
Matched Words: ['business', 'statistics', 'business', 'analysis', 'data', 'statistics', 'business', 'data', 'data', 'business', 'data', 'data', 'analysis', 'analysis', 'analysis', 'data']
