In [4]:
import os
from docx import Document
from lxml import etree
import win32com.client  # For handling .doc files

# Function to extract text from shapes (text boxes) and join with spaces
def extract_text_from_shapes_and_textboxes(doc):
    """Return the text found inside the text boxes of a document.

    Queries ``doc.element`` for every ``w:t`` element nested anywhere below a
    ``w:txbxContent`` element and joins their text with single spaces.

    Args:
        doc: A python-docx ``Document`` (any object exposing
            ``element.xpath(query)`` that returns nodes with a ``text``
            attribute works).

    Returns:
        str: The text-box text joined with spaces, stripped of leading and
        trailing whitespace; an empty string when there is none.
    """
    text_nodes = doc.element.xpath('.//w:txbxContent//w:t')

    # An empty <w:t/> element exposes ``text`` as None, and str.join raises
    # TypeError on None, so keep only nodes that actually carry text.
    fragments = [node.text for node in text_nodes if node.text]

    return ' '.join(fragments).strip()

# Function to extract text from headers and footers and join with spaces
def extract_text_from_headers_and_footers(doc):
    """Return the header and footer text of every section, joined with spaces.

    For each section the header paragraphs are collected first, then the
    footer paragraphs. Blank paragraphs are dropped so the result does not
    contain runs of spaces.

    Args:
        doc: A python-docx ``Document`` (any object exposing ``sections``,
            each with ``header`` and ``footer`` objects that have
            ``paragraphs``).

    Returns:
        str: The collected text joined with single spaces and stripped; an
        empty string when no header/footer text exists.
    """
    extracted_text = []

    for section in doc.sections:
        for part in (section.header, section.footer):
            # A header/footer linked to the previous section shows that
            # section's content, which has already been collected; reading it
            # again would repeat the same text once per section.
            if getattr(part, "is_linked_to_previous", False):
                continue
            extracted_text.extend(
                paragraph.text for paragraph in part.paragraphs if paragraph.text.strip()
            )

    return ' '.join(extracted_text).strip()

# Function to extract text from .doc files using pywin32
def doc_to_txt(input_path, output_path):
    """Convert a legacy .doc file to a UTF-8 text file through Word automation.

    Starts Word via COM, reads the document's ``Content.Text`` and writes it
    to ``output_path``. Any failure is reported with ``print`` rather than
    raised, so a batch run continues with the next file.

    Args:
        input_path: Path of the .doc file to read.
        output_path: Path of the .txt file to create or overwrite.

    Returns:
        None
    """
    try:
        word = win32com.client.Dispatch("Word.Application")
        try:
            # Word runs as a separate process and may not share this
            # process's working directory, so hand it an absolute path.
            doc = word.Documents.Open(os.path.abspath(input_path))
            try:
                doc_text = doc.Content.Text
            finally:
                # Close without saving: the document is only being read.
                doc.Close(False)
        finally:
            # Always quit Word, even when opening or reading failed, so a bad
            # file does not leave a Word process running in the background.
            word.Quit()

        # Word separates paragraphs with bare carriage returns; normalise them
        # so the output has ordinary line breaks like the .docx conversion.
        doc_text = doc_text.replace('\r\n', '\n').replace('\r', '\n')

        with open(output_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(doc_text)

        print(f"Converted: {input_path} -> {output_path}")
    except Exception as e:
        print(f"Error converting {input_path}: {e}")

# Function to convert DOCX and DOC files to TXT
def convert_docs_to_txt(input_folder, output_folder):
    """Convert every .docx and .doc file in a folder to a .txt file.

    Each output file is named after its source file with the extension
    replaced by ``.txt``. For .docx files the text is written in this order:
    body paragraphs, table rows, text boxes, headers/footers. .doc files are
    delegated to ``doc_to_txt``. Files with any other extension are ignored.
    A failure on one .docx file is printed and does not stop the run.

    Args:
        input_folder: Folder containing the documents to convert.
        output_folder: Folder that receives the .txt files; created if missing.

    Returns:
        None
    """
    # exist_ok makes this safe to re-run and avoids a check-then-create race.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        # Word keeps "~$name.docx" lock files beside documents that are open;
        # they are not real documents and would only produce errors.
        if filename.startswith("~$"):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

        # Compare extensions case-insensitively so "CV.DOCX" is not skipped.
        lowered_name = filename.lower()

        if lowered_name.endswith(".docx"):
            try:
                doc = Document(input_path)

                # Extract regular text from paragraphs
                extracted_text = [paragraph.text for paragraph in doc.paragraphs]

                # Extract text from tables: one line per row, cells separated by spaces
                for table in doc.tables:
                    for row in table.rows:
                        row_data = [cell.text.strip() for cell in row.cells]
                        extracted_text.append(' '.join(row_data))

                # Extract text from shapes and textboxes (joined with spaces)
                shapes_text = extract_text_from_shapes_and_textboxes(doc)
                if shapes_text:
                    extracted_text.append(shapes_text)

                # Extract text from headers and footers (joined with spaces)
                headers_footers_text = extract_text_from_headers_and_footers(doc)
                if headers_footers_text:
                    extracted_text.append(headers_footers_text)

                # Save all non-blank extracted text, one entry per line
                with open(output_path, "w", encoding="utf-8") as txt_file:
                    txt_file.write('\n'.join(text for text in extracted_text if text.strip()))

                print(f"Converted: {filename} -> {output_path}")
            except Exception as e:
                print(f"Error converting {filename}: {e}")

        elif lowered_name.endswith(".doc"):
            doc_to_txt(input_path, output_path)

# Folder paths. The defaults are the original local paths; set the
# RESUME_INPUT_DIR / RESUME_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
input_folder = os.environ.get(
    "RESUME_INPUT_DIR",
    r"C:\Users\polpi\Desktop\data science\project\docker_project\resumes",
)
output_folder = os.environ.get(
    "RESUME_OUTPUT_DIR",
    r"C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt",
)

# Run the function
convert_docs_to_txt(input_folder, output_folder)

Converted: Anil kumar.docx -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\Anil kumar.txt
Converted: AradhanaTripathi[4_0].docx -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\AradhanaTripathi[4_0].txt
Converted: Buddha Vamsi.docx -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\Buddha Vamsi.txt
Converted: ChinnaSubbarayuduM_Hexaware.docx -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\ChinnaSubbarayuduM_Hexaware.txt
Converted: Gopi Krishna_Hexaware.docx -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\Gopi Krishna_Hexaware.txt
Converted: C:\Users\polpi\Desktop\data science\project\docker_project\resumes\Hari Krishna M_Hexaware.doc -> C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt\Hari Krishna M_Hexaware.txt
Converted: C:\Users\polpi\Desktop\data science\project\docker_project\resumes\Harikrishna Akula_Hexaware.doc -> C:\Users\polpi\Desktop\d

In [6]:
import os
from docx import Document
from lxml import etree
import win32com.client  # For handling .doc files
from PyPDF2 import PdfReader  # For handling .pdf files

# Function to extract text from shapes (text boxes) and join with spaces
def extract_text_from_shapes_and_textboxes(doc):
    """Return the text found inside the text boxes of a document.

    Queries ``doc.element`` for every ``w:t`` element nested anywhere below a
    ``w:txbxContent`` element and joins their text with single spaces.

    Args:
        doc: A python-docx ``Document`` (any object exposing
            ``element.xpath(query)`` that returns nodes with a ``text``
            attribute works).

    Returns:
        str: The text-box text joined with spaces, stripped of leading and
        trailing whitespace; an empty string when there is none.
    """
    text_nodes = doc.element.xpath('.//w:txbxContent//w:t')

    # An empty <w:t/> element exposes ``text`` as None, and str.join raises
    # TypeError on None, so keep only nodes that actually carry text.
    fragments = [node.text for node in text_nodes if node.text]

    return ' '.join(fragments).strip()

# Function to extract text from headers and footers and join with spaces
def extract_text_from_headers_and_footers(doc):
    """Return the header and footer text of every section, joined with spaces.

    For each section the header paragraphs are collected first, then the
    footer paragraphs. Blank paragraphs are dropped so the result does not
    contain runs of spaces.

    Args:
        doc: A python-docx ``Document`` (any object exposing ``sections``,
            each with ``header`` and ``footer`` objects that have
            ``paragraphs``).

    Returns:
        str: The collected text joined with single spaces and stripped; an
        empty string when no header/footer text exists.
    """
    extracted_text = []

    for section in doc.sections:
        for part in (section.header, section.footer):
            # A header/footer linked to the previous section shows that
            # section's content, which has already been collected; reading it
            # again would repeat the same text once per section.
            if getattr(part, "is_linked_to_previous", False):
                continue
            extracted_text.extend(
                paragraph.text for paragraph in part.paragraphs if paragraph.text.strip()
            )

    return ' '.join(extracted_text).strip()

# Function to extract text from .doc files using pywin32
def doc_to_txt(input_path, output_path):
    """Convert a legacy .doc file to a UTF-8 text file through Word automation.

    Starts Word via COM, reads the document's ``Content.Text`` and writes it
    to ``output_path``. Any failure is reported with ``print`` rather than
    raised, so a batch run continues with the next file.

    Args:
        input_path: Path of the .doc file to read.
        output_path: Path of the .txt file to create or overwrite.

    Returns:
        None
    """
    try:
        word = win32com.client.Dispatch("Word.Application")
        try:
            # Word runs as a separate process and may not share this
            # process's working directory, so hand it an absolute path.
            doc = word.Documents.Open(os.path.abspath(input_path))
            try:
                doc_text = doc.Content.Text
            finally:
                # Close without saving: the document is only being read.
                doc.Close(False)
        finally:
            # Always quit Word, even when opening or reading failed, so a bad
            # file does not leave a Word process running in the background.
            word.Quit()

        # Word separates paragraphs with bare carriage returns; normalise them
        # so the output has ordinary line breaks like the .docx conversion.
        doc_text = doc_text.replace('\r\n', '\n').replace('\r', '\n')

        with open(output_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(doc_text)

        print(f"Converted: {input_path} -> {output_path}")
    except Exception as e:
        print(f"Error converting {input_path}: {e}")

# Function to extract text from PDF files
def pdf_to_txt(input_path, output_path):
    """Write the text of every page of a PDF to a UTF-8 text file.

    Pages are read with ``PdfReader`` and their extracted text is written to
    ``output_path`` separated by newlines. Any failure is reported with
    ``print`` instead of being raised.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path of the .txt file to create or overwrite.

    Returns:
        None
    """
    try:
        pdf = PdfReader(input_path)
        page_texts = [pdf_page.extract_text() for pdf_page in pdf.pages]

        with open(output_path, "w", encoding="utf-8") as out_file:
            out_file.write('\n'.join(page_texts))

        print(f"Converted: {input_path} -> {output_path}")
    except Exception as err:
        print(f"Error converting {input_path}: {err}")

# Function to convert DOCX, DOC, and PDF files to TXT
def convert_files_to_txt(input_folder, output_folder):
    """Convert every .docx, .doc and .pdf file in a folder to a .txt file.

    Each output file is named after its source file with the extension
    replaced by ``.txt``. For .docx files the text is written in this order:
    text boxes, headers/footers, body paragraphs, table rows. .doc files are
    delegated to ``doc_to_txt`` and .pdf files to ``pdf_to_txt``. Files with
    any other extension are ignored. A failure on one .docx file is printed
    and does not stop the run.

    Args:
        input_folder: Folder containing the documents to convert.
        output_folder: Folder that receives the .txt files; created if missing.

    Returns:
        None
    """
    # exist_ok makes this safe to re-run and avoids a check-then-create race.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        # Word keeps "~$name.docx" lock files beside documents that are open;
        # they are not real documents and would only produce errors.
        if filename.startswith("~$"):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

        # Compare extensions case-insensitively so "CV.PDF" is not skipped.
        lowered_name = filename.lower()

        if lowered_name.endswith(".docx"):
            try:
                doc = Document(input_path)

                # Extract text from shapes and textboxes (joined with spaces)
                shapes_text = extract_text_from_shapes_and_textboxes(doc)

                # Extract text from headers and footers (joined with spaces)
                headers_footers_text = extract_text_from_headers_and_footers(doc)

                # Extract regular text from paragraphs
                paragraphs_text = [paragraph.text for paragraph in doc.paragraphs]

                # Extract text from tables: one line per row, cells separated by spaces
                for table in doc.tables:
                    for row in table.rows:
                        row_data = [cell.text.strip() for cell in row.cells]
                        paragraphs_text.append(' '.join(row_data))

                # Combine in the order: text boxes, headers/footers, body text
                combined_text = []
                if shapes_text:
                    combined_text.append(shapes_text)
                if headers_footers_text:
                    combined_text.append(headers_footers_text)
                combined_text.extend(text for text in paragraphs_text if text.strip())

                # Save all extracted text to a .txt file
                with open(output_path, "w", encoding="utf-8") as txt_file:
                    txt_file.write('\n'.join(combined_text))

                print(f"Converted: {filename} -> {output_path}")
            except Exception as e:
                print(f"Error converting {filename}: {e}")

        elif lowered_name.endswith(".doc"):
            doc_to_txt(input_path, output_path)

        elif lowered_name.endswith(".pdf"):
            pdf_to_txt(input_path, output_path)

# Folder paths. The defaults are the original local paths; set the
# RESUME_INPUT_DIR / RESUME_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
input_folder = os.environ.get(
    "RESUME_INPUT_DIR",
    r"C:\Users\polpi\Desktop\data science\project\docker_project\resumes",
)
output_folder = os.environ.get(
    "RESUME_OUTPUT_DIR",
    r"C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt",
)

# Run the function
convert_files_to_txt(input_folder, output_folder)




ModuleNotFoundError: No module named 'PyPDF2'

In [7]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.
