In [2]:
# Import necessary libraries
import re  # Regular expression library for pattern matching
import os  # Operating system library for file operations
from openpyxl import Workbook  # Library for creating Excel files
import win32com.client  # Library for interacting with Windows applications
import unicodedata  # Library for Unicode character handling


# Function to extract email and phone number from text using regular expressions
def extract_contact_info(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    phone_pattern = r'\b\d{10}\b'  # Match 10 consecutive digits for phone number
    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)
    return emails, phones

# Function to extract text from DOC
def extract_text_from_doc(doc_file):
    try:
        word = win32com.client.Dispatch("Word.Application")
        doc = word.Documents.Open(doc_file)
        text = doc.Content.Text
        doc.Close()
        word.Quit()
        return text
    except Exception as e:
        print(f"Error occurred while extracting text from DOC file: {e}")
        return ""

def sanitize_text(text):
    # Define a function to check if a character is printable
    def is_printable(char):
        category = unicodedata.category(char)
        return category not in {'Cc', 'Cf', 'Cs', 'Co', 'Cn'}

    # Remove illegal and non-printable characters
    illegal_characters = ['*', '/', ':', '?', '[', ']', '\\', '|', '<', '>', '=', ',', "'", '"']
    sanitized_text = ''.join(char for char in text if char not in illegal_characters and is_printable(char))
    
    return sanitized_text

# Example usage
def main(doc_paths):
    extracted_info = []

    for doc_path in doc_paths:
        doc_file = os.path.abspath(doc_path)
        if not os.path.exists(doc_file):
            print(f"DOC file not found: {doc_path}")
            continue

        text = extract_text_from_doc(doc_file)
        text = sanitize_text(text)
        print(f"Text extracted from {doc_file}: {text}")

        emails, phones = extract_contact_info(text)
        print(f"Emails extracted: {emails}")
        print(f"Phones extracted: {phones}")

        extracted_info.append({"File": doc_path, "Emails": emails, "Phones": phones, "Text": text})

    # Write the extracted information to an Excel file on desktop
    desktop_path = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
    output_file = os.path.join(desktop_path, "extracted_cv(doc)_info.xlsx")
    
    wb = Workbook()
    ws = wb.active
    ws.append(["File", "Emails", "Phones", "Text"])
    for info in extracted_info:
        ws.append([info["File"], ", ".join(info["Emails"]), ", ".join(info["Phones"]), info["Text"]])
    wb.save(output_file)
    print(f"Extracted information saved to: {output_file}")

if __name__ == "__main__":
    doc_paths = ["C:/Users/Hrishikesh/Desktop/rohan/Sample2/Satyadev.doc",
                "C:/Users/Hrishikesh/Desktop/rohan/Sample2/RohitBhatt.doc"]  # List of DOC file paths
    main(doc_paths)


Text extracted from C:\Users\Hrishikesh\Desktop\rohan\Sample2\Satyadev.doc: SatyadevCredit Manager at Canara Bank 9y 6m  Lucknow  11.0 Lacs 7043876913satyadev.kamal@yahoo.com“ Seasoned professional with 9 years of experience. Expert in Credit AppraisalWorking CapitalCredit RatingCredit AnalysisCredit Monitoring WORK SUMMARY9+ experience in banking . I am having good ability to handle customer and complete all target in all year Industry  Banking Functional Area  Finance & Accounting Role  Finance & Accounting - Other WORK EXPERIENCECredit Manager May 14 - till date Canara BankI am working as credit officer in Canara bank. and I am having good knowledge in loan finance EDUCATION DETAILSU.G.B.TechB.E. (Computers) 2012 Uttar Pradesh Technical University LucknowDESIRED JOB DETAILSJob Type  Permanent Employment Status  Full time US Work Status  Not Mentioned Preferred Location  KanpurLucknowDelhi  NCR PERSONAL DETAILSName  Satyadev Gender  Male Marital Status  Married Date of Birth  4 Oct 1