In [12]:
# Import necessary libraries
import re          # Regular expression operations for pattern matching and string manipulation
import PyPDF2      # Library for reading and manipulating PDF files
from openpyxl import load_workbook, Workbook  # Libraries for working with Excel files
from pathlib import Path  # Module providing classes representing filesystem paths
import os          # Operating system interfaces for file handling and directory operations
import docx        # Library for creating and updating Microsoft Word (.docx) files

# Function to extract email and phone number from text using regular expressions
def extract_contact_info(text):
    """Find e-mail addresses and 10-digit phone numbers in *text*.

    Args:
        text: Raw text to scan. ``None`` or an empty string yields no matches.

    Returns:
        A ``(emails, phones)`` tuple; each element is a list of the matched
        strings in order of appearance (duplicates are kept).
    """
    if not text:
        return [], []
    # The TLD class is [A-Za-z]; the earlier [A-Z|a-z] also accepted a literal
    # "|" and so could glue trailing text such as "|org" onto an address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # Exactly ten consecutive digits bounded by word boundaries, so longer
    # digit runs are not partially matched.
    phone_pattern = r'\b\d{10}\b'
    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)
    return emails, phones

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_file: Path of the PDF to read; it is opened in binary mode.

    Returns:
        The text of all pages joined in page order, with no separator
        between pages.
    """
    with open(pdf_file, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open. Iterating the
        # pages directly replaces the index loop, and `or ""` keeps the join
        # safe if a page yields no text.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Function to extract text from DOCX
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, one line per body paragraph.

    Args:
        docx_file: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        Every paragraph's text followed by a newline, joined in document order.
    """
    document = docx.Document(docx_file)
    return "".join(f"{para.text}\n" for para in document.paragraphs)

# Function to extract text from CV file (PDF, DOCX, DOC)
def extract_text_from_cv(cv_file):
    """Extract the text of a CV file, dispatching on its extension.

    Args:
        cv_file: ``pathlib.Path`` (or path string) of the CV. The extension
            is compared case-insensitively.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files. For ``.doc`` and
        any other extension a message is printed and ``""`` is returned, so
        callers always receive a string.
    """
    cv_file = Path(cv_file)  # accept plain strings as well as Path objects
    file_ext = cv_file.suffix.lower()
    if file_ext == '.pdf':
        return extract_text_from_pdf(cv_file)
    if file_ext == '.docx':
        return extract_text_from_docx(cv_file)
    if file_ext == '.doc':
        # No reader for the legacy binary format is available in this file;
        # return an empty string rather than None so downstream regex
        # matching and worksheet writing keep working.
        print(f"Legacy .doc files are not supported (convert to .docx or .pdf): {cv_file}")
        return ""
    print(f"Unsupported file format: {file_ext}")
    return ""

# Example usage
def main(cv_paths):
    """Extract contact details from each CV and append them to a workbook.

    The workbook ``extracted_cv(pdf,docx)_info.xlsx`` lives in the current
    user's ``Desktop`` folder. It is created with a header row when missing;
    otherwise new rows are appended to its active sheet.

    Args:
        cv_paths: Iterable of CV file paths (strings or ``Path`` objects).
            Paths that do not exist are reported and skipped.
    """
    # Path.home() works on every platform, whereas os.environ['USERPROFILE']
    # raises KeyError outside Windows.
    desktop_path = Path.home() / "Desktop"
    desktop_path.mkdir(parents=True, exist_ok=True)
    output_file = str(desktop_path / "extracted_cv(pdf,docx)_info.xlsx")

    if os.path.exists(output_file):
        wb = load_workbook(output_file)
    else:
        wb = Workbook()
        wb.active.append(["File", "Emails", "Phones", "Text"])

    ws = wb.active

    # Control characters that openpyxl refuses to store in a cell
    # (IllegalCharacterError); PDF extraction can emit them.
    illegal_chars = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

    for cv_path in cv_paths:
        cv_file = Path(cv_path)
        if not cv_file.exists():
            print(f"CV file not found: {cv_path}")
            continue

        # Normalise a missing result to "" so the steps below always get a str.
        text = extract_text_from_cv(cv_file) or ""
        print(f"Text extracted from {cv_file}: {text}")

        emails, phones = extract_contact_info(text)
        print(f"Emails extracted: {emails}")
        print(f"Phones extracted: {phones}")

        # str() lets callers pass Path objects, which a cell cannot hold.
        ws.append([
            str(cv_path),
            ", ".join(emails),
            ", ".join(phones),
            illegal_chars.sub("", text),
        ])

    wb.save(output_file)
    print(f"Extracted information saved to: {output_file}")


# Script entry point: process a fixed set of sample CVs when run directly.
if __name__ == "__main__":
    # Sample CV files (two DOCX, two PDF) to run through the extractor.
    cv_paths = [
        "C:/Users/Hrishikesh/Desktop/rohan/Sample2/AnamRehman.docx",
        "C:/Users/Hrishikesh/Desktop/rohan/Sample2/AarushiRohatgi.pdf",
        "C:/Users/Hrishikesh/Desktop/rohan/Sample2/AkashGoel.docx",
        "C:/Users/Hrishikesh/Desktop/rohan/Sample2/AnanyaDas.pdf",
    ]
    main(cv_paths)


Text extracted from C:\Users\Hrishikesh\Desktop\rohan\Sample2\AnamRehman.docx: +91 8586089916 

anamr894@gmail.com 

Anam Rehman
 
SKILLS 
Credit Risk and Analysis, Financial analysis, Financial spreading, Business research, Microsoft excel, Databases – Equifax, Experian, DNBi, LexisNexis, Bloomberg, Capital IQ, Factset etc. 
Personal – Teamwork, time management, focus and attention to details

EXPERIENCE 
Genpact India Pvt. Ltd., Gurugram — Assistant Manager (FLM)
February 2018 – Present  
Credit Risk Analysis – Auto Finance 
Working with a leading US Financial Services company for its car finance division 
Front Line Manager leading a team of 8 members helping achieve the daily/monthly goals for the team 
Review New applications received from Auto Dealerships completing the due diligence, interpreting commercial and consumer bureau reports as well as analysing the financials 
Complete renewals/additional requests for existing dealerships
QC reviews for New Applications, Renewals and 