# In [1]:
import os
import glob
import shutil
import re
from docx import Document
from openpyxl import Workbook

import re

def extract_mobile_number(text):
    """Return every phone-number-like substring found in *text*.

    Recognised shapes (tried in this order at each position):

    * ``+CC-AAAA-NNNNN`` with optional single whitespace around the dashes
    * ``+CC - NNNNNNNNNN``
    * ``+`` followed by digits, optionally separated by spaces on one line
    * ``91-NNNNNNNNNN``
    * a bare run of 9 to 11 digits

    Args:
        text: The text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched numbers as strings, in order of appearance, with
        any ``+`` removed and surrounding whitespace stripped.
    """
    if not text:
        return []

    # Zero-width characters can sit invisibly inside a number and split it
    # in two, so they are dropped *before* matching rather than after.
    text = re.sub(r'[\u200b-\u200d\uFEFF]', '', text)

    # Order matters: regex alternation takes the first alternative that
    # matches, so more specific / longer shapes must come before shorter ones
    # (otherwise a 10-digit number is truncated by a 9-digit pattern).
    patterns = (
        r'\+\d{1,3}\s?-\s?\d{3,5}\s?-\s?\d{5,8}',
        r'\+\d{2}\s-\s\d{10}',
        # '+' must be followed by a digit (so "C++ " is not a number), and
        # digit groups may only be separated by same-line whitespace, so a
        # match never runs onto the next line or ends in whitespace.
        r'\+\d(?:[^\S\r\n]*\d)*',
        r'91-\d{10}',
        # Greedy: prefers 11 digits, then 10, then 9.
        r'\d{9,11}',
    )
    mobile_number_pattern = '|'.join(patterns)

    matches = re.findall(mobile_number_pattern, text)

    # Drop the leading '+' so the values are stored as plain text.
    return [match.replace('+', '').strip() for match in matches]




# Batch job: copy every PDF/DOCX CV from the source folder into the
# destination folder under a numbered name, and record the phone numbers
# found in each CV in an Excel sheet saved alongside the copies.

# Set the source folder path where the CV files are located
source_folder = "C:\\Users\\ASUS\\Downloads\\300+ New_CV"

# Set the destination folder path where you want to save the copied CVs and data
destination_folder = "C:\\Users\\ASUS\\Desktop\\cv_result"

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Use glob to get a list of all CV files in the source folder
cv_files = glob.glob(os.path.join(source_folder, "*.pdf")) + glob.glob(os.path.join(source_folder, "*.docx"))

# Create a new Excel workbook
workbook = Workbook()
worksheet = workbook.active

# Write headers to the worksheet
worksheet.append(["CV File", "Mobile Number"])

# Iterate over each CV file
for i, cv_file in enumerate(cv_files, start=1):
    # Extract only the CV name from the file name
    cv_name = os.path.basename(cv_file)

    # Skip temporary Word files (Word's "~$name.docx" lock files)
    if cv_name.startswith("~$"):
        print(f"Skipping file {cv_file} - temporary Word file.")
        continue

    # Compare the extension case-insensitively: on Windows the glob above
    # also returns files named e.g. "CV.PDF" or "CV.DOCX".
    ext = os.path.splitext(cv_file)[1].lower()
    if ext == '.pdf':
        # NOTE(review): this decodes the raw PDF bytes instead of extracting
        # the document text; numbers stored in compressed content streams
        # will presumably not be found. Consider a PDF text-extraction library.
        with open(cv_file, 'rb') as file:
            cv_content = file.read().decode('utf-8', errors='ignore')
    elif ext == '.docx':
        doc = Document(cv_file)
        cv_content = '\n'.join(p.text for p in doc.paragraphs)
    else:
        print(f"Skipping file {cv_file} - unsupported file format.")
        continue

    # Extract the mobile number(s) from the CV content
    mobile_number = extract_mobile_number(cv_content)
    if not mobile_number:
        mobile_number = ["N/A"]

    # Number the copy but keep the original name and extension, so a .docx
    # is not saved with a misleading ".pdf" suffix.
    filename = f"cv{i}={cv_name}"

    # Set the destination path for the copied CV
    destination_path = os.path.join(destination_folder, filename)

    # Copy the CV file to the destination folder
    shutil.copy2(cv_file, destination_path)

    # Write the CV name and mobile number(s) to the worksheet
    worksheet.append([cv_name, ", ".join(mobile_number)])

# Save the workbook as an Excel file
excel_file = os.path.join(destination_folder, "cv_data.xlsx")
workbook.save(excel_file)

# Print the path to the Excel file
print(f"CV data saved to: {excel_file}")


# Output: CV data saved to: C:\Users\ASUS\Desktop\cv_result\cv_data.xlsx
