# In [6]:
import glob
import spacy
import os
import shutil
import re
from docx import Document
from openpyxl import Workbook

# Function to extract the age from the CV
import re

def extract_age(text, *, current_year=None, min_age=18, max_age=60):
    """Estimate a candidate's age from the four-digit numbers in CV text.

    The smallest standalone four-digit number found in ``text`` is treated as
    the birth year, and the age is ``current_year`` minus that number.

    Args:
        text: Raw text of the CV. Empty or ``None`` text yields
            ``"Age not found."``.
        current_year: Year the age is measured against. Defaults to the
            current calendar year so results do not go stale the way a
            hard-coded year would.
        min_age: Lowest age (inclusive) accepted as valid.
        max_age: Highest age (inclusive) accepted as valid.

    Returns:
        The age as an ``int`` when it lies within ``min_age``..``max_age``;
        the string ``"D.O.B. is not within the valid age range (18-60)."``
        (with the configured bounds) when it does not; or the string
        ``"Age not found."`` when ``text`` contains no four-digit number.
    """
    if not text:
        return "Age not found."

    if current_year is None:
        # Imported locally so this function does not depend on a
        # module-level import that the rest of the file does not have.
        from datetime import date
        current_year = date.today().year

    # \b on both sides keeps longer digit runs (e.g. "12345") from matching.
    candidate_years = [int(match) for match in re.findall(r'\b\d{4}\b', text)]
    if not candidate_years:
        return "Age not found."

    # The earliest year is assumed to be the birth year; later ones are
    # typically education or employment dates.
    age = current_year - min(candidate_years)
    if min_age <= age <= max_age:
        return age
    return f"D.O.B. is not within the valid age range ({min_age}-{max_age})."


# Batch job: read every PDF/DOCX CV in the source folder, estimate the
# candidate's age with extract_age(), copy the CV to the destination folder
# under an indexed name, and record (file name, age) in an Excel workbook.

# Set the source folder path where the CV files are located
source_folder = "C:\\Users\\ASUS\\Downloads\\300+ CV"

# Set the destination folder path where you want to save the extracted CVs and data
destination_folder = "C:\\Users\\ASUS\\Desktop\\cv_result"

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Use glob to get a list of all CV files in the source folder
cv_files = glob.glob(os.path.join(source_folder, "*.pdf")) + glob.glob(os.path.join(source_folder, "*.docx"))

# Create a new Excel workbook
workbook = Workbook()
worksheet = workbook.active

# Write headers to the worksheet
worksheet.append(["CV File", "Age"])

# Iterate over each CV file
for i, cv_file in enumerate(cv_files, start=1):
    # Extract only the CV name from the file path
    cv_name = os.path.basename(cv_file)

    # Skip temporary Word files
    if cv_name.startswith("~$"):
        print(f"Skipping file {cv_file} - temporary Word file.")
        continue

    # Compare the extension case-insensitively: on Windows glob also returns
    # files such as "CV.PDF", which an exact '.pdf' comparison would reject.
    _, ext = os.path.splitext(cv_file)
    ext = ext.lower()
    if ext == '.pdf':
        # NOTE(review): this decodes the raw PDF bytes rather than extracting
        # the document text. PDF content is usually stored in compressed
        # streams, so four-digit numbers found here may come from the file
        # structure instead of the CV - consider a real PDF text extractor.
        with open(cv_file, 'rb') as file:
            cv_content = file.read().decode('utf-8', errors='ignore')
    elif ext == '.docx':
        # NOTE(review): only paragraph text is read; text inside tables is
        # presumably not included - verify against python-docx.
        doc = Document(cv_file)
        paragraphs = [p.text for p in doc.paragraphs]
        cv_content = '\n'.join(paragraphs)
    else:
        print(f"Skipping file {cv_file} - unsupported file format.")
        continue

    # Extract the age from the CV content
    age = extract_age(cv_content)

    # Prefix the original file name with its index. cv_name already carries
    # the real extension, so nothing is appended (a forced ".pdf" suffix
    # would mislabel .docx files and double up on PDFs).
    filename = f"cv{i}={cv_name}"

    # Set the destination path for the copied CV
    destination_path = os.path.join(destination_folder, filename)

    # Copy the CV file to the destination folder with the new filename
    shutil.copyfile(cv_file, destination_path)

    # Write the CV file name and age to the worksheet
    worksheet.append([cv_name, age])

# Save the workbook as an Excel file
excel_file = os.path.join(destination_folder, "cv_300+_NEW_CV.xlsx")
workbook.save(excel_file)

# Print the path to the Excel file
print(f"CV data saved to: {excel_file}")


# Output:
# CV data saved to: C:\Users\ASUS\Desktop\cv_result\cv_300+_NEW_CV.xlsx
