In [1]:
from my_database5 import Present_Location, state_district

In [4]:
import glob
import os
import shutil
import re
from docx import Document
from openpyxl import Workbook
import spacy
from spacy.matcher import Matcher

# Function to extract the name from the CV
# Holds the spaCy pipeline and matcher so they are built once per process.
_NAME_PIPELINE_CACHE = {}


def _get_name_pipeline():
    """Return the shared ``(nlp, matcher)`` pair, building it on first use.

    ``spacy.load`` reads the model from disk, so doing it once instead of
    once per CV avoids repeating that work for every file.
    """
    if 'nlp' not in _NAME_PIPELINE_CACHE:
        nlp = spacy.load('en_core_web_sm')
        matcher = Matcher(nlp.vocab)
        # Heuristic: a name is taken to be two consecutive proper nouns
        # (first name followed by last name).
        matcher.add('NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])
        _NAME_PIPELINE_CACHE['matcher'] = matcher
        _NAME_PIPELINE_CACHE['nlp'] = nlp
    return _NAME_PIPELINE_CACHE['nlp'], _NAME_PIPELINE_CACHE['matcher']


def extract_name(resume_text):
    """Guess the candidate's name from the text of a CV.

    Args:
        resume_text: Plain text of the CV.

    Returns:
        The text of the first pair of consecutive proper nouns reported by
        the matcher, or ``None`` when no such pair is found.
    """
    nlp, matcher = _get_name_pipeline()
    nlp_text = nlp(resume_text)

    # Only the first reported match is used; later matches are ignored.
    for _match_id, start, end in matcher(nlp_text):
        return nlp_text[start:end].text

    return None


def find_present_location(text, Present_Location):
    """Return the known locations that occur in *text*, ignoring case.

    Args:
        text: CV text to search.
        Present_Location: Iterable of location names (strings) to look for.

    Returns:
        A list of the entries of ``Present_Location`` (original spelling,
        original order) whose lowercase form is a substring of the
        lowercased text. Empty when nothing matches.
    """
    # Lowercase the text once instead of once per candidate location.
    haystack = text.lower()

    # NOTE: this is a plain substring test, so a short location name also
    # matches when it appears inside a longer word.
    return [
        location
        for location in Present_Location
        if location.lower() in haystack
    ]


import re

def extract_age(text, current_year=None):
    """Estimate the candidate's age from the earliest four-digit number in *text*.

    The smallest standalone four-digit number found in the text is treated
    as the birth year, and the age is ``current_year`` minus that year.

    Args:
        text: CV text to scan.
        current_year: Year to measure the age against. Defaults to the
            current calendar year, so results do not go stale.

    Returns:
        The age as an ``int`` when it falls within 18-60 (inclusive);
        the string ``"D.O.B. is not within the valid age range (18-60)."``
        when it does not; or ``"Age not found."`` when the text contains no
        standalone four-digit number.
    """
    # Local import keeps this function self-contained within the file.
    import datetime

    if current_year is None:
        current_year = datetime.date.today().year

    # Standalone four-digit numbers only (longer digit runs are excluded by \b).
    candidate_years = [int(num) for num in re.findall(r'\b\d{4}\b', text)]
    if not candidate_years:
        return "Age not found."

    # With a single candidate, min() is that candidate, so one code path
    # covers both the one-number and the many-numbers case.
    age = current_year - min(candidate_years)

    if 18 <= age <= 60:
        return age
    return "D.O.B. is not within the valid age range (18-60)."


import re

def extract_emails(text):
    """Collect every e-mail address found in *text*.

    Three spellings are recognised: the usual ``user@host.tld``, the
    obfuscated ``user(at)host.tld``, and the spelled-out
    ``user at host dot tld``.

    Returns:
        A list of the matched strings, in the order they appear in *text*
        (empty when there are none).
    """
    alternatives = (
        # Plain address
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # "(at)" in place of "@"
        r"\b[A-Za-z0-9._%+-]+\(at\)[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # " at " and " dot " written out as words
        r"\b[A-Za-z0-9._%+-]+ at [A-Za-z0-9.-]+ dot [A-Za-z]{2,}\b",
    )

    # One alternation, tried left to right at each position of the text.
    combined = re.compile("|".join(alternatives))
    return combined.findall(text)





import re

def extract_mobile_number(text):
    """Extract mobile-number-like strings from *text*.

    Args:
        text: CV text to scan.

    Returns:
        A list of matched numbers in order of appearance, each with
        surrounding whitespace removed. Empty when nothing matches.
    """
    # Strip zero-width characters BEFORE matching so that an invisible
    # character sitting inside a number cannot split it.
    cleaned_text = re.sub(r'[\u200b-\u200d\uFEFF]', '', text)

    # Alternatives are tried left to right at each position, so the specific
    # '+'-prefixed layouts must come before the generic '+' pattern;
    # otherwise the generic one wins and the specific ones never match.
    phone_patterns = (
        r'\+[\d]{1,3}\s[\d]{5}\s[\d]{5}',  # +91 98765 43210
        r'\+\d{2}\s-\s\d{10}',             # +91 - 9876543210
        r'\+[\d\s]+',                      # generic '+' followed by digits/spaces
                                           # (also covers '+' plus 12 digits)
        r'\d{10}',                         # bare ten-digit number
    )
    mobile_number_pattern = '|'.join(phone_patterns)

    raw_matches = re.findall(mobile_number_pattern, cleaned_text)

    # The generic pattern can capture trailing whitespace, or a lone '+'
    # followed by a space (e.g. from "C++ "); trim and drop digit-less hits.
    trimmed = (match.strip() for match in raw_matches)
    return [number for number in trimmed if any(ch.isdigit() for ch in number)]


# Driver script: reads every CV in the source folder, extracts name,
# location, age, e-mail and mobile number from its text, copies the CV into
# the destination folder under a numbered name, and records one row per CV
# in an Excel workbook.

# Set the source folder path where the CV files are located
source_folder = "C:\\Users\\ASUS\\Desktop\\cv"

# Set the destination folder path where you want to save the extracted CVs and data
destination_folder = "C:\\Users\\ASUS\\Desktop\\cv_result"

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Use glob to get a list of all CV files in the source folder
cv_files = glob.glob(os.path.join(source_folder, "*.pdf")) + glob.glob(os.path.join(source_folder, "*.docx"))

# Create a new Excel workbook
workbook = Workbook()
worksheet = workbook.active

# Write headers to the worksheet
worksheet.append(["CV File", "Name", "present_location", "Age", "Email", "Mobile Number"])

# Iterate over each CV file; `i` numbers the files as listed by glob, so a
# skipped file still consumes its number.
for i, cv_file in enumerate(cv_files, start=1):
    # Extract only the CV name from the file path
    cv_name = os.path.basename(cv_file)

    # Skip temporary Word files
    if cv_name.startswith("~$"):
        print(f"Skipping file {cv_file} - temporary Word file.")
        continue

    # Read the content of the CV file. The extension is compared in
    # lowercase because glob can return upper-case extensions (".PDF") on
    # case-insensitive file systems, and those must not be skipped.
    _, ext = os.path.splitext(cv_file)
    ext = ext.lower()
    if ext == '.pdf':
        # NOTE(review): this decodes the raw PDF bytes as UTF-8 and drops
        # undecodable bytes; it does not parse the PDF. Text stored in
        # compressed streams will not appear in cv_content. A PDF text
        # extraction library is needed to fix this properly.
        with open(cv_file, 'rb') as file:
            cv_content = file.read().decode('utf-8', errors='ignore')
    elif ext == '.docx':
        # Only body paragraphs are read here.
        doc = Document(cv_file)
        paragraphs = [p.text for p in doc.paragraphs]
        cv_content = '\n'.join(paragraphs)
    else:
        print(f"Skipping file {cv_file} - unsupported file format.")
        continue

    # Extract the name from the CV content
    name = extract_name(cv_content)

    # Extract the location from the CV content
    location = find_present_location(cv_content, Present_Location)

    # Extract the age from the CV content (an int, or a message string)
    age = extract_age(cv_content)

    # Extract the email addresses from the CV content
    emails = extract_emails(cv_content)
    if not emails:
        emails = ["N/A"]

    # Extract the mobile number from the CV content
    mobile_number = extract_mobile_number(cv_content)
    if not mobile_number:
        mobile_number = ["N/A"]

    # Build the copy's filename from the running number and the original
    # name. cv_name already carries the real extension, so nothing is
    # appended: forcing ".pdf" mislabelled .docx files and doubled the
    # suffix on PDFs.
    filename = f"cv{i}={cv_name}"

    # Set the destination path for the copied CV
    destination_path = os.path.join(destination_folder, filename)

    # Copy the CV file to the destination folder with the new filename
    shutil.copyfile(cv_file, destination_path)

    # Write the CV file name, name, location, age, email, and mobile number to the worksheet
    worksheet.append([cv_name, name, ", ".join(location), age, ", ".join(emails), ", ".join(mobile_number)])

# Save the workbook as an Excel file
excel_file = os.path.join(destination_folder, "cv_data0.xlsx")
workbook.save(excel_file)

# Print the path to the Excel file
print(f"CV data saved to: {excel_file}")


CV data saved to: C:\Users\ASUS\Desktop\cv_result\cv_data0.xlsx
