In [11]:
import os
import pdfplumber
import pandas as pd
import re

# Function to convert PDF to text
def convert_pdf_to_text(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to extract the "Education" section from the text
def extract_education_section(text):
    # Define patterns to identify the start and end of the Education section
    education_start_pattern = r'(Education|EDUCATION|education)'
    education_end_pattern = r'(Experience|EXPERIENCE|Skills|SKILLS|Certifications|CERTIFICATIONS|Projects|PROJECTS|References|REFERENCES|Honors|HONORS)'
    
    # Find the start of the education section
    start_match = re.search(education_start_pattern, text)
    
    if not start_match:
        return None  # No education section found
    
    start_index = start_match.start()
    
    # Find the end of the education section (next section heading)
    end_match = re.search(education_end_pattern, text[start_index:])
    end_index = end_match.start() + start_index if end_match else len(text)
    
    # Extract the education section
    education_section = text[start_index:end_index].strip()
    
    return education_section

# Function to process PDFs and extract text and education sections
def process_pdfs_to_extract_education(pdf_folder):
    data = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, filename)
            text = convert_pdf_to_text(pdf_path)
            education_section = extract_education_section(text)  # Extract the education section
            data.append({'Filename': filename, 'Text': text, 'Education': education_section})
    
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    return df

# Main execution
if __name__ == "__main__":
    # Set the path to the folder containing the PDF files
    pdf_folder = r'C:\Users\shaun.rolph\Desktop\DataResumes'  # <-- Update this with your PDF folder path
    
    # Process PDFs and create DataFrame with filename, full text, and education sections
    df = process_pdfs_to_extract_education(pdf_folder)
    df = df[df["Filename"].str.contains("Pages from Certificate number",case=False, na=False)]
    
    # Display the DataFrame
    print(df)
   

                                              Filename  \
0    Pages from Certificate number 20240919-CBKQ-01...   
1    Pages from Certificate number 20240919-CBKQ-01...   
2    Pages from Certificate number 20240919-CBKQ-01...   
3    Pages from Certificate number 20240919-CBKQ-01...   
4    Pages from Certificate number 20240919-CBKQ-01...   
..                                                 ...   
183  Pages from Certificate number 20240919-CBKQ-01...   
184  Pages from Certificate number 20240919-CBKQ-01...   
185  Pages from Certificate number 20240919-CBKQ-01...   
186  Pages from Certificate number 20240919-CBKQ-01...   
187  Pages from Certificate number 20240919-CBKQ-01...   

                                                  Text  \
0    ALEX ADEYEYE \n23359 Darst Field Trail \nRichm...   
1    Owen Bennett\n(248)778-7428 | ombjmb@gmail.com...   
2    David Mullens\n35 Flynn Dr.\nPensacola, FL 325...   
3    Fardokht Namiranian \n \nAddress: 1255 Eldridg...   
4    Syed Akb

In [12]:
   # Optionally, save the DataFrame to a CSV fil
    
df.to_csv(r'C:\Users\shaun.rolph\Desktop\DSResumeScrapeRobert\resumetext.csv', index=False) 

In [13]:
import pandas as pd
import re

# Function to score resumes based on keyword matches with regex optimization
def score_resume(text, keywords):
    # Ensure the text is treated as a string or an empty string if it is not
    if not isinstance(text, str):
        text = ""  # Convert non-string values to empty strings
    score = 0
    for keyword in keywords:
        # Use word boundaries to avoid partial matches and count occurrences
        pattern = re.compile(rf'\b{keyword}\b', re.IGNORECASE)
        matches = len(pattern.findall(text))
        score += matches
    return score

# Function to rank resumes based on criteria
def rank_resumes(df, keywords):
    # Add a new column to the DataFrame for the score
    df['Score'] = df['Text'].apply(lambda x: score_resume(x, keywords))
   
    # Sort the DataFrame by the score in descending order
    df = df.sort_values(by='Score', ascending=False)
   
    return df

# Main execution
if __name__ == "__main__":
    # Load the DataFrame (ensure file path is correct)
    df = pd.read_csv(r'C:\Users\shaun.rolph\Desktop\DSResumeScrapeRobert\resumetext.csv')

    # Define keywords relevant to a data scientist role
    keywords = [
        'machine learn', 'tableau', 'python', 'data analysis', 'deep learning',
        'statistical', 'sql', 'regression', 'neural network', 'predictive modeling'
    ]

    # Rank resumes based on relevance to the role
    ranked_df = rank_resumes(df, keywords)
   
    # Display the ranked DataFrame with all columns including the score
    print(ranked_df)
   
    # Optionally, save the ranked DataFrame to a new CSV file
    ranked_df.to_csv(r'C:\Users\shaun.rolph\Desktop\DSResumeScrapeRobert\resumetextwithscore.csv', index=False)

                                              Filename  \
178  Pages from Certificate number 20240919-CBKQ-01...   
41   Pages from Certificate number 20240919-CBKQ-01...   
166  Pages from Certificate number 20240919-CBKQ-01...   
19   Pages from Certificate number 20240919-CBKQ-01...   
152  Pages from Certificate number 20240919-CBKQ-01...   
..                                                 ...   
7    Pages from Certificate number 20240919-CBKQ-01...   
90   Pages from Certificate number 20240919-CBKQ-01...   
113  Pages from Certificate number 20240919-CBKQ-01...   
112  Pages from Certificate number 20240919-CBKQ-01...   
72   Pages from Certificate number 20240919-CBKQ-01...   

                                                  Text  \
178  LOGAN THOMAS\nRound Rock, TX 78665\nlogan@data...   
41    \nAugust 20, 2024 \nThe Hiring Manager, \nAgr...   
166  SONAL PATEL\n \nDear Sir/Madam, \nI am excited...   
19   CURRICULUM VITAE\nZHONGSHENG PENG, M.D., Ph. D...   
152  Michael 