In [5]:
import pandas as pd
import re
import subprocess  
import docx
import os
import nltk
from openpyxl import Workbook
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from pyresparser import ResumeParser
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

def read_docx(file_path):
            doc = docx.Document(file_path)
            full_text = []
            for para in doc.paragraphs:
             full_text.append(para.text)
            return "\n".join(full_text)

def extract_email_addresses(text):
    # Regular expression to match email addresses
    email_regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
    email_addresses = re.findall(email_regex, text)
    return email_addresses


def extract_phone_numbers(text):
    # Regular expression to match phone numbers
    phone_regex = r"\b(?:\+\d{1,2}\s?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b"
    phone_numbers = re.findall(phone_regex, text)
    return phone_numbers

def extract_years_of_experience(text):
    
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization 
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize  
    filtered_sentence = [w for w in word_tokens if not w in stop_words and wordnet_lemmatizer.lemmatize(w) not in stop_words] 
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    
    test = []
    
    for vp in list(cs.subtrees(filter=lambda x: x.label()=='P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:] for i, x in enumerate(test) if x and 'experience' in x.lower()]
    return x



def extract_experience_years(resume_text):
   
    # Regular expression to match years of experience
    experience_regex = r"(\d+)\s*\s*?(?:\s*year|yr|years|yrs)"
    matches = re.findall(experience_regex, resume_text, re.IGNORECASE)
    
    total_years = 0
    for match in matches:
        total_years += int(match)
    
    return total_years
    
def extract_skills(resume_text, skills_list):
    extracted_skills = []
    for skill in skills_list:
        if skill.lower() in resume_text.lower():
            extracted_skills.append(skill)
    return extracted_skills

def extract_tfidf(text):
    # Create a TF-IDF vectorizer
    lowercase_list = [item.lower() for item in skills_list]
    
    tfidf_scores =[]

    vectorizer = TfidfVectorizer(vocabulary= skills_list, lowercase=True)

     # Fit the vectorizer on the documents and transform them into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform(resume_texts)

     # Get feature names (terms)
    feature_names = vectorizer.get_feature_names_out()

     # Convert TF-IDF matrix to DataFrame for better visualization
    
    df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
        
        #sum_tfidf_scores = df_tfidf.sum(axis=0)
    
    #for row in df_tfidf.values:
       # tfidf_scores.append([int(value) for value in row])

    #total = sum(tfidf_scores)
    #print("KP",df_tfidf)
        
    return df_tfidf

    


skills_list = ['java', 'python', 'C', 'JAVASCRIPT', 'smartcomms']


email =[]
phone =[]
skills =[]
experience=[]
educations =[]
header= ["Email", "Phone", "Skills", "Experience", "years of Experience"]
expInyears =[]
tfidf_list =[]
def read_resume_update_excel(resume_path, excel_path):
   """
   Reads text from a resume file and updates specified fields in an existing Excel sheet.

   Args:
       resume_path (str): Path to the resume text file.
       excel_path (str): Path to the existing Excel sheet.

   Returns:
       None
   """

   

# Example usage:
#file_path = "C:/Users/Prave/Downloads/SortResume/resumes/"
#resume_text = read_docx(file_path)
path = r"C:\Users\Prave\Download\SortResume\ResumeDB\"
  
# Change the directory 
os.chdir(path) 
  
# Read text File 
  
  
resume_texts = []
tfidf_scores=[]
  
# iterate through all file 
for file in os.listdir(): 
    # Check whether file is in text format or not 
    if file.endswith(".docx"): 
        file_path = f"{path}\{file}"
  
        # call read text file function 
        
        resume_text = read_docx(file_path)
        
        resume_texts.append(resume_text.lower())
        
        email.append(', '.join(extract_email_addresses(resume_text)))
        phone.append(', '.join(extract_phone_numbers(resume_text)))
        skills.append(', '.join(extract_skills(resume_text, skills_list)))
        experience.append(', '.join(extract_years_of_experience(resume_text)))
        expInyears.append(extract_experience_years(resume_text))
       
   
    def write_data_to_excel(excel_file, headers, *lists):
        
        wb = Workbook()
        ws = wb.active
        
        # Write headers to the first row
        for col, header in enumerate(headers, start=1):
            ws.cell(row=1, column=col, value=header)
            
        
        
         # Iterate over the lists and write items to respective cells
        for col, data_list in enumerate(lists, start=1):
            for row, item in enumerate(data_list, start=2):
                ws.cell(row=row, column=col, value=item)
        

        # Save the workbook to the specified Excel file
        wb.save(excel_file)
    
    #skills = ', '.join(extract_skills(resume_text, skills_list))
    #experience = extract_years_of_experience(resume_text)

    #data = [  email, phone, skills, experience]
    
    
    #print (email)
    #print (skills)
   # print ("Test", extract_experience_years(resume_text))
    
    
    excel_file = r"C:\Users\Prave\Downloads\SortResume\resume.xlsx"
       # Save the updated DataFrame back to the Excel sheet
    #df.to_excel(r"C:\Users\Prave\Downloads\SortResume\resume.xlsx", index = False)
    write_data_to_excel(excel_file, header, email, phone, skills, experience, expInyears)
    #write_data_to_excel(excel_file,data)
    df_tfidf = pd.DataFrame(extract_tfidf(resume_texts))
    df_tfidf.to_excel(r"C:\Users\Prave\Downloads\SortResume\Tfidf_scores.xlsx", index=False)



       # Extract necessary information from the resume text (customize based on your needs)
       #name = extract_name(resume_text)

       
print("Resume information updated successfully in Excel sheet!")


#except FileNotFoundError as e:
      # print(f"Error: File not found: {e.filename}")
#except Exception as e:
      # print(f"An error occurred: {e}")
       
        







Resume information updated successfully in Excel sheet!




In [8]:
pip install docx2txt




In [7]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

resume_details_csv = r"C:\Users\Prave\Downloads\SortResume\resume.xlsx"  # Path to your CSV file
tfidf_scores_csv = r"C:\Users\Prave\Downloads\SortResume\Tfidf_scores.xlsx"
resumes_df = pd.read_excel(resume_details_csv)
tfidfScores_df = pd.read_excel(tfidf_scores_csv)
#print(resume_df.head())


feature_columns = ['java', 'python', 'JAVASCRIPT', 'smartcomms']

X = tfidfScores_df.drop('C', axis=1)  # Features
y = resumes_df['years of Experience']  # Labels

# Split data into train and test sets
#X_train, X_test, y_train, y_test = train_test_split(tfidf_scores_csv[feature_columns], resumes_df['years of Experience'], test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Rank resumes based on predicted probabilities or scores
resumes_df['score'] = classifier.predict_proba(tfidfScores_df[feature_columns])[:, 1]

# Select top 10 candidates
top_10_resumes = resumes_df.sort_values(by='score', ascending=True).head(3)
top_10_resumes.to_excel(r"C:\Users\Prave\Downloads\SortResume\SortedResumes.xlsx", index=False)
print(top_10_resumes)


Accuracy: 0.5
                          Email                   Phone  \
2      praveen.kale33@gmail.com  9059223961, 8499955008   
3  Harshitha.krishnan@gmail.com  7248726876, 5758245285   
4     Bhindu.Daruvuri@gmail.com  4561683143, 5645464645   

                            Skills  \
2      java, python, C, smartcomms   
3      java, python, C, smartcomms   
4  java, C, JAVASCRIPT, smartcomms   

                                          Experience  years of Experience  \
2   Summary Overall Experience,  Ernst,  HCL Expe...                   12   
3   Summary Overall Experience,  Ernst,  HCL Expe...                   13   
4   Summary Overall Experience,  Ernst,  HCL Expe...                   12   

      score  
2  0.002000  
3  0.002000  
4  0.036167  


In [22]:
pip install python-docx

Collecting python-docxNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for python-docx from https://files.pythonhosted.org/packages/5f/d8/6948f7ac00edf74bfa52b3c5e3073df20284bec1db466d13e668fe991707/python_docx-1.1.0-py3-none-any.whl.metadata
  Downloading python_docx-1.1.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
   ---------------------------------------- 0.0/239.6 kB ? eta -:--:--
   - -------------------------------------- 10.2/239.6 kB ? eta -:--:--
   ------ -------------------------------- 41.0/239.6 kB 487.6 kB/s eta 0:00:01
   ---------------------------------------  235.5/239.6 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 239.6/239.6 kB 2.1 MB/s eta 0:00:00
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0
