In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import fitz
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Step 1: Load and preprocess resumes data
def load_and_preprocess_data(resumes_path):
    resumes_df = []
    for resume in resumes_path:
        text = ""
        with fitz.open(resume) as pdf_document:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                text += page.get_text()
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [word.lower() for word in words if word.isalnum()]
        words = [word for word in words if word not in stop_words]
        text = ' '.join(words)
        print(text)
        resumes_df.append({'resume_text': text})
    resumes_df = pd.DataFrame(resumes_df)
    return resumes_df

# Step 2: Vectorize the resumes using CountVectorizer
def vectorize_resumes(resumes_text):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(resumes_text)
    return X

# Step 3: Apply Latent Dirichlet Allocation (LDA) for topic modeling
def apply_lda(X, num_topics):
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)
    return lda

# Step 4: Extract topics for each resume
def extract_topics(lda_model, X):
    topic_weights = lda_model.transform(X)
    return topic_weights

# Step 5: Candidate Ranking based on similarity
def rank_candidates(candidate_vector, resumes_vectors):
    similarities = cosine_similarity(candidate_vector.reshape(1, -1), resumes_vectors)
    ranked_indices = similarities.argsort()[0][::-1]  # Sort in descending order
    return ranked_indices

# Main function
def main():
    # Step 1: Load and preprocess data
    resumes_path = ['sample.pdf', 'sample2.pdf']
    resumes_df = load_and_preprocess_data(resumes_path)

    # Step 2: Vectorize the resumes using CountVectorizer
    resumes_text = resumes_df['resume_text'].tolist()
    X = vectorize_resumes(resumes_text)

    # Step 3: Apply Latent Dirichlet Allocation (LDA)
    num_topics = 5  # You can adjust the number of topics based on your requirements
    lda_model = apply_lda(X, num_topics)

    # Step 4: Extract topics for each resume
    topic_weights = extract_topics(lda_model, X)
    print(topic_weights)

    # Step 5: Candidate Ranking
    candidate_resume_text = "Your candidate's resume text goes here"
    candidate_vector = vectorize_resumes([candidate_resume_text])
    ranked_indices = rank_candidates(candidate_vector, X)

    # Display the ranked candidates
    for i, idx in enumerate(ranked_indices):
        print(f"Rank {i + 1}: Candidate {idx}, Similarity: {ranked_indices[0][idx]}")

if __name__ == "__main__":
    main()


madhava reddy creative programmer madhavso2018 9347156120 vijayawada india summary dedicated computer science engineering student strong passion data analysis machine learning web development seeking position apply technical skills innovative mindset solve complex problems tribute organization success education computer science engineering aiml nri institute technology 2020 2024 pothavarappadu cgpa board intermediate education ap mpc sri chaitanya educational institutions 2018 2020 vijayawada cgpa andhra pradesh board secondary education high school 2017 2018 vijayawada cgpa projects data extractor tripadvisor reviews eapcet flipkart academic performance cases prediction using machine learning developed machine learning models predict cases aiding proactive public health measures online prices monitor bot designed implemented price monitoring telegram bot skills technical skills programming languages python java c data analysis pandas numpy web development html css javascript database 

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 6 while Y.shape[1] == 298

In [4]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
import pandas as pd
import fitz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Data Loading and Preprocessing
def load_data(job_description_file, resumes_file):
    resumes_df = []
    stop_words = set(stopwords.words('english'))
    for resume in resumes_file:
        text = ""
        with fitz.open(resume) as pdf_document:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                text += page.get_text()
        words = word_tokenize(text)
        words = [word.lower() for word in words if word.isalnum()]
        words = [word for word in words if word not in stop_words]
        text = ' '.join(words)
        print(text)
        resumes_df.append({'resume_name':resume, 'resume_text': text})
    resumes_df = pd.DataFrame(resumes_df)

    job_descriptions = ""
    with fitz.open(job_description_file) as job_description_file:
        for page_number in range(job_description_file.page_count):
            page = job_description_file[page_number]
            job_descriptions += page.get_text()
    words = word_tokenize(job_descriptions)
    words = [word.lower() for word in words if word.isalnum()]
    words = [word for word in words if word not in stop_words]
    job_descriptions = ' '.join(words)

    return job_descriptions, resumes_df

# Job Role Analysis
def analyze_job_role(job_description, stop_words):
    # Use TF-IDF to extract important keywords from the job description

    # Combine the job description with existing stop words
    stop_words.update(["role", "responsibilities", "skills", "experience", "qualifications"])  # Add additional custom stop words
    job_description = ' '.join([word for word in job_description.lower().split() if word not in stop_words])

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the job description
    tfidf_matrix = tfidf_vectorizer.fit_transform([job_description])

    # Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Get the TF-IDF scores for each feature
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Create a dictionary of feature names and their corresponding TF-IDF scores
    keyword_scores = dict(zip(feature_names, tfidf_scores))

    # Sort the dictionary by TF-IDF scores in descending order
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)

    # Extract the top N keywords (adjust N based on your requirements)
    top_keywords = [keyword for keyword, score in sorted_keywords[:20]]
    print(top_keywords)
    return top_keywords

# Candidate Resume Analysis
def analyze_candidate_resume(resume, stop_words):
    # Use TF-IDF to extract important information and skills from the candidate's resume

    # Combine the resume with existing stop words
    stop_words.update(["phone", "email", "address", "linkedin", "github"])  # Add additional custom stop words
    resume = ' '.join([word for word in resume.lower().split() if word not in stop_words])

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the resume
    tfidf_matrix = tfidf_vectorizer.fit_transform([resume])

    # Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Get the TF-IDF scores for each feature
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Create a dictionary of feature names and their corresponding TF-IDF scores
    resume_features = dict(zip(feature_names, tfidf_scores))

    return resume_features

# Topic Modeling with LDA
def apply_lda(texts):
    # Tokenize and create a dictionary
    tokenized_texts = [text.lower().split() for text in texts]
    dictionary = corpora.Dictionary(tokenized_texts)
    print(dictionary)
    # Create a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
    print(corpus)
    # Apply LDA model
    num_topics = 5  # Adjust the number of topics based on your requirements
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    
    # Get the topics for each document in the corpus
    corpus_lda = lda_model[corpus]
    
    return lda_model, corpus_lda, dictionary

# Matching and Scoring
def calculate_similarity(job_keywords, resume_features, lda_model, corpus_lda, dictionary):
    # Calculate cosine similarity for TF-IDF or other features
    tfidf_vector_job = [resume_features.get(keyword, 0) for keyword in job_keywords]
    tfidf_vector_resume = [resume_features.get(keyword, 0) for keyword in job_keywords]  # Use the same keywords for both vectors
    
    tfidf_similarity = cosine_similarity([tfidf_vector_job], [tfidf_vector_resume])[0][0]

    lda_topic_job = lda_model[dictionary.doc2bow(job_keywords)]
    lda_topic_resume = lda_model[dictionary.doc2bow(job_keywords)]

    lda_similarity = len(set(lda_topic_job).intersection(lda_topic_resume)) / len(set(lda_topic_job).union(lda_topic_resume))
    final_score = 0.7 * tfidf_similarity + 0.3 * lda_similarity

    return final_score

# Ranking System
def rank_candidates(job_description, resumes, stop_words):
    job_keywords = analyze_job_role(job_description, stop_words)
    lda_model, corpus_lda, dictionary = apply_lda(job_description)

    ranking_scores = []
    for resume in resumes["resume_text"][:]:
        resume_features = analyze_candidate_resume(resume, stop_words)
        score = calculate_similarity(job_keywords, resume_features, lda_model, corpus_lda, dictionary)
        ranking_scores.append(score)
    ranked_candidates = sorted(enumerate(ranking_scores), key=lambda x: x[1], reverse=True)
    return ranked_candidates

job_description_file = 'sample_JD.pdf'
resumes_file = ["sample2.pdf","sample.pdf","sample3.pdf"]
job_description, resumes = load_data(job_description_file, resumes_file)
stop_words = set(stopwords.words('english'))
ranked_candidates = rank_candidates(job_description, resumes, stop_words)

print("Ranked Candidates:")
for idx, score in ranked_candidates:
    print(f"{resumes['resume_name'][idx]}: Score - {score}")


experience education skills certifications vijayawada india madalasaranya974 9666486245 saranya madala tech enthusiast collected structured entered vital data system orchestrated engaging project presentations delivering succinct updates senior staff thereby facilitating informed produced comprehensive reports outlining discoveries offering strategic recommendations blackbucks engineers pvt ltd prepared distributed reports presentations materials handled confidential documents maintained proper organization coordinated travel arrangements accommodations executives guests statskew computer science engineering aiml 2020 2024 mpc 2018 2020 programming languages java python c software development agile methodologies sdlc version control git database management sql mysql mongodb data scraping selenium beautifulsoup development html css javascript networking dns operating systems linux windows programming oop effective communication analytical proficiency python project data engineering comp

In [29]:
resumes

Unnamed: 0,resume_name,resume_text
0,sample2.pdf,experience education skills certifications vij...
1,sample.pdf,madhava reddy creative programmer madhavso2018...


In [40]:
ranking_scores = [0.9999999999999998, 1.0, 0.9999999999999998]

In [46]:
ranked_candidates = sorted(enumerate(ranking_scores), key=lambda x: x[1], reverse=True)