<a href="https://colab.research.google.com/github/SowjanyaC107/Mini_Project-AI-Resume-Screening-/blob/main/ResumeScreeningSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyPDF2 python-docx spacy sklearn pandas
!python -m spacy download en_core_web_sm


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/e

In [9]:
# Install the required package
!pip install PyPDF2

# Restart the kernel (or runtime environment)
# (You may need to use the "Kernel" menu or the runtime controls provided by your platform like Colab or Jupyter Notebook)

# Once the kernel is restarted, run the following cell with your import statements:
!pip install python-docx
import docx
import PyPDF2
import docx
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [29]:
def parse_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text


In [30]:
def parse_docx(file_path):
    doc = docx.Document(file_path)
    text = " ".join([paragraph.text for paragraph in doc.paragraphs])
    return text


In [31]:
def parse_resume(file_path):
    if file_path.endswith('.pdf'):
        return parse_pdf(file_path)
    elif file_path.endswith('.docx'):
        return parse_docx(file_path)
    else:
        raise ValueError("Unsupported file format")


In [49]:
nlp = spacy.load('en_core_web_sm')

def extract_info(text):
    doc = nlp(text)
    skills = [ent.text for ent in doc.ents if ent.label_ in ['SKILL', 'TECH']]
    return {
        "text": text
    }


In [50]:
def compute_relevance(resumes, job_description):
    texts = [job_description] + resumes
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)

    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return scores


In [51]:
def rank_candidates(resumes, scores):
    ranked = sorted(zip(resumes, scores), key=lambda x: x[1], reverse=True)
    return ranked


In [64]:
def main(resume_files, job_description):
    parsed_resumes = [parse_resume(file) for file in resume_files]
    extracted_info = [extract_info(text) for text in parsed_resumes]

    scores = compute_relevance([info["text"] for info in extracted_info], job_description)
    # Convert scores to percentages
    scores_percentage = [round(score * 100, 2) for score in scores]  # Scale and round to 2 decimal places

    ranked = rank_candidates(extracted_info, scores)

    results = [{"name": os.path.basename(resume_files[i]),
                "relevance_score(%)": scores_percentage[i]}
               for i in range(len(resume_files))]

    return pd.DataFrame(results).sort_values(by="relevance_score(%)", ascending=False)


In [65]:
job_description = """
Looking for a Software Engineer with expertise in Python, Data Structures, and html,css.
"""

resume_files = ["/content/Manasa .pdf","/content/Alvin.pdf","/content/Godwin.pdf","/content/Steve.pdf","/content/Rakshitha.pdf","/content/Riya.pdf","/content/Ruth.pdf","/content/Kevin.pdf","/content/Sowjanya.pdf","/content/Rishi.docx","/content/Sahana.docx","/content/Tomlin.pdf","/content/Siri.docx","/content/Steve.pdf","/content/Trisha.pdf","/content/Thrupthi.pdf","/content/Yashaswini.pdf","/content/caroline.pdf"]  # Replace with your resume files
results = main(resume_files, job_description)
print(results)


              name  relevance_score(%)
4    Rakshitha.pdf               18.13
16  Yashaswini.pdf               17.52
8     Sowjanya.pdf               13.40
14      Trisha.pdf               11.05
3        Steve.pdf                4.99
13       Steve.pdf                4.99
0      Manasa .pdf                3.79
12       Siri.docx                2.68
1        Alvin.pdf                2.56
7        Kevin.pdf                1.38
15    Thrupthi.pdf                0.00
9       Rishi.docx                0.00
11      Tomlin.pdf                0.00
10     Sahana.docx                0.00
6         Ruth.pdf                0.00
5         Riya.pdf                0.00
2       Godwin.pdf                0.00
17    caroline.pdf                0.00


In [66]:
results.to_csv("ranked_candidates.csv", index=False)
