In [1]:
import PyPDF2
import docx2txt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this notebook relies on: the 'stopwords' corpus
# (read via stopwords.words) and the 'punkt' tokenizer models.
# Both calls hit the network / local nltk_data directory on every run.
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' for
# word_tokenize instead of 'punkt' - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The text extracted from each page, joined with no separator.
            A page that yields no text contributes an empty string.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        Exception: Anything ``PyPDF2.PdfReader`` or ``extract_text`` raises
            while parsing is propagated unchanged.
    """
    # The context manager guarantees the handle is closed even when parsing
    # raises part-way through the document.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe should a page return None rather than a
        # string; join also avoids repeated string concatenation in a loop.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
def extract_text_from_docx(file_path):
    """Extract the text content of a .docx document.

    Args:
        file_path: Path of the document, passed straight to
            ``docx2txt.process``.

    Returns:
        Whatever ``docx2txt.process`` returns for the file, unchanged.
    """
    return docx2txt.process(file_path)

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` and keep only alphabetic, non-stop-word tokens.

    The text is lower-cased before tokenization, so every returned token is
    lower case. Tokens containing any non-alphabetic character (digits,
    punctuation, symbols) are discarded, as are English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        list[str]: Surviving tokens in their original order.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop anything that is not purely alphabetic.
        if not token.isalpha():
            continue
        # Drop common English filler words.
        if token in english_stops:
            continue
        kept.append(token)
    return kept

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(resume_text, jd_text):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are vectorized together with a fresh ``TfidfVectorizer`` and
    the cosine similarity between the two resulting rows is returned.

    Args:
        resume_text: The resume as a single string.
        jd_text: The job description as a single string.

    Returns:
        float: The similarity score. ``0.0`` is returned when either text is
            empty or whitespace-only, since a blank document shares no terms
            with anything.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` if the non-blank texts
            still yield no vocabulary (presumably e.g. only tokens its default
            pattern ignores - verify against the scikit-learn version in use).
    """
    # Fitting on two blank documents raises an "empty vocabulary" ValueError;
    # short-circuit so text-less inputs (e.g. scanned PDFs) score 0 instead.
    if not resume_text.strip() or not jd_text.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([resume_text, jd_text])
    # Row 0 is the resume and row 1 the job description; [0][1] is their
    # pairwise score. float() keeps the return type uniform across both paths.
    return float(cosine_similarity(vectors)[0][1])

In [None]:

def _select_extractor(file_path):
    """Return the text-extraction function matching ``file_path``'s extension, or None."""
    lowered = file_path.lower()  # accept '.PDF' / '.Docx' as well
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf
    if lowered.endswith('.docx'):
        return extract_text_from_docx
    return None


def main(resume_file_path="Dataset/TESTINGFILE1.pdf",
         jd_file_path="Dataset/TESTINGFILE2.pdf"):
    """Print the similarity score between a resume and a job description.

    Each file is read with the extractor that matches its own extension
    (.pdf or .docx, case-insensitive), preprocessed, and compared with
    ``calculate_similarity``.

    Args:
        resume_file_path: Path of the resume file.
        jd_file_path: Path of the job description file.

    Returns:
        None. The score is printed; if either file has an unsupported
        extension, "Unsupported file format." is printed and nothing is read.
    """
    # Pick the extractor per file: the job description's format is independent
    # of the resume's, so it must not be inferred from the resume's extension.
    resume_extractor = _select_extractor(resume_file_path)
    jd_extractor = _select_extractor(jd_file_path)
    if resume_extractor is None or jd_extractor is None:
        print("Unsupported file format.")
        return

    resume_text = resume_extractor(resume_file_path)
    jd_text = jd_extractor(jd_file_path)

    preprocessed_resume_text = preprocess_text(resume_text)
    preprocessed_jd_text = preprocess_text(jd_text)

    # The vectorizer works on strings, so the token lists are re-joined.
    similarity_score = calculate_similarity(' '.join(preprocessed_resume_text), ' '.join(preprocessed_jd_text))
    print("The similarity score between the resume and job description is: ", similarity_score)

# Run the comparison with main()'s default arguments when this code executes
# as the top-level module.
if __name__ == "__main__":
    main()