In [19]:
import joblib
import pdfplumber
import re
import spacy
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [20]:
# Load the saved model and vectorizer
bc = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/bc_model.pkl', 'rb'))
lgr = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/lgr_model.pkl', 'rb'))
xgb = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/xgb_model.pkl', 'rb'))

tfidf = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/resume_tfidf_vectorizer.pkl', 'rb'))

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
def cleanText(text):

    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^a-zA-Z0-9\s+]', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d{10,}', '', text) 
    text = re.sub('\s+', ' ', text)
    return text.lower()

In [23]:
def lemmatize_tokens(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]

In [24]:
DOMAIN_SPECIFIC_STOPWORDS = ['resume', 'job', 'candidate', 'apply', 'experience', 'skills']
stop_words = set(stopwords.words('english')).union(DOMAIN_SPECIFIC_STOPWORDS)

def remove_stopwords_from_tokens(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [25]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ''
            for page in pdf.pages:
                full_text += page.extract_text() or ''  # Extract text from all pages
            return full_text.strip()  # Clean up whitespace
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ''

In [26]:
resume = '/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/data/Resume/Chef_resume.pdf'

In [27]:
extracted_text = extract_text_from_pdf(resume)
cleaned_resume = cleanText(extracted_text)

In [28]:
extracted_text

'Charles Mathews\nChef\nAREAS OF EXPERTISE PERSONAL SUMMARY\nSeasonal dishes A calm, pleasant, helpful and hardworking individual who has a passion for great\nfood and who enjoys cooking mouth watering dishes. Charles gets a real buzz out of\nCooking methods working in a busy kitchen, and great pleasure out of seeing happy faces enjoying a\ngood meal that he has cooked. As a natural leader he is not only able to give orders\nImplementing recipes\nand delegate tasks, but is also able to reliably carry out orders as well. As a\nexperienced Chef he has a proven track record of making great food that will entice\nPreparing meals\ndiners and leave them wanting more. He loves the freedom of expression that\nFood prepara tion cooking gives him and is willing to work hard to build a career in the culinary\nworld. Right now he is looking for a suitable position with a company that wants to\nContract cate ring recruit talented and enthusiastic individuals.\nBaking skills\nCAREER HISTORY\nFood co

In [29]:
tokens = word_tokenize(cleaned_resume)
tokens = remove_stopwords_from_tokens(tokens)
lemmatized_tokens = lemmatize_tokens(tokens)

In [30]:
final_resume_text = ' '.join(lemmatized_tokens)

In [31]:
resume_vectorized = tfidf.transform([final_resume_text])

In [32]:
bc_prediction = bc.predict(resume_vectorized)[0]
lgr_prediction = lgr.predict(resume_vectorized)[0]
xgb_prediction = xgb.predict(resume_vectorized)[0]

In [33]:
category_mapping = {0: 'accountant', 1: 'advocate', 2: 'agriculture', 3: 'apparel', 4: 'arts', 5: 'automation testing', 6: 'automobile', 7: 'aviation', 8: 'banking', 9: 'blockchain', 10: 'bpo', 11: 'business analyst', 12: 'business-development', 13: 'chef', 14: 'civil engineer', 15: 'construction', 16: 'consultant', 17: 'data science', 18: 'database', 19: 'designer', 20: 'devops engineer', 21: 'digital-media', 22: 'dotnet developer', 23: 'electrical engineering', 24: 'engineering', 25: 'etl developer', 26: 'finance', 27: 'fitness', 28: 'hadoop', 29: 'health and fitness', 30: 'healthcare', 31: 'hr', 32: 'information-technology', 33: 'java developer', 34: 'mechanical engineer', 35: 'network security engineer', 36: 'operations manager', 37: 'pmo', 38: 'public-relations', 39: 'python developer', 40: 'sales', 41: 'sap developer', 42: 'teacher', 43: 'testing', 44: 'web designing'}

In [34]:
#Bagging Classifier

category_name = category_mapping.get(bc_prediction, "Unknown")

print("Predicted Category:", category_name)
print(bc_prediction)

Predicted Category: chef
13


In [35]:
#Logistic Regression

category_name = category_mapping.get(lgr_prediction, "Unknown")

print("Predicted Category:", category_name)
print(lgr_prediction)

Predicted Category: chef
13


In [36]:
#XGBoost Classifier

category_name = category_mapping.get(xgb_prediction, "Unknown")

print("Predicted Category:", category_name)
print(xgb_prediction)

Predicted Category: chef
13
