In [104]:
import joblib
import pdfplumber
import re
import spacy
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [105]:
# Load the saved model and vectorizer
bc = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/bc_model.pkl', 'rb'))
tfidf = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/tfidf_vectorizer.pkl', 'rb'))

In [106]:
nlp = spacy.load('en_core_web_sm')

In [107]:
def cleanText(text):

    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^a-zA-Z0-9\s+]', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d{10,}', '', text) 
    text = re.sub('\s+', ' ', text)
    return text.lower()

In [108]:
def lemmatize_tokens(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]

In [109]:
DOMAIN_SPECIFIC_STOPWORDS = ['resume', 'job', 'candidate', 'apply', 'experience', 'skills']
stop_words = set(stopwords.words('english')).union(DOMAIN_SPECIFIC_STOPWORDS)

def remove_stopwords_from_tokens(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [110]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ''
            for page in pdf.pages:
                full_text += page.extract_text() or ''  # Extract text from all pages
            return full_text.strip()  # Clean up whitespace
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ''

In [None]:
resume = '/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/data/Resume/resume_en.pdf'

In [112]:
extracted_text = extract_text_from_pdf(resume)
cleaned_resume = cleanText(extracted_text)

In [113]:
extracted_text

"Juan Jose Carin\nMountain View, CA 94041\n650-336-4590 | juanjose.carin@gmail.com\nData Scientist\nlinkedin.com/in/juanjosecarin | juanjocarin.github.io\nProfessional Profile\nPassionate about data analysis and experiments, mainly focused on user behavior, experience, and engagement, with a solid\nbackground in data science and statistics, and extensive experience using data insights to drive business growth.\nEducation\n2016 University of California, Berkeley Master of Information and Data Science GPA: 3.93\nRelevant courses: • Field Experiments • Data Visualization and\n• Machine Learning • Applied Regression and Time Series Communication\n• Machine Learning at Scale Analysis • Research Design and Applications for\n• Storing and Retrieving Data • Exploring and Analyzing Data Data Analysis\n2014 Universidad Politécnica de Madrid M.S. in Statistical and Computational Information Processing GPA: 3.69\nRelevant courses: • Neural Networks and Statistical • Monte Carlo Techniques\n• Data 

In [114]:
tokens = word_tokenize(cleaned_resume)
tokens = remove_stopwords_from_tokens(tokens)
lemmatized_tokens = lemmatize_tokens(tokens)

In [115]:
final_resume_text = ' '.join(lemmatized_tokens)

In [116]:
resume_vectorized = tfidf.transform([final_resume_text])

In [117]:
prediction = bc.predict(resume_vectorized)[0]

In [118]:
category_mapping = {0: 'accountant', 1: 'advocate', 2: 'agriculture', 3: 'apparel', 4: 'arts', 5: 'automation testing', 6: 'automobile', 7: 'aviation', 8: 'banking', 9: 'blockchain', 10: 'bpo', 11: 'business analyst', 12: 'business-development', 13: 'chef', 14: 'civil engineer', 15: 'construction', 16: 'consultant', 17: 'data science', 18: 'database', 19: 'designer', 20: 'devops engineer', 21: 'digital-media', 22: 'dotnet developer', 23: 'electrical engineering', 24: 'engineering', 25: 'etl developer', 26: 'finance', 27: 'fitness', 28: 'hadoop', 29: 'health and fitness', 30: 'healthcare', 31: 'hr', 32: 'information-technology', 33: 'java developer', 34: 'mechanical engineer', 35: 'network security engineer', 36: 'operations manager', 37: 'pmo', 38: 'public-relations', 39: 'python developer', 40: 'sales', 41: 'sap developer', 42: 'teacher', 43: 'testing', 44: 'web designing'}

In [119]:
category_name = category_mapping.get(prediction, "Unknown")

print("Predicted Category:", category_name)
print(prediction)

Predicted Category: sales
40


In [120]:
lgr = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/lgr_model.pkl', 'rb'))

In [121]:
prediction_lgr = lgr.predict(resume_vectorized)[0]

In [122]:
category_name = category_mapping.get(prediction_lgr, "Unknown")

print("Predicted Category:", category_name)
print(prediction_lgr)

Predicted Category: sales
40
