In [19]:
import joblib
import pdfplumber
import re
import spacy
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [20]:
# Load the saved model and vectorizer
bc = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/bc_model.pkl', 'rb'))
lgr = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/lgr_model.pkl', 'rb'))
# xgb = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/xgb_model.pkl', 'rb'))

tfidf = pickle.load(open('/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/models/resume_tfidf_vectorizer.pkl', 'rb'))

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
def cleanText(text):

    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^a-zA-Z0-9\s+]', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d{10,}', '', text) 
    text = re.sub('\s+', ' ', text)
    return text.lower()

In [23]:
def lemmatize_tokens(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]

In [24]:
DOMAIN_SPECIFIC_STOPWORDS = ['resume', 'job', 'candidate', 'apply', 'experience', 'skills']
stop_words = set(stopwords.words('english')).union(DOMAIN_SPECIFIC_STOPWORDS)

def remove_stopwords_from_tokens(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [25]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ''
            for page in pdf.pages:
                full_text += page.extract_text() or ''  # Extract text from all pages
            return full_text.strip()  # Clean up whitespace
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ''

In [26]:
resume = '/Users/macbookair/Desktop/project/Resume_Analyser/resume-job-matching/data/Resume/resume_en.pdf'

In [27]:
extracted_text = extract_text_from_pdf(resume)
cleaned_resume = cleanText(extracted_text)

In [28]:
extracted_text

'Senghak ROU\nsenghak.rou@ensae.fr # (+33) 668 933 288 (cid:131) www.linkedin.com/in/rousenghak (cid:239)\nI am actively seeking a 2-3 months internship in data analyst, machine learning and statistics starting in June\n2025.\nFormation\nENSAE Paris 2023 – Present\nEngineering diploma in Data science, Statistics, and Economics.\n• Courses taken: Functional and convex analysis, Mathematical foundations of probability, Optimization, Statistics,\nMacroeconomics, Microeconomics, Algorithms and programming in Python, Univariate descriptive statistics with\nSAS, ...\nInstitute of Technology of Cambodia (ITC) 2020 – 2023\nEngineering program: 3 years of formation in mathematics, physics, and computer science\nExperience\nRSL Global Logistics Co., Ltd | Intern, IT Support Jul. 2024 – Aug. 2024\nCompleted a six-week exploratory internship at RSL Global Logistics, where I provided IT support and resolved\ntechnical issues across various departments.\nAXK | Tutor, Mathematics Jan. 2023 – May. 202

In [29]:
tokens = word_tokenize(cleaned_resume)
tokens = remove_stopwords_from_tokens(tokens)
lemmatized_tokens = lemmatize_tokens(tokens)

In [30]:
lemmatized_tokens

['senghak',
 'rou',
 'senghakrouensaefr',
 '+33',
 '668',
 '933',
 '288',
 'cid131',
 'wwwlinkedincominrousenghak',
 'cid239',
 'actively',
 'seek',
 '23',
 'month',
 'internship',
 'data',
 'analyst',
 'machine',
 'learn',
 'statistic',
 'start',
 'june',
 '2025',
 'formation',
 'ensae',
 'paris',
 '2023',
 'present',
 'engineering',
 'diploma',
 'datum',
 'science',
 'statistics',
 'economic',
 'course',
 'take',
 'functional',
 'convex',
 'analysis',
 'mathematical',
 'foundations',
 'probability',
 'optimization',
 'statistics',
 'macroeconomic',
 'microeconomics',
 'algorithm',
 'programming',
 'python',
 'univariate',
 'descriptive',
 'statistic',
 'sas',
 'institute',
 'technology',
 'cambodia',
 'itc',
 '2020',
 '2023',
 'engineering',
 'program',
 '3',
 'year',
 'formation',
 'mathematics',
 'physics',
 'computer',
 'science',
 'rsl',
 'global',
 'logistics',
 'co',
 'ltd',
 'intern',
 'support',
 'jul',
 '2024',
 'aug',
 '2024',
 'complete',
 'sixweek',
 'exploratory',
 'inte

In [31]:
final_resume_text = ' '.join(lemmatized_tokens)

In [32]:
resume_vectorized = tfidf.transform([final_resume_text])

In [33]:
bc_prediction = bc.predict(resume_vectorized)[0]
lgr_prediction = lgr.predict(resume_vectorized)[0]
# xgb_prediction = xgb.predict(resume_vectorized)[0]

In [34]:
category_mapping = {0: 'accountant', 1: 'advocate', 2: 'agriculture', 3: 'apparel', 4: 'arts', 5: 'automation testing', 6: 'automobile', 7: 'aviation', 8: 'banking', 9: 'blockchain', 10: 'bpo', 11: 'business analyst', 12: 'business-development', 13: 'chef', 14: 'civil engineer', 15: 'construction', 16: 'consultant', 17: 'data science', 18: 'database', 19: 'designer', 20: 'devops engineer', 21: 'digital-media', 22: 'dotnet developer', 23: 'electrical engineering', 24: 'engineering', 25: 'etl developer', 26: 'finance', 27: 'fitness', 28: 'hadoop', 29: 'health and fitness', 30: 'healthcare', 31: 'hr', 32: 'information-technology', 33: 'java developer', 34: 'mechanical engineer', 35: 'network security engineer', 36: 'operations manager', 37: 'pmo', 38: 'public-relations', 39: 'python developer', 40: 'sales', 41: 'sap developer', 42: 'teacher', 43: 'testing', 44: 'web designing'}

In [35]:
#Bagging Classifier

category_name = category_mapping.get(bc_prediction, "Unknown")

print("Predicted Category:", category_name)
print(bc_prediction)

Predicted Category: engineering
24


In [36]:
#Logistic Regression

category_name = category_mapping.get(lgr_prediction, "Unknown")

print("Predicted Category:", category_name)
print(lgr_prediction)

Predicted Category: engineering
24


In [37]:
#XGBoost Classifier

# category_name = category_mapping.get(xgb_prediction, "Unknown")

# print("Predicted Category:", category_name)
# print(xgb_prediction)