# CATEGORY PREDICTION FROM RESUME

In [3]:
import pandas as pd
import string
import re
from collections import Counter
import nltk
import joblib
import fitz
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    """Normalize raw text into a space-separated string of processed tokens.

    Steps, in order: lower-casing, URL removal, punctuation removal,
    English stop-word removal, Porter stemming and POS-aware WordNet
    lemmatization.

    Args:
        text: Raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Lower casing
    text = text.lower()
    # URL removal. This has to happen before punctuation is stripped:
    # once "://" and "." are gone the pattern can no longer match anything.
    # NOTE(review): any vectorizer fitted on text produced by the old
    # ordering saw URL remnants as tokens -- confirm whether it needs a refit.
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    # Punctuation removal (single pass over the string)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Stopwords removal
    STOPWORDS = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word not in STOPWORDS]
    # Stemming
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    # Lemmatization: map the first letter of each POS tag to a WordNet POS,
    # falling back to NOUN for tags that have no mapping.
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
    pos_tagged_text = nltk.pos_tag(stems)
    words = " ".join(
        lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
        for word, pos in pos_tagged_text
    )

    return words

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages, in page order, as one string.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

def preprocess_pdf(pdf_path):
    """Extract the text of the PDF at *pdf_path* and return it preprocessed."""
    return preprocess_text(extract_text_from_pdf(pdf_path))

# Example usage: run the prediction pipeline on a single resume PDF.
pdf_path = "My Resume.pdf"
cleaned_text = preprocess_pdf(pdf_path)
# Load the persisted model, TF-IDF vectorizer and label encoder.
# NOTE(review): joblib.load unpickles these files -- only load artifacts
# that come from a trusted source.
joblib_file = "job_category_model.pkl"
model = joblib.load(joblib_file)
joblib_file_vectorizer = "job_tfidf_vectorizer.pkl"
tfidf_vectorizer = joblib.load(joblib_file_vectorizer)
joblib_file_le = "job_label_encoder.pkl"
le = joblib.load(joblib_file_le)
# Vectorize the cleaned text (passed as a one-element list), predict, then
# map the encoded prediction back to its original label.
pdf_vec = tfidf_vectorizer.transform([cleaned_text])
pdf_pred = model.predict(pdf_vec)
pdf_pred = le.inverse_transform(pdf_pred)
# Table pairing the resume ID with the predicted label.
resultdf = pd.DataFrame({'Resume ID': "Resume 1", 'Predicted Label': pdf_pred})
# Bare expression: in a notebook cell this displays the table.
resultdf

Unnamed: 0,Resume ID,Predicted Label
0,Resume 1,project manager


# JOB SEARCH

## LinkedIn job search

In [3]:
import requests
from bs4 import BeautifulSoup
import json
from parsel import Selector

# Browser-like HTTP headers passed to the LinkedIn search request below.
head= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
}

def parse_job_page(job_url, headers=None, timeout=10):
    """Fetch a job page and return its JSON-LD data plus a text description.

    Args:
        job_url: URL of the job posting page.
        headers: Optional request headers. Defaults to the module-level
            ``head`` dict that the search request also uses.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict built from the page's ``application/ld+json`` script (empty
        when the script is missing or is not a JSON object), with the
        ``description`` key removed and a ``jobDescription`` list of the
        non-empty bullet texts added. ``None`` when the request fails or the
        response status is not 200.
    """
    try:
        response = requests.get(
            job_url,
            headers=head if headers is None else headers,
            timeout=timeout,
        )
    except requests.RequestException:
        # Same "could not retrieve" signal the caller already handles.
        return None
    if response.status_code != 200:
        return None

    selector = Selector(response.text)
    raw_json = selector.xpath("//script[@type='application/ld+json']/text()").get()
    try:
        # .get() returns None when the page has no ld+json script.
        script_data = json.loads(raw_json) if raw_json else {}
    except json.JSONDecodeError:
        script_data = {}
    if not isinstance(script_data, dict):
        # Keys are assigned below, so anything but a JSON object is unusable.
        script_data = {}

    description = []
    for element in selector.xpath("//div[contains(@class, 'show-more')]/ul/li/text()").getall():
        text = element.replace("\n", "").strip()
        if text:
            description.append(text)
    script_data["jobDescription"] = description
    script_data.pop("description", None)  # remove the key with the encoded HTML
    return script_data



# Search LinkedIn for the predicted job category and collect every result
# card as a formatted, human-readable string in `job_listings_strings`.
from urllib.parse import quote

job_title = "".join(pdf_pred)
location = "Abuja"
# quote() percent-encodes spaces (as %20) and any other reserved characters.
url = (
    "https://www.linkedin.com/jobs/search/"
    f"?keywords={quote(job_title)}&location={quote(location)}&pageNum=0"
)
response = requests.get(url, headers=head, timeout=30)
job_listings_strings = []  # List to store job listings in string form
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html5lib')
    job_listings = soup.find_all('div', {'class': 'job-search-card'})
    for job in job_listings:
        title_tag = job.find('h3', {'class': 'base-search-card__title'})
        company_tag = job.find('a', {'class': 'hidden-nested-link'})
        location_tag = job.find('span', {'class': 'job-search-card__location'})
        anchor_tag = job.find('a', class_='base-card__full-link')
        href_link = anchor_tag.get('href') if anchor_tag else None
        if not href_link:
            # Without a link there is no detail page to fetch; skip the card.
            continue

        # A missing element becomes "N/A" instead of raising AttributeError.
        title = title_tag.text.strip() if title_tag else "N/A"
        company = company_tag.text.strip() if company_tag else "N/A"
        # Separate name so the search `location` above is not overwritten.
        job_location = location_tag.text.strip() if location_tag else "N/A"

        job_data = parse_job_page(href_link)
        if job_data:
            description_text = ' '.join(job_data.get("jobDescription", []))
        else:
            description_text = "Could not retrieve"
        job_listing_string = f"Title: {title}\nCompany: {company}\nLocation: {job_location}\nJob Link: {href_link}\nJob Description: {description_text}\n"
        job_listings_strings.append(job_listing_string)
else:
    print("Failed to fetch job listings.")

print(job_listings_strings)


['Title: Management Accountant (Abuja) at Saroafrica International Limited\nCompany: Saro Lifecare\nLocation: Abuja, Federal Capital Territory, Nigeria\nJob Link: https://ng.linkedin.com/jobs/view/management-accountant-abuja-at-saroafrica-international-limited-at-saro-lifecare-3915240657?position=1&pageNum=0&refId=e0XuilzB%2FSRaY2QLzvpNeQ%3D%3D&trackingId=TLCosVORYwtbGv76cohHeA%3D%3D&trk=public_jobs_jserp-result_search-card\nJob Description: Company: Location: Nigeria State: Job type: Full-Time Job category: Ensures that all foreign currency payments are made in accordance with contracted terms and budgeted exchange rates. Develops tools and systems to provide critical financial and operational information to the Management Team and provides actionable recommendations on both strategy and operations. He/she will ensure that financial discipline and prudence is applied for all transactions and will provide primary controls to ensure that the assets of the project are well secured. Put t

## Indeed job search

In [2]:
import requests
from bs4 import BeautifulSoup

# Browser-like HTTP headers passed to the Indeed requests below.
head= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
}

# Search Indeed (Nigeria) and collect every result card as a formatted,
# human-readable string in `job_listings_strings`.
from urllib.parse import quote_plus

job_title = "Accountant"
location = "Abuja"
# quote_plus() encodes spaces as "+" and escapes other reserved characters.
url = f"https://ng.indeed.com/jobs?q={quote_plus(job_title)}&l={quote_plus(location)}"
response = requests.get(url, headers=head, timeout=30)

job_listings_strings = []  # List to store job listings in string form

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html5lib')
    joblistings = soup.find_all("div", {"class": "cardOutline"})
    for job in joblistings:
        # The title anchor carries both the title text and the job link.
        title_tag = job.find("a", {"class": "jcs-JobTitle css-jspxzf eu4oa1w0"})
        link = title_tag.get('href') if title_tag else None
        if not link:
            # Without a link there is no detail page to fetch; skip the card.
            continue

        company_location = job.find("div", {"class": "company_location"})
        company_tag = (
            company_location.find("span", {"class": "css-63koeb eu4oa1w0"})
            if company_location else None
        )
        location_tag = (
            company_location.find("div", {"class": "css-1p0sjhy eu4oa1w0"})
            if company_location else None
        )

        # A missing element becomes "N/A" instead of raising AttributeError.
        title = title_tag.text.strip()
        company = company_tag.text.strip() if company_tag else "N/A"
        # Separate name so the search `location` above is not overwritten.
        job_location = location_tag.text.strip() if location_tag else "N/A"

        href_link = 'https://ng.indeed.com' + link
        job_response = requests.get(href_link, headers=head, timeout=30)
        job_soup = BeautifulSoup(job_response.text, 'html5lib')
        job_description_tag = job_soup.find('div', {'class': 'jobsearch-JobComponent-description css-16y4thd eu4oa1w0'})
        job_desc = job_description_tag.text.strip() if job_description_tag else "N/A"
        job_listing_string = f"Title: {title}\nCompany: {company}\nLocation: {job_location}\nJob Link: {href_link}\nJob Description: {job_desc}\n"
        job_listings_strings.append(job_listing_string)
else:
    print(response.status_code)
    print("Failed to fetch job listings.")

print(job_listings_strings)

ConnectionError: HTTPSConnectionPool(host='ng.indeed.com', port=443): Max retries exceeded with url: /jobs?q=Accountant&l=Abuja (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000022DE1118800>: Failed to resolve 'ng.indeed.com' ([Errno 11001] getaddrinfo failed)"))

# SIMILARITY CALCULATION 

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# Rank the scraped job listings by Doc2Vec cosine similarity to the resume.
resume_text = extract_text_from_pdf(pdf_path)
preprocessed_resume_text = preprocess_text(resume_text)

preprocessed_job_listings = []
for job_listing in job_listings_strings:
    # Take everything after the FIRST marker: the description itself may
    # contain the words "Job Description: ", and splitting on the last
    # occurrence would throw away the text in front of it.
    job_description = job_listing.partition("Job Description: ")[2]
    preprocessed_job_description = preprocess_text(job_description)
    preprocessed_job_listings.append(preprocessed_job_description)

# Document 0 is the resume; documents 1..n are the job descriptions.
merged_list = [preprocessed_resume_text] + preprocessed_job_listings

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(merged_list)]

# Train Doc2Vec model
# NOTE: this rebinds `model`, replacing the classifier loaded with joblib in
# the category-prediction cell; re-run that cell before predicting again.
model = Doc2Vec(vector_size=30, min_count=2, epochs=80)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")

# Infer vectors for resume and job listings
resume_vec = model.infer_vector(word_tokenize(preprocessed_resume_text.lower()))
job_vecs = [model.infer_vector(word_tokenize(job.lower())) for job in preprocessed_job_listings]

# Calculate cosine similarity between resume and job listings
similarities = [cosine_similarity([resume_vec], [job_vec])[0][0] for job_vec in job_vecs]

# Rank job listings based on similarity (highest first), keeping each
# listing's original index so its full text can be looked up when printing.
ranked_job_listings = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

# Print the ranked job listings
for idx, similarity in ranked_job_listings:
    print(f"Job Listing {idx+1} - Similarity: {similarity:.4f}\n{job_listings_strings[idx]}\n")
