In [9]:
!pip install PyMuPDF spacy transformers torch
!python -m spacy download en_core_web_sm
!pip install pdfplumber
!pip install pymupdf
!pip install requests beautifulsoup4

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tqdm.auto import tqdm
import warnings
import joblib
import os
import fitz
import requests
from bs4 import BeautifulSoup
import requests

In [11]:

warnings.filterwarnings("ignore")
tqdm.pandas()

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Load dataset
data = pd.read_csv("/content/UpdatedResumeDataSet.csv")
data['processed_resume'] = data['Resume'].progress_apply(preprocess_text)

# Encode categories
label_encoder = LabelEncoder()
data['encoded_category'] = label_encoder.fit_transform(data['Category'])

# Split data
X = data['processed_resume']
y = data['encoded_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF parameters
tfidf_params = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidfvectorizer__max_df': [0.8, 0.9],
}

# GridSearch
def run_grid_search(name, pipeline, param_grid):
    print(f"\n Running GridSearchCV for {name}...")
    grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"\n {name} Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_, digits=4))
    return grid.best_estimator_, f1

#  Naive Bayes
nb_pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
nb_param_grid = {**tfidf_params, 'multinomialnb__alpha': [0.5, 1.0]}
nb_model, f1_nb = run_grid_search("Naive Bayes", nb_pipeline, nb_param_grid)

#  SVM
svm_pipeline = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'))
svm_param_grid = {**tfidf_params, 'svc__C': [0.5, 1.0, 2.0]}
svm_model, f1_svm = run_grid_search("SVM", svm_pipeline, svm_param_grid)

#  Logistic Regression
logreg_pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
logreg_param_grid = {**tfidf_params, 'logisticregression__C': [0.5, 1.0, 2.0]}
logreg_model, f1_logreg = run_grid_search("Logistic Regression", logreg_pipeline, logreg_param_grid)

#  Select Best Model
results = {
    'Naive Bayes': (nb_model, f1_nb),
    'SVM': (svm_model, f1_svm),
    'Logistic Regression': (logreg_model, f1_logreg)
}
best_model_name = max(results, key=lambda k: results[k][1])
best_model, best_f1 = results[best_model_name]

print(f"\n Best Model Based on Macro F1-Score: {best_model_name}")
print(f" F1 Score: {best_f1:.4f}")
print(f" Best Parameters: {best_model.get_params()}")

# Save the best model and label encoder
output_dir = "/content/saved_model"
os.makedirs(output_dir, exist_ok=True)

joblib.dump(best_model, os.path.join(output_dir, "best_resume_model.pkl"))
joblib.dump(label_encoder, os.path.join(output_dir, "label_encoder.pkl"))

print(f"\n Model and LabelEncoder saved to: {output_dir}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


  0%|          | 0/962 [00:00<?, ?it/s]


 Running GridSearchCV for Naive Bayes...
Fitting 3 folds for each of 8 candidates, totalling 24 fits

 Naive Bayes Classification Report:
                            precision    recall  f1-score   support

                 Advocate     1.0000    1.0000    1.0000         3
                     Arts     1.0000    1.0000    1.0000         6
       Automation Testing     1.0000    1.0000    1.0000         5
               Blockchain     1.0000    1.0000    1.0000         7
         Business Analyst     1.0000    1.0000    1.0000         4
           Civil Engineer     1.0000    1.0000    1.0000         9
             Data Science     1.0000    1.0000    1.0000         5
                 Database     1.0000    1.0000    1.0000         8
          DevOps Engineer     1.0000    0.9286    0.9630        14
         DotNet Developer     1.0000    1.0000    1.0000         5
            ETL Developer     1.0000    1.0000    1.0000         7
   Electrical Engineering     1.0000    1.0000    1.000

In [12]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


In [13]:

# Load the best model and label encoder
best_model = joblib.load("/content/saved_model/best_resume_model.pkl")
label_encoder = joblib.load("/content/saved_model/label_encoder.pkl")

pdf_text = extract_text_from_pdf('/content/curriculum_vitae.pdf')
processed_text = preprocess_text(pdf_text)
# Predict category
predicted_label = best_model.predict([processed_text])[0]
predicted_class = label_encoder.inverse_transform([predicted_label])[0]
print("Predicted Resume Class:", predicted_class)

Predicted Resume Class: Data Science


In [14]:

# Mapping from your predicted class to RemoteOK category
predicted_to_remoteok = {
    "Advocate": "legal",
    "Arts": "design",
    "Automation Testing": "qa",
    "Blockchain": "blockchain",
    "Business Analyst": "analyst",
    "Civil Engineer": "engineering",
    "Data Science": "data-science",
    "Database": "sql",
    "DevOps Engineer": "devops",
    "DotNet Developer": "csharp",
    "ETL Developer": "data-science",
    "Electrical Engineering": "engineering",
    "HR": "hr",
    "Hadoop": "data-science",
    "Health and fitness": "medical",
    "Java Developer": "java",
    "Mechanical Engineer": "engineering",
    "Network Security Engineer": "security",
    "Operations Manager": "ops",
    "PMO": "project-management",
    "Python Developer": "python",
    "SAP Developer": "software-dev",
    "Sales": "sales",
    "Testing": "qa",
    "Web Designing": "design"
}

def scrape_remoteok_jobs(category, keyword=None, num_results=10):
    base_url = f"https://remoteok.com/remote-{category.lower().replace(' ', '-')}-jobs"
    headers = {'User-Agent': 'Mozilla/5.0'}

    response = requests.get(base_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch jobs for {category}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    job_posts = soup.find_all('tr', class_='job')[:num_results * 2]  # fetch more to filter

    jobs = []
    for post in job_posts:
        title_tag = post.find('h2')
        company_tag = post.find('h3')
        link_tag = post.get('data-href')

        if title_tag and company_tag and link_tag:
            title = title_tag.text.strip()
            job = {
                'title': title,
                'company': company_tag.text.strip(),
                'link': 'https://remoteok.com' + link_tag
            }
            jobs.append(job)

    # Prioritize jobs that contain the keyword in the title
    if keyword:
        keyword_lower = keyword.lower()
        matched = [job for job in jobs if keyword_lower in job['title'].lower()]
        unmatched = [job for job in jobs if keyword_lower not in job['title'].lower()]
        jobs = matched + unmatched

    return jobs[:num_results]


In [15]:
remoteok_category = predicted_to_remoteok.get(predicted_class)
jobs = scrape_remoteok_jobs(remoteok_category, keyword=predicted_class, num_results=10)

# Print results
print(f"\n Top {len(jobs)} jobs for '{predicted_class}' from Remoteok:\n")
for i, job in enumerate(jobs, 1):
    print(f"{i}. {job['title']} at {job['company']}\n   {job['link']}\n")


 Top 10 jobs for 'Data Science' from Remoteok:

1. Principal Software Engineer Applied Science Data Science at Cardlytics
   https://remoteok.com/remote-jobs/109197-remote-principal-software-engineer-applied-science-data-science-cardlytics

2. Senior Data Engineer at Alqen
   https://remoteok.com/remote-jobs/remote-senior-data-engineer-alqen-664520

3. Principal Software Engineer Data Scientist at Cardlytics
   https://remoteok.com/remote-jobs/109456-remote-principal-software-engineer-data-scientist-cardlytics

4. Marketing Data Scientist at Recast
   https://remoteok.com/remote-jobs/107901-remote-marketing-data-scientist-recast

5. Machine learning engineer for large scale ML project at OneSecondDelivery
   https://remoteok.com/remote-jobs/106921-remote-machine-learning-engineer-for-large-scale-ml-project-oneseconddelivery

6. Data Scientist at Ampcontrol
   https://remoteok.com/remote-jobs/103561-remote-data-scientist-ampcontrol

7. Staff Data Scientist at Shopify
   https://remoteo

In [16]:
# Define category mapping to Remotive categories
category_mapping = {
    "Advocate": "legal",
    "Arts": "others",
    "Automation Testing": "qa",
    "Blockchain": "software-dev",
    "Business Analyst": "product",
    "Civil Engineer": "others",
    "Data Science": "data",
    "Database": "software-dev",
    "DevOps Engineer": "devops-sysadmin",
    "DotNet Developer": "software-dev",
    "ETL Developer": "software-dev",
    "Electrical Engineering": "others",
    "HR": "human-resources",
    "Hadoop": "software-dev",
    "Health and fitness": "others",
    "Java Developer": "software-dev",
    "Mechanical Engineer": "others",
    "Network Security Engineer": "devops-sysadmin",
    "Operations Manager": "sales",
    "PMO": "project-management",
    "Python Developer": "software-dev",
    "SAP Developer": "software-dev",
    "Sales": "sales",
    "Testing": "qa",
    "Web Designing": "design"
}

#  Valid Remotive categories for reference (should match API expectations)
remotive_valid_categories = [
    "software-dev", "customer-support", "design", "marketing", "sales",
    "product", "devops-sysadmin", "finance-legal", "hr", "qa", "writing",
    "teaching", "business", "data", "project-management", "others"
]

# Function to fetch jobs from Remotive API for a valid category
def fetch_jobs_remotive(remotive_category, num_results=10):
    category = remotive_category.lower().replace(" ", "-")

    if category not in remotive_valid_categories:
        print(f"'{category}' is not a valid Remotive category.")
        return []

    url = f"https://remotive.com/api/remote-jobs?category={category}"

    try:
        response = requests.get(url)
        response.raise_for_status()

        jobs = response.json().get("jobs", [])
        return [
            {
                "title": job["title"],
                "company": job["company_name"],
                "url": job["url"]
            }
            for job in jobs[:num_results]
        ]
    except Exception as e:
        print(f" Error fetching jobs: {e}")
        return []

# Function to fetch jobs based on predicted class
def fetch_jobs_by_keywords(predicted_class, num_results=10):
    broad_category = category_mapping.get(predicted_class, "others")

    print(f"\n Predicted Class: {predicted_class}")
    print(f" Mapped to Remotive Category: {broad_category}")

    # Step 1: Try broad category first (Remotive valid category)
    jobs_in_broad_category = fetch_jobs_remotive(broad_category, num_results)
    if jobs_in_broad_category:
        return jobs_in_broad_category

    # Step 2: Try keyword-based fallback
    keywords = predicted_class.lower().split()
    jobs_with_keywords = []

    for keyword in keywords:
        jobs_with_keywords.extend(fetch_jobs_remotive(keyword, num_results))

    # Remove duplicates by URL
    unique_jobs = {job['url']: job for job in jobs_with_keywords}.values()

    if unique_jobs:
        return list(unique_jobs)

    # Step 3: Return empty if nothing found
    print("No jobs found for either mapped category or keywords.")
    return []

In [17]:
jobs = fetch_jobs_by_keywords(predicted_class)

print(f"\n Top {len(jobs)} jobs for '{predicted_class}' from Remotive:\n")
for job in jobs:
    print(f"{job['title']} at {job['company']}\n→ {job['url']}\n")


 Predicted Class: Data Science
 Mapped to Remotive Category: data

 Top 10 jobs for 'Data Science' from Remotive:

Data Analyst at Autone
→ https://remotive.com/remote-jobs/data/data-analyst-2007929

Data Analyst, Autonomy at Serve Robotics
→ https://remotive.com/remote-jobs/data/data-analyst-autonomy-2008113

Business Analyst at Natera
→ https://remotive.com/remote-jobs/data/business-analyst-2008107

Senior Manager, Statistical Programming at AbbVie
→ https://remotive.com/remote-jobs/data/senior-manager-statistical-programming-2008250

Business Analyst at InfyStrat
→ https://remotive.com/remote-jobs/data/business-analyst-2007244

Principal Data and AI Strategy at PhData
→ https://remotive.com/remote-jobs/data/principal-data-and-ai-strategy-2006972

Manager, Global Item Master Data at Paula's Choice Skincare
→ https://remotive.com/remote-jobs/data/manager-global-item-master-data-2006011

Part-Time Analyst - Social Listening & Insights at Mutiny
→ https://remotive.com/remote-jobs/data/