In [None]:
# Mount Google Drive (Colab only) so the artifact paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup Environment

In [None]:
!pip install pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16674 sha256=f0316f9ac35f466439ac733382f001a1fcdf22b58973825b2b5b79dea70c453a
  Stored in directory: /root/.cache/pip/wheels/46/f9/0d/40c9cd74a7cb8dc8fe57e8d6c3c19e2c730449c0d3f2bf66b5
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.5


In [None]:
# Initialise pandarallel with progress bars enabled.
# NOTE(review): no parallel_apply call is visible in this notebook itself — presumably the
# helpers imported from cv_lstm_functions depend on it; confirm, otherwise this cell is unused.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import sys
import os

In [None]:
# Locations of the trained classifier, its tokenizer and its label encoder on the mounted Drive.
MODEL_PATH = '/content/drive/MyDrive/cv project/resume_classifier_model (3).keras'
TOKENIZER_PATH = '/content/drive/MyDrive/cv project/tokenizer.pickle'
ENCODER_PATH = '/content/drive/MyDrive/cv project/label_encoder (2).pickle'
# Path to the helper module; its directory is what gets added to sys.path before importing it.
cv_lstm_functions='/content/drive/MyDrive/cv project/cv_lstm_functions.py'

In [None]:
# Directory that contains the helper module (cv_lstm_functions holds the file path as a string)
script_dir = os.path.dirname(cv_lstm_functions)
# Put that directory first on sys.path, once, so the module below can be imported
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

# NOTE(review): the wildcard import hides where names come from. The cells below appear to
# rely on load_spacy_model, clean_resume_text and process_with_spacy from this module —
# confirm and consider importing those names explicitly.
from cv_lstm_functions import *

In [None]:
import kagglehub
# Fetch the LinkedIn jobs & skills dataset; the returned value is used below as the
# directory prefix when building the CSV paths.
asaniczka_1_3m_linkedin_jobs_and_skills_2024_path = kagglehub.dataset_download('asaniczka/1-3m-linkedin-jobs-and-skills-2024')

print('Data source import complete.')

Downloading from https://www.kaggle.com/api/v1/datasets/download/asaniczka/1-3m-linkedin-jobs-and-skills-2024?dataset_version_number=2...


100%|██████████| 1.88G/1.88G [00:22<00:00, 89.5MB/s]

Extracting files...





Data source import complete.


In [None]:
# List the downloaded dataset files. Walk the directory kagglehub returned instead of the
# hard-coded '/kaggle/input', which is not where this notebook's CSVs are read from.
for dirname, _, filenames in os.walk(asaniczka_1_3m_linkedin_jobs_and_skills_2024_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# job_skills.csv from the downloaded dataset.
# NOTE(review): df1 is not referenced by any later cell visible here, so the 'job_skills'
# branch in resume_job_similarity never triggers — confirm whether it was meant to be
# joined onto the postings (presumably on job_link).
df1 = pd.read_csv(asaniczka_1_3m_linkedin_jobs_and_skills_2024_path + '/job_skills.csv')
# Job postings, restricted to the four columns the recommendation cells use.
df2 = pd.read_csv(asaniczka_1_3m_linkedin_jobs_and_skills_2024_path + '/linkedin_job_postings.csv',usecols=['job_link', 'job_title', 'company','job_location'])

In [None]:
def load_models():
    """Load the Keras classifier together with its pickled tokenizer and label encoder.

    Reads from MODEL_PATH, TOKENIZER_PATH and ENCODER_PATH.

    Returns:
        tuple: ``(model, tokenizer, encoder)`` when everything loads, otherwise
        ``(None, None, None)`` after printing the error that was raised.
    """
    try:
        classifier = load_model(MODEL_PATH)
        # pickle can execute arbitrary code while loading — keep these paths pointed at trusted files.
        with open(TOKENIZER_PATH, 'rb') as tokenizer_file:
            fitted_tokenizer = pickle.load(tokenizer_file)
        with open(ENCODER_PATH, 'rb') as encoder_file:
            label_encoder = pickle.load(encoder_file)

        print("Models loaded successfully.")
    except Exception as e:
        print(f"An error occurred loading models: {e}")
        return None, None, None

    return classifier, fitted_tokenizer, label_encoder

In [None]:
# Load the artifacts once at notebook level; all three are None if loading failed.
model, tokenizer, encoder = load_models()

Models loaded successfully.


# Resume preprocessing

In [None]:
def preprocess_resume(resume_text, verbose=True):
    """Clean a raw resume and return its spaCy-processed text.

    The text is cleaned with ``clean_resume_text``, wrapped in a one-row DataFrame
    (column ``Resume``) and handed to ``process_with_spacy``; the ``Resume_POS_text``
    value of that single row is returned.

    Args:
        resume_text: Raw resume text.
        verbose: Accepted for compatibility; not used by this function.

    Returns:
        The ``Resume_POS_text`` entry produced for the resume.
    """
    spacy_pipeline = load_spacy_model()
    single_row = pd.DataFrame({'Resume': [clean_resume_text(resume_text)]})
    # Lemmatization and POS handling are delegated to process_with_spacy.
    return process_with_spacy(single_row, spacy_pipeline)['Resume_POS_text'].iloc[0]

# Predict category

In [None]:
def predict_resume_category(resume_text,model=model,tokenizer=tokenizer,encoder=encoder,max_length=500):
    """Classify a resume and report the best category plus the five most likely ones.

    Note that the ``model``/``tokenizer``/``encoder`` defaults are captured when this
    cell is executed; re-run the cell after reloading the artifacts.

    Args:
        resume_text: Raw resume text.
        model: Classifier exposing ``predict``.
        tokenizer: Tokenizer exposing ``texts_to_sequences``.
        encoder: Label encoder exposing ``inverse_transform``.
        max_length: Sequence length used for post-padding / post-truncation.

    Returns:
        dict with ``predicted_category``, ``confidence`` (float) and
        ``top_5_predictions`` (list of dicts with ``category``, ``probability``
        and ``percentage``), or ``None`` when any artifact is missing.
    """
    # load_models() returns (None, None, None) on failure. Callers already test the
    # result against None, so report that instead of failing on a None attribute access.
    if model is None or tokenizer is None or encoder is None:
        return None

    # Preprocess the resume and turn it into a fixed-length integer sequence
    processed_text = preprocess_resume(resume_text)
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # One input row, so take the first (only) row of probabilities
    prediction_probs = model.predict(padded_sequence, verbose=0)[0]

    # Most likely class and its probability
    predicted_index = np.argmax(prediction_probs)
    predicted_category = encoder.inverse_transform([predicted_index])[0]
    confidence = prediction_probs[predicted_index]

    # Indices of the five largest probabilities, highest first
    top_k = 5
    top_indices = np.argsort(prediction_probs)[-top_k:][::-1]
    top_categories = encoder.inverse_transform(top_indices)
    top_probabilities = prediction_probs[top_indices]

    return {
        'predicted_category': predicted_category,
        'confidence': float(confidence),
        'top_5_predictions': [
            {
                'category': cat,
                'probability': float(prob),
                'percentage': f"{prob * 100:.2f}%"
            }
            for cat, prob in zip(top_categories, top_probabilities)
        ]
    }

# Test category prediction

In [None]:
# Sample resume (HR generalist profile) used to exercise the classifier and the recommender below.
sample_resume = """SARAH JENKINS
sarah.jenkins@email.com | (555) 987-6543 | linkedin.com/in/sarahjenkins

SUMMARY
Human Resources Generalist with 6 years of experience in employee relations,
talent acquisition, and benefits administration. SHRM-CP certified.
Passionate about creating positive and productive workplace cultures.

SKILLS
- Full-Cycle Recruiting
- Onboarding & Training
- Employee Relations
- HRIS (Workday, BambooHR)
- Benefits Administration
- HR Policy & Compliance
- Performance Management

EXPERIENCE
HR Generalist | Innovate Solutions | 2019-Present
- Managed all aspects of the employee lifecycle for a 300-person tech company.
- Led full-cycle recruiting, hiring over 50 new employees in 2023.
- Administered employee benefits, payroll, and leave of absence programs.
- Resolved complex employee relations issues and conducted investigations.

HR Coordinator | DataCorp | 2017-2019
- Assisted with new hire onboarding and orientation.
- Maintained employee records and ensured HRIS data integrity.
- Supported the HR team with compliance reporting and audits."""

In [None]:
# Classify the sample resume; max_length is the padded sequence length handed to pad_sequences.
result = predict_resume_category(
    resume_text=sample_resume,
    max_length=500)

# Only report a prediction when one was produced.
if result is not None:
    print(f"Predicted: {result['predicted_category']}")
    print(f"Confidence: {result['confidence'] * 100:.2f}%")
else:
    print("Prediction failed due to an error in loading model artifacts.")

spaCy model 'en_core_web_sm' loaded successfully
Processing text with spaCy (lemmatization & POS tagging)...


Processing Resumes:   0%|          | 0/1 [00:00<?, ?it/s]

spaCy processing with POS tags completed.
Predicted: human_resources
Confidence: 53.39%


# Job Recommendation based on predicted category

 ## Clean data

In [None]:
# Keep postings that have both a title and a company, one row per job_link,
# and renumber the index from zero.
jobs = (
    df2
    .dropna(subset=['job_title', 'company'])
    .drop_duplicates(subset=['job_link'])
    .reset_index(drop=True)
)

print(f"Final dataset: {len(jobs):,} unique jobs")

Final dataset: 1,348,443 unique jobs


In [None]:
# Match based on resume using job_title & job_skills
def resume_job_similarity(resume_text, jobs_df, top_n=500):
    """Rank the rows of ``jobs_df`` by TF-IDF cosine similarity to a resume.

    Args:
        resume_text: Resume text; lower-cased before vectorising.
        jobs_df: Candidate jobs. Must contain ``job_title``; ``job_skills`` is
            appended to the title text when that column is present. The frame
            passed in is left unmodified.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame holding the ``top_n`` highest-scoring rows of ``jobs_df``
        (best first) with added ``job_text`` and ``resume_score`` columns. An
        empty ``jobs_df`` yields an empty frame that only gains ``resume_score``.
    """
    print("Resume-Job Similarity Matching")

    # Score the frame the caller passed in. This used to copy the notebook-level
    # `jobs` table, which silently ignored the `jobs_df` argument.
    jobs_df = jobs_df.copy()

    # Nothing to compare against: cosine_similarity cannot take zero job rows.
    if len(jobs_df) == 0:
        jobs_df['resume_score'] = pd.Series(dtype='float64')
        return jobs_df

    # Text per job: title, plus skills when that column is available
    if 'job_skills' in jobs_df.columns:
        jobs_df['job_text'] = (
            jobs_df['job_title'].fillna('') + ' ' +
            jobs_df['job_skills'].fillna('')
        )
    else:
        jobs_df['job_text'] = jobs_df['job_title'].fillna('')

    jobs_df['job_text'] = jobs_df['job_text'].str.lower()

    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2)
    )

    # Fit on resume + jobs together so both share one vocabulary;
    # row 0 of the matrix is the resume, the remaining rows are the jobs.
    all_texts = [resume_text.lower()] + jobs_df['job_text'].tolist()
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    resume_vector = tfidf_matrix[0:1]
    job_vectors = tfidf_matrix[1:]
    similarities = cosine_similarity(resume_vector, job_vectors)[0]
    jobs_df['resume_score'] = similarities
    result = jobs_df.nlargest(top_n, 'resume_score')

    print(f"Calculated resume similarity (top score: {result['resume_score'].max():.4f})")

    return result

In [None]:
# Match jobs by category using TF-IDF on job_title
def tfidf_job_matching(jobs_df, predicted_category, top_n=500):
    """Score job titles against the predicted category with TF-IDF cosine similarity.

    Args:
        jobs_df: Jobs to score; must contain ``job_title``. Not modified.
        predicted_category: Category label; underscores are replaced by spaces
            to form the query text.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with the ``top_n`` rows having the largest ``tfidf_score``,
        including the added ``job_text`` and ``tfidf_score`` columns.
    """
    query_text = predicted_category.replace('_', ' ')

    scored_jobs = jobs_df.copy()
    scored_jobs['job_text'] = scored_jobs['job_title'].fillna('').str.lower()

    title_vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
    )

    # Query and titles are vectorised together so they share one vocabulary;
    # row 0 is the query, every following row is a job title.
    corpus = [query_text, *scored_jobs['job_text'].tolist()]
    matrix = title_vectorizer.fit_transform(corpus)

    scored_jobs['tfidf_score'] = cosine_similarity(matrix[0:1], matrix[1:])[0]
    return scored_jobs.nlargest(top_n, 'tfidf_score')

In [None]:
# Combines both (60% category ,40% resume)
def job_matching(jobs_df, predicted_category, resume_text=None, top_n=100):
    """Blend category-based and resume-based scores into a final job ranking.

    The 1000 best title matches for the category are taken first. When a resume is
    given, its similarity scores are joined back on ``job_link`` (jobs without a
    score get 0) and the final score is 60% normalised category score plus 40%
    normalised resume score; otherwise the normalised category score is used alone.

    Args:
        jobs_df: Jobs to search.
        predicted_category: Category label used for the title match.
        resume_text: Optional resume text for personalisation.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with the ``top_n`` rows having the largest ``final_score``.
    """
    def _scale_by_max(scores):
        # Divide by the column maximum; fall back to 0 when that maximum is not positive.
        peak = scores.max()
        return scores / peak if peak > 0 else 0

    print(f"Job Matching: {predicted_category}")

    print("TF-IDF semantic matching")
    result = tfidf_job_matching(jobs_df, predicted_category, top_n=1000).copy()

    if resume_text:
        print("\nResume-to-job similarity")
        resume_matches = resume_job_similarity(resume_text, result, top_n=len(result))

        # Bring the resume scores back onto the category matches
        result = result.merge(
            resume_matches[['job_link', 'resume_score']],
            on='job_link',
            how='left',
        )
        result['resume_score'] = result['resume_score'].fillna(0)

        result['tfidf_norm'] = _scale_by_max(result['tfidf_score'])
        result['resume_norm'] = _scale_by_max(result['resume_score'])
        result['final_score'] = (
            0.60 * result['tfidf_norm'] +
            0.40 * result['resume_norm']
        )
    else:
        # No resume: the category score alone decides the ranking
        print("\nNo resume provided, skipping personalization.")
        result['tfidf_norm'] = _scale_by_max(result['tfidf_score'])
        result['final_score'] = result['tfidf_norm']

    result = result.nlargest(top_n, 'final_score')

    print(f"\n{'='*80}")
    print(f"✓ FINAL: {len(result)} recommendations generated")
    print(f"  Top score: {result['final_score'].max():.4f}")
    print(f"  Average score: {result['final_score'].mean():.4f}")

    return result

In [None]:
def complete_recommendation_pipeline(resume_text,
                                    predicted_category,
                                    confidence,
                                    jobs_df,
                                    top_n=20):
    """Run job matching and package the ranked results for display.

    Args:
        resume_text: Resume text forwarded to ``job_matching``.
        predicted_category: Category label forwarded to ``job_matching``.
        confidence: Classifier confidence as a fraction (printed as a percentage).
        jobs_df: Jobs to search.
        top_n: Number of recommendations to keep.

    Returns:
        dict with ``predicted_category``, ``confidence``, ``method``,
        ``total_recommendations`` and ``recommendations`` (a DataFrame with a
        1-based ``rank`` column first and ``final_score`` renamed to ``match_score``).
    """
    print(f"\n{'#'*80}")
    print(f"Predicted Category: {predicted_category}")
    print(f"Confidence: {confidence * 100:.2f}%")
    print(f"{'#'*80}\n")

    ranked = job_matching(jobs_df, predicted_category, resume_text, top_n)

    # Keep only the display columns that are actually present, in this order
    wanted = ('job_link', 'job_title', 'company', 'job_location', 'final_score')
    present = [col for col in wanted if col in ranked.columns]

    table = ranked[present].rename(columns={'final_score': 'match_score'})
    table.insert(0, 'rank', range(1, len(table) + 1))

    return {
        'predicted_category': predicted_category,
        'confidence': confidence,
        'method': 'automated (TF-IDF Title + Resume)',
        'total_recommendations': len(table),
        'recommendations': table
    }

In [None]:
def display_recommendations(result, show_top=10):
    """Print a summary header followed by the first ``show_top`` recommendations.

    Args:
        result: dict with ``predicted_category``, ``confidence``,
            ``total_recommendations`` and ``recommendations`` (a DataFrame that has
            ``rank``, ``job_title``, ``company`` and ``job_link`` columns;
            ``job_location`` and ``match_score`` are optional).
        show_top: Number of rows to print.
    """
    print(f"\n{'='*80}")
    print("JOB RECOMMENDATIONS SUMMARY")
    print(f"{'='*80}")
    print(f"Predicted Role: {result['predicted_category']}")
    print(f"Confidence: {result['confidence'] * 100:.2f}%")
    print(f"Total Recommendations: {result['total_recommendations']}")
    print(f"{'='*80}\n")

    top_rows = result['recommendations'].head(show_top)

    print(f"TOP {show_top} RECOMMENDED JOBS:\n")

    for _, row in top_rows.iterrows():
        rank, title, company = row['rank'], row['job_title'], row['company']
        # Optional columns fall back to placeholders when absent
        location = row.get('job_location', 'N/A')
        score = row.get('match_score', 0)

        print(f"{rank}. {title}")
        print(f"   Company: {company}")
        print(f"   Location: {location}")
        print(f"   Match Score: {score:.4f}")
        # Links are cut to 80 characters and always followed by an ellipsis
        print(f"   Link: {row['job_link'][:80]}...")
        print()

In [None]:
# NOTE(review): `result` is rebound below from the prediction dict to the pipeline output.
# Re-running this cell keeps working only because the pipeline output also carries
# 'predicted_category' and 'confidence'; distinct names would make the cells idempotent.
predicted_category = result['predicted_category']
confidence = result['confidence']
result = complete_recommendation_pipeline(
    resume_text= sample_resume,
    predicted_category=predicted_category,
    confidence=confidence,
    jobs_df=jobs,
    top_n=20
)

# Print the summary header and the ten best-ranked jobs.
display_recommendations(result, show_top=10)


################################################################################
Predicted Category: human_resources
Confidence: 53.39%
################################################################################

Job Matching: human_resources
TF-IDF semantic matching

Resume-to-job similarity
Resume-Job Similarity Matching
Calculated resume similarity (top score: 0.7443)

✓ FINAL: 20 recommendations generated
  Top score: 0.6000
  Average score: 0.5889

JOB RECOMMENDATIONS SUMMARY
Predicted Role: human_resources
Confidence: 53.39%
Total Recommendations: 20

TOP 10 RECOMMENDED JOBS:

1. Human Resources BP
   Company: Schindler Group
   Location: Atlanta, GA
   Match Score: 0.6000
   Link: https://www.linkedin.com/jobs/view/human-resources-bp-at-schindler-group-3745134...

2. Human Resources BP
   Company: Schindler Elevator Corporation (U.S.)
   Location: Atlanta, GA
   Match Score: 0.6000
   Link: https://www.linkedin.com/jobs/view/human-resources-bp-at-schindler-elevator-corp...