# Get Data

In [1]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from vertexai.language_models import TextEmbeddingModel
import vertexai
import re
import time

# STEP 1: Extract job URLs from the search page
#
# Loads the Microsoft careers search results in headless Chrome, reads every
# result cell, and builds one job-detail URL per cell into `search_list`.

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

url = "https://jobs.careers.microsoft.com/global/en/search?q=engineer&lc=Australia&l=en_us&pg=1&pgSz=20&o=Recent&flt=true"

search_list = []
try:
    driver.get(url)

    # Block until at least one result cell is present in the DOM.
    wait = WebDriverWait(driver, 20)
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-automationid="ListCell"]')))

    job_cells = driver.find_elements(By.CSS_SELECTOR, '[data-automationid="ListCell"]')

    for job_cell in job_cells:
        cell_html = job_cell.get_attribute('outerHTML')
        soup = BeautifulSoup(cell_html, 'html.parser')

        # Extract job number from aria-label="Job item XXXXXX"
        job_div = soup.select_one('div[aria-label^="Job item"]')
        if not job_div:
            continue
        aria_label = job_div.get('aria-label', '')
        parts = aria_label.split()
        job_number = parts[-1] if parts else None
        if not job_number:
            continue

        # Extract job name from the h2 tag.
        # NOTE(review): this class name looks auto-generated and may change
        # between site builds -- adjust the selector if no titles are found.
        title_el = soup.select_one('h2.MZGzlrn8gfgSs8TZHhv2')
        if not title_el:
            continue
        job_name = title_el.get_text(strip=True)

        # Format job name for URL (replace whitespace runs with dashes)
        job_name_url = re.sub(r'\s+', '-', job_name)

        # Construct the job detail URL
        job_url = f"https://jobs.careers.microsoft.com/global/en/job/{job_number}/{job_name_url}"
        search_list.append(job_url)
finally:
    # Always release the browser, even if the page load or the wait fails.
    driver.quit()

# STEP 2: Navigate to each job detail page and extract details
#
# Visits the first MAX_JOBS URLs from `search_list`, scrapes the text sections
# of each detail page into `sorted_info_list`, and records URLs whose page
# could not be parsed in `error_list`.

from selenium.common.exceptions import WebDriverException

MAX_JOBS = 10               # how many URLs from search_list to visit
REQUEST_DELAY_SECONDS = 10  # pause before every page load


def _flat_text(driver, by, selector):
    """Return the located element's text with newlines replaced by spaces."""
    return driver.find_element(by, selector).text.replace("\n", " ")


chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 30)

job_details_list = []
error_list = []
embedded_list = []
# Scraped content, keyed by job URL:
# {job_url: [overview, qualifications, responsibilities, date posted, role type]}
sorted_info_list = {}
try:
    for job_url in search_list[:MAX_JOBS]:
        time.sleep(REQUEST_DELAY_SECONDS)
        driver.get(job_url)
        print(job_url)
        try:
            # Wait until the job details card is present before reading fields.
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class*="SearchJobDetailsCard"]'))
            )

            # NOTE(review): job_title is looked up but not stored in job_info;
            # a missing title still marks the page as failed, as before.
            job_title = _flat_text(driver, By.CSS_SELECTOR, "div.SearchJobDetailsCardViewHelper ~ h1")
            overview = _flat_text(driver, By.XPATH, "//h3[text()='Overview']/following-sibling::div")
            qualifications = _flat_text(driver, By.XPATH, "//h3[text()='Qualifications']/following-sibling::div")
            responsibilities = _flat_text(driver, By.XPATH, "//h3[text()='Responsibilities']/following-sibling::div")
            posted_date = _flat_text(driver, By.XPATH, "//div[text()='Date posted']/following-sibling::div")
            role_type = _flat_text(driver, By.XPATH, "//div[text()='Role type']/following-sibling::div")
        except WebDriverException as exc:
            # Covers wait timeouts and missing elements without swallowing
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            print(f"Could not find the detail element for {job_url}.")
            print(f"Reason: {type(exc).__name__}")
            error_list.append(job_url)
            print("-" * 50)
        else:
            job_info = [overview, qualifications, responsibilities, posted_date, role_type]
            sorted_info_list[job_url] = job_info
finally:
    # Always release the browser, even if a page load fails unexpectedly.
    driver.quit()



https://jobs.careers.microsoft.com/global/en/job/1780627/Electrical-Engineer---Data-Centre
https://jobs.careers.microsoft.com/global/en/job/1780623/Mechanical-Engineer---Data-Centre
https://jobs.careers.microsoft.com/global/en/job/1792391/Technical-Support-Engineering---Intune
https://jobs.careers.microsoft.com/global/en/job/1797408/Senior-Technical-Support-Engineer---Azure
https://jobs.careers.microsoft.com/global/en/job/1796073/Senior-Software-Engineer
https://jobs.careers.microsoft.com/global/en/job/1793791/Senior-Lease-Contruction-Manager
https://jobs.careers.microsoft.com/global/en/job/1796560/Azure-ACE-Engineer
https://jobs.careers.microsoft.com/global/en/job/1794043/Software-Engineer-II
https://jobs.careers.microsoft.com/global/en/job/1793352/Datacenter-Engineering-Principal-Telecom-Engineer-–-APAC
https://jobs.careers.microsoft.com/global/en/job/1792955/DCE-Lease-Mechanical-Engineer


In [3]:
sorted_info_list

{'Electrical Engineer - Data Centre': ["In alignment with our Microsoft values, we are committed to cultivating an inclusive work environment for all employees to positively impact our culture every day and we need you as a Critical Environment Electrical Engineer.  Microsoft’s Cloud Operations & Innovation (CO+I) is the engine that powers our cloud services. As a CO+I Electrical Engineer, you will perform a key role in delivering the core infrastructure and foundational technologies for Microsoft's online services including Bing, Office 365, Xbox, OneDrive, and the Microsoft Azure platform. As a group, CO+I is focused on the personal and professional development for all employees and offers trainings and growth opportunities including Career Rotation Programs, Diversity & Inclusion trainings and events, and professional certifications.  Our infrastructure is comprised of a large global portfolio of more than 200 datacenters in 32 countries and millions of servers. Our foundation is bu

# Embedding

## 1. Custom Dimension

In [None]:
import os
from google.oauth2 import service_account
import vertexai
from vertexai.preview.language_models import TextEmbeddingModel, TextEmbeddingInput

# 1. Initialization
project_id = "core-sprite-445503-a2"
SERVICE_ACCOUNT_JSON = "key.json"

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_JSON)

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
vertexai.init(project=project_id, location=LOCATION, credentials=credentials)

# 2. Model Configuration
dimensionality = 768  # The dimensionality of the output embeddings
task = "RETRIEVAL_DOCUMENT"  # The task for embedding. Example: SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING, RETRIEVAL_QUERY, QUESTION_ANSWERING, FACT_VERIFICATION

model = TextEmbeddingModel.from_pretrained("text-embedding-005")

# 3. Prepare Input Text
# Each value of sorted_info_list is
# [overview, qualifications, responsibilities, date posted, role type]
# (see the scraping step). Only the qualifications text is embedded, since it
# is compared against a qualifications-style resume snippet below.
QUALIFICATIONS_INDEX = 1
job_texts = [job[QUALIFICATIONS_INDEX] for job in sorted_info_list.values()]

sample_qualification_text = (
    '''Programming skills: Python, Ruby on Rail, JAVA, Node.JS, SQL, PHP, C#.
    Web skills: Django with Python, AWS, Express with Node.JS, HTML5, SCSS, YiiFramework, NetSuite, Ecommerce, Flutter.
    Software skills: Linux Operation, Android development.
    Soft skills: Problem Solving, Independent learning, Requirement analysis from customer.'''
)

# 4. Create EmbeddingInput Objects
# NOTE(review): the resume text is embedded with the same task type as the
# job texts; confirm whether a query-side or similarity task type is intended.
job_inputs = [TextEmbeddingInput(text, task) for text in job_texts]
sample_input = [TextEmbeddingInput(sample_qualification_text, task)]

# 5. Pass optional kwargs if needed
kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}

# 6. Get Embeddings from the Model
# Skip the job request entirely when nothing was scraped, rather than sending
# an empty batch to the service.
job_embeddings = model.get_embeddings(job_inputs, **kwargs) if job_inputs else []  # list of Embedding objects
resume_embedding = model.get_embeddings(sample_input, **kwargs)[0]  # single Embedding object

# 7. Extract Embedding Values
# embedded_list keeps the same order as sorted_info_list.values().
embedded_list = [emb.values for emb in job_embeddings]
resume_embedding_values = resume_embedding.values

# 8. Print results
print("Job Embeddings:", embedded_list)
print("Resume Embedding:", resume_embedding_values)

Job Embeddings: [[-0.03150669112801552, -0.024175262078642845, -0.016502900049090385, -0.011113845743238926, 0.010463858023285866, 0.018997645005583763, -0.018657224252820015, 0.015455497428774834, 0.022020820528268814, -0.022415289655327797, -0.010453227907419205, -0.06879480928182602, 0.03674623742699623, -0.05288606137037277, 0.08605048060417175, 0.01538248173892498, 0.03417185693979263, -0.06726362556219101, -0.03939540311694145, -0.03725774958729744, 0.06964399665594101, -0.04261791706085205, -0.06694038957357407, -0.06496327370405197, -0.0052953423000872135, -0.06052738428115845, 0.09271311014890671, -0.0010931318392977118, 0.04679969698190689, 0.007791459094733, 0.08335044980049133, 0.02067439816892147, 0.03254356235265732, -0.055112216621637344, 0.019428761675953865, -0.031554896384477615, 0.010128730908036232, 0.021687032654881477, -0.024144930765032768, -0.0006050160736776888, 0.010125512257218361, -0.006046109367161989, -0.01784755475819111, -0.04466446861624718, -0.02340978

## 2. Similarity

In [17]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    Parameters:
    vec1 (array-like): First vector.
    vec2 (array-like): Second vector.

    Returns:
    float: dot(vec1, vec2) divided by the product of the two vector norms.

    Raises:
    ValueError: If either vector has zero magnitude.
    """
    a = np.array(vec1)
    b = np.array(vec2)

    # Numerator first, then the two magnitudes for the denominator.
    numerator = np.dot(a, b)
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)

    # A zero-length vector has no direction, so the ratio is undefined.
    if not (magnitude_a != 0 and magnitude_b != 0):
        raise ValueError("One of the vectors is zero-vector; cannot compute cosine similarity.")

    return numerator / (magnitude_a * magnitude_b)

## 3. Rank

In [18]:
from collections import OrderedDict

def rank_jobs_by_similarity_descending_dict(
    sorted_info_list,
    embedded_list,
    resume_embedding_values
):
    """
    Score every job against the resume and return the scores, best match first.

    Parameters:
    -----------
    sorted_info_list : dict
        Mapping whose keys are job URLs. Only the keys (and their order)
        are used here; the values are ignored.
    embedded_list : list
        One embedding per job, in the same order as the keys of
        `sorted_info_list`.
    resume_embedding_values : list or np.ndarray
        The embedding of the resume text.

    Returns:
    --------
    dict
        An OrderedDict of {job_url: similarity_score}, ordered from the
        highest cosine similarity to the lowest.
    """
    # Walk the URLs and embeddings in lockstep and score each pair.
    scored = [
        (job_url, cosine_similarity(job_embedding, resume_embedding_values))
        for job_url, job_embedding in zip(sorted_info_list, embedded_list)
    ]

    # Highest similarity first; ties keep their original relative order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return OrderedDict(ranked)

# Example usage: rank the scraped jobs against the resume embedding.
similarity_dict = rank_jobs_by_similarity_descending_dict(
    sorted_info_list=sorted_info_list,
    embedded_list=embedded_list,
    resume_embedding_values=resume_embedding_values,
)

# The result looks like:
# OrderedDict([
#   ("https://joburl.com/job_1", 0.85),
#   ("https://joburl.com/job_2", 0.77),
#   ...
# ])
#
# If you prefer a regular dict (and you're on Python 3.7+), convert it with:
# dict(similarity_dict)
# which keeps the same descending order because dicts preserve insertion order.


In [20]:
similarity_dict

OrderedDict([('https://jobs.careers.microsoft.com/global/en/job/1796073/Senior-Software-Engineer',
              np.float64(0.729828944329994)),
             ('https://jobs.careers.microsoft.com/global/en/job/1797408/Senior-Technical-Support-Engineer---Azure',
              np.float64(0.7298221509590385)),
             ('https://jobs.careers.microsoft.com/global/en/job/1794043/Software-Engineer-II',
              np.float64(0.7239041464181559)),
             ('https://jobs.careers.microsoft.com/global/en/job/1792391/Technical-Support-Engineering---Intune',
              np.float64(0.7055147849518414)),
             ('https://jobs.careers.microsoft.com/global/en/job/1796560/Azure-ACE-Engineer',
              np.float64(0.698493257044475)),
             ('https://jobs.careers.microsoft.com/global/en/job/1780627/Electrical-Engineer---Data-Centre',
              np.float64(0.6665733099652597)),
             ('https://jobs.careers.microsoft.com/global/en/job/1793791/Senior-Lease-Contruction