In [6]:
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

from dotenv import load_dotenv

import requests

In [108]:
# Google Careers search-results page for "software engineer", filtered by the
# query string to FULL_TIME roles at Google in India.
BASE_URL = (
    'https://www.google.com/about/careers/applications/jobs/results'
    '?q=%22software%20engineer%22&employment_type=FULL_TIME&company=Google&location=India'
)

# Pages that make up the knowledge base. Built from BASE_URL so the long URL
# literal exists in exactly one place and the two names cannot drift apart.
KNOWLEDGE_BASE_URLS = [BASE_URL]

# Prefix prepended to the relative job links scraped from the results page.
GOOGLE_JOB_LINK_PREFIX = 'https://www.google.com/about/careers/applications/'
# Scraped hrefs that start with this prefix are treated as job-posting links.
FETCHED_GOOGLE_JOB_URL_PREFIX = 'jobs/results/'

# Questions put to the LLM about a job posting.
MINIMUM_QUALIFICATION_QUESTION = 'What are the Minimum Qualifications in the job?'
PREFERRED_QUALIFICATION_QUESTION = 'What are the Preferred Qualifications in the job?'
JOB_RESPONSIBILITIES_QUESTION = 'What are the Responsibilities in the job?'

In [16]:
# Crawl outward from the search-results page (up to depth 2) and reduce each
# fetched page to its plain text via BeautifulSoup.
loader = RecursiveUrlLoader(
    url=BASE_URL,
    max_depth=2,
    extractor=lambda html: Soup(html, "html.parser").text,
)
docs = loader.load()

In [35]:
def get_all_urls(base_url, timeout=10):
    """Return the ``href`` of every anchor tag on the page at ``base_url``.

    Args:
        base_url: URL of the page to download and scan for links.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        A list of the non-empty ``href`` values in document order, exactly as
        they appear in the HTML (relative links are not resolved). An empty
        list is returned if the request fails, times out, or the server
        answers with a 4xx/5xx status; the error is printed rather than raised.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = Soup(response.text, 'html.parser')

    # Anchors without an href, or with an empty one, are skipped.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return [href for href in hrefs if href]


def get_google_job_urls(urls):
    """Select the job-posting links from ``urls`` and make them absolute.

    Args:
        urls: Iterable of href strings.

    Returns:
        A list with one entry for each input that starts with
        ``FETCHED_GOOGLE_JOB_URL_PREFIX``, in input order, each prefixed with
        ``GOOGLE_JOB_LINK_PREFIX``.
    """
    return [
        GOOGLE_JOB_LINK_PREFIX + candidate
        for candidate in urls
        if candidate.startswith(FETCHED_GOOGLE_JOB_URL_PREFIX)
    ]


In [27]:
def get_url_content(url):
    """Load the page at ``url`` with ``WebBaseLoader`` and return what it loads."""
    page_loader = WebBaseLoader(url)
    return page_loader.load()

In [None]:
# Collect every link found on the job search-results page and show them.
urls = get_all_urls(BASE_URL)
print(urls)

In [None]:
# Keep only the job-posting links (as absolute URLs) and report how many were found.
google_job_urls = get_google_job_urls(urls)
print(google_job_urls)
print(len(google_job_urls))

In [None]:
# Load the first job posting and display the first item returned by the loader.
# NOTE(review): google_job_urls[0] raises IndexError when no job links were
# found (e.g. get_all_urls returned [] after a failed request).
content = get_url_content(google_job_urls[0])
print(content[0])

In [77]:
def get_model(load_from_hugging_face=False):
    """Build the chat model used by the chains in this notebook.

    Args:
        load_from_hugging_face: When truthy, return a ``ChatHuggingFace``
            wrapper around a ``HuggingFaceEndpoint`` for
            ``openai/gpt-oss-120b``. Otherwise return ``ChatOpenAI``
            configured for ``gpt-4`` with temperature 0.0.

    Returns:
        The constructed chat model object.
    """
    if not load_from_hugging_face:
        return ChatOpenAI(model="gpt-4", temperature=0.0)

    endpoint = HuggingFaceEndpoint(
        repo_id="openai/gpt-oss-120b",
        task="text-generation",
        provider="auto",  # set your provider here
    )
    return ChatHuggingFace(llm=endpoint)

In [7]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API keys the model clients need — confirm.
load_dotenv()

True

In [109]:
def get_resume_content():
    """Return the full text of the Markdown resume.

    Reads ``./knowledge_base/resume.md`` relative to the current working
    directory. The file is opened in a ``with`` block so the handle is closed
    even if reading fails, and decoded explicitly as UTF-8 so the result does
    not depend on the platform's default encoding.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    with open('./knowledge_base/resume.md', 'r', encoding='utf-8') as profile:
        return profile.read()


def _ask_about_job_profile(job_profile_content, question):
    """Answer ``question`` about a job posting with the "rlm/rag-prompt" chain.

    Shared implementation behind the three ``get_*`` helpers below, which
    previously repeated this body verbatim with only the question changed.

    Args:
        job_profile_content: Job posting content, passed to the prompt as
            ``context``.
        question: Question passed to the prompt as ``question``.

    Returns:
        The chain's result after ``StrOutputParser``.
    """
    prompt = hub.pull("rlm/rag-prompt")
    llm = get_model(load_from_hugging_face=True)
    rag_chain = prompt | llm | StrOutputParser()

    return rag_chain.invoke(
        {"context": job_profile_content, "question": question})


def get_minimum_qualifications(job_profile_content):
    """Ask the model for the minimum qualifications listed in the job posting."""
    return _ask_about_job_profile(
        job_profile_content, MINIMUM_QUALIFICATION_QUESTION)


def get_preferred_qualification(job_profile_content):
    """Ask the model for the preferred qualifications listed in the job posting."""
    return _ask_about_job_profile(
        job_profile_content, PREFERRED_QUALIFICATION_QUESTION)


def get_job_responsibilities(job_profile_content):
    """Ask the model for the responsibilities listed in the job posting."""
    return _ask_about_job_profile(
        job_profile_content, JOB_RESPONSIBILITIES_QUESTION)


def analyze_resume_against_minimum_qualifications(resume_content, minimum_qual):
    """Have the chat model assess a resume against the minimum qualifications.

    Args:
        resume_content: Resume text, substituted into the ``{resume}`` slot.
        minimum_qual: Minimum qualifications, substituted into ``{min_qual}``.

    Returns:
        The chain's result after ``StrOutputParser``.
    """
    chat_model = get_model(load_from_hugging_face=True)

    # NOTE(review): MINIMUM_QUALIFICATION_PROMPT is not defined in any visible
    # cell of this notebook; it must exist before this function is called.
    human_template = "Minimum Qualification: \n\n {min_qual} \n\n Candidate's resume: {resume}"
    grade_prompt = ChatPromptTemplate.from_messages(
        [("system", MINIMUM_QUALIFICATION_PROMPT), ("human", human_template)]
    )

    grade_chain = grade_prompt | chat_model | StrOutputParser()
    return grade_chain.invoke({"min_qual": minimum_qual, "resume": resume_content})


In [103]:
# Grade the Markdown resume against the job's minimum qualifications.
# NOTE(review): `min_quals` is not assigned in any visible cell, so this fails
# on a fresh kernel unless it is defined first — presumably from
# get_minimum_qualifications(...); confirm.
resume_content = get_resume_content()
response = analyze_resume_against_minimum_qualifications(resume_content, min_quals)

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF resume page by page, collecting each loaded page in `pages`.
# NOTE(review): this rebinds `loader`, which an earlier cell assigned to a
# RecursiveUrlLoader.
# The top-level `async for` depends on the notebook kernel's top-level async
# support; it is a syntax error in a plain Python script.
loader = PyPDFLoader('knowledge_base/resume.pdf')
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [3]:
# Print the extracted text of each PDF page.
# NOTE(review): the saved output of this cell contains personal contact
# details; clear or redact it before sharing the notebook.
for page in pages:
    print(page.page_content)

Saurav Prateek
Gurugram, India +91 9911453471 srvptk97@gmail.com Linkedin | Github  
Professional Experience (5+ years)
 Google
Nov 2022 - April 2025Web Solutions Engineer - II (L4) | gTech (Google Technical Services)
Designed and developed the Ads Headroom Planning and Projection tool for Video based campaigns in Google Ads from 
scratch allowing Ads Specialists to project and analyse future performances of 500+ Video Campaigns and 100+ AdWords 
Accounts at once.
Implemented an asynchronous approach to import multiple AdWords Account in-parallel using PromiseGraphs (an async 
framework) and Fire-and-Forget pattern. This helped in successfully importing 15+ dimensions of Campaigns having across 
1 Million+ results within 30 seconds.
Designed and implemented SQL data pipeline from scratch for crunching Google Ads’ LCS and SMB pods data. The pipeline 
had 30+ nodes forming a graphical structure and processing 1.5 Million+ quarterly ads sales data.
Awarded a Spot Bonus from the Director o

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the local directory where your model files are stored
local_model_path = "../gpt-oss-20b"

# Load the tokenizer (currently disabled)
# tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Load the model, specifying the local path
# NOTE(review): the saved output of this cell shows
# "RuntimeError: Using MXFP4 quantized models requires a GPU", so this load
# did not succeed in the environment where the notebook was last run.
model = AutoModelForCausalLM.from_pretrained(local_model_path)

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Using MXFP4 quantized models requires a GPU

In [8]:
from langchain_mistralai import ChatMistralAI

# Chat client for the "mistral-large-latest" model, configured with
# temperature 0 and up to 2 retries.
# NOTE(review): no credentials are passed here; presumably the API key is read
# from the environment — confirm.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=2,
    # other params...
)

# Smoke test: send a single prompt and display the returned message.
llm.invoke("What is the capital of France?")  # Example usage

AIMessage(content='The capital of France is **Paris**. It is not only the political and administrative center of the country but also a major cultural, historical, and economic hub. Paris is famous for landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.', additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 10, 'total_tokens': 64, 'completion_tokens': 54}, 'model_name': 'mistral-large-latest', 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run--c8a4f63f-e207-4a6a-b076-0b4f73d4af0a-0', usage_metadata={'input_tokens': 10, 'output_tokens': 54, 'total_tokens': 64})