<a href="https://colab.research.google.com/github/StephenWalther/langgraph-job-finder/blob/main/LangGraph_Job_Finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LangGraph Job Finder
This app illustrates how to use LangGraph with a Large Language Model (LLM) to retrieve a list of jobs, generate a personalized cover letter for each job, and save the cover letters to Google Drive. It also illustrates using LangGraph with a human in the loop: at a crucial step (LinkedIn login verification) the app pauses to get input from the user.

In [None]:
# Settings
"""
  Notebook-wide configuration: Google Drive locations, the LinkedIn job
  search URL, and the system prompt used when generating cover letters.
"""

# Google Drive layout: everything lives beneath the app folder.
GOOGLE_DRIVE_PATH = '/content/drive'
APP_PATH = f'{GOOGLE_DRIVE_PATH}/MyDrive/langgraph-job-finder'
RESUME_PATH = f'{APP_PATH}/resume.pdf'
OUTPUT_PATH = f'{APP_PATH}/output'
PREVIOUS_PATH = f'{APP_PATH}/previous'

# LinkedIn search query, one already-URL-encoded parameter per entry.
_JOB_SEARCH_PARAMS = (
    "keywords=Product%20Manager%20AI",
    "location=Austin%2C%20Texas%2C%20United%20States",
    "f_WT=2%2C3",  # 2 is Remote, 3 is Hybrid
    "f_TPR=r604800",  # Posted in last week
    "position=1",
    "pageNum=0",
)
JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?" + "&".join(_JOB_SEARCH_PARAMS)


# Sent as the system message of every cover-letter request.
COVER_LETTER_INSTRUCTIONS = """
  You are a helpful assistant that writes cover letter used in an email. Do not
  include placeholders. Keep the cover letter concise (no more than 3 paragraphs).
  Do not mention the job ID in the cover letter. Lead with the reason that I am the
  best match for the job given my resume and the job description. Do not make up
  any qualifications that are not explicitly listed in the resume. Always start
  with a salutation.
  """

In [None]:
# Get secrets

from google.colab import userdata


def _require_secret(name):
  """Return the Colab secret called `name`, raising ValueError if it is None."""
  value = userdata.get(name)
  if value is None:
    raise ValueError(f'{name} not set in secrets')
  return value


# LinkedIn credentials used by the login node
LINKEDIN_EMAIL = _require_secret('LINKEDIN_EMAIL')
LINKEDIN_PASSWORD = _require_secret('LINKEDIN_PASSWORD')

# API keys for the three LLM providers
OPENAI_API_KEY = _require_secret('OPENAI_API_KEY')
ANTHROPIC_API_KEY = _require_secret('ANTHROPIC_API_KEY')
GOOGLE_API_KEY = _require_secret('GOOGLE_API_KEY')

In [None]:
# Install packages

%pip install -U langchain_openai
%pip install -U langchain_anthropic
%pip install -U langchain-google-genai
%pip install -U openai
%pip install -U pdfplumber
%pip install -U reportlab
%pip install PyPDF2

!apt-get update
!apt-get install -y xvfb
!apt-get install -y x11-xserver-utils
!apt-get install -y xephyr


%pip install pyvirtualdisplay

%pip install -U langgraph langsmith
%pip install -U openai
%pip install -U playwright
!playwright install

In [None]:
# Enable virtual display for headful browser
"""
  LinkedIn will act differently when accessed with a headless browser.
  Therefore, we start a 3840x2160 virtual display (the same size as the
  browser viewport requested in the Execute Graph cell) so that the browser
  can be launched headful.
"""

from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
import os

# Deliberately NOT named `display`: that name is the notebook's IPython
# display() function, which screen_shot() calls to render screenshots inline.
virtual_display = Display(visible=1, size=(3840, 2160), backend='xvfb', extra_args=['-ac'])
virtual_display.start()

In [None]:
# Mount Google Drive
"""
  We are using Google Drive as a database (persistent data store)
"""
from google.colab import drive
drive.mount(GOOGLE_DRIVE_PATH)

In [None]:
# Create LangGraph App folder
"""
  Create the standard Google Drive folders:
    /langgraph-job-finder
    /langgraph-job-finder/output
    /langgraph-job-finder/previous
"""
from google.colab import drive
import os

# APP_PATH comes first so the two sub-folders are reported individually.
# OUTPUT_PATH is included so that all three folders listed above really exist
# after this cell runs.
for folder in (APP_PATH, OUTPUT_PATH, PREVIOUS_PATH):
  if os.path.exists(folder):
    print(f"Folder '{folder}' already exists.")
  else:
    os.makedirs(folder)
    print(f"Folder '{folder}' created successfully.")

In [None]:
# Retrieve Resume
"""
  If a resume.pdf does not exist in the app folder, prompt the user
  to upload one.
"""
from google.colab import files
import os

def load_or_upload_resume(resume_path):
  """Loads a resume from the given path or prompts the user to upload one.

  If nothing exists at `resume_path`, the Colab upload dialog is shown and the
  first uploaded file is written to `resume_path` (whatever its original name).

  Args:
    resume_path: The path to the resume file.

  Returns:
    The path to the loaded or uploaded resume file, or None if the upload
    dialog returned no file.
  """
  if os.path.exists(resume_path):
    print(f"Resume found at '{resume_path}'. Loading...")
    return resume_path

  print(f"Resume not found at '{resume_path}'. Please upload your resume:")
  uploaded = files.upload()

  # Upload cancelled / nothing selected: say so instead of silently
  # falling off the end of the function.
  if not uploaded:
    print("No file was uploaded; the resume is still missing.")
    return None

  # Only one resume can be stored, so extra files are ignored (and reported).
  if len(uploaded) > 1:
    print("More than one file was uploaded; only the first one is used.")

  content = next(iter(uploaded.values()))
  with open(resume_path, 'wb') as f:
    f.write(content)
  print(f"Resume saved to '{resume_path}'.")
  return resume_path

resume_path = load_or_upload_resume(RESUME_PATH)
print(f"Using resume from: {resume_path}")


In [None]:
# Create Google Drive folder for today
"""
  Create a new folder with today's date as the name to store all of the
  generated cover letters.
"""
import os
import datetime

def create_dated_folder(output_path):
  """Makes sure a folder named after today's date exists under output_path.

  The folder name is today's local date formatted as YYYY-MM-DD. Missing
  parent folders are created as well.

  Args:
    output_path: The base path where the folder should be created.

  Returns:
    The path to the dated folder (whether it was just created or already there).
  """
  dated_dir = os.path.join(output_path, datetime.date.today().strftime('%Y-%m-%d'))

  # Nothing to do if an earlier run today already made the folder.
  if os.path.exists(dated_dir):
    print(f"Folder '{dated_dir}' already exists.")
    return dated_dir

  os.makedirs(dated_dir)
  print(f"Folder '{dated_dir}' created successfully.")
  return dated_dir

# create folder:
today_folder_path = create_dated_folder(OUTPUT_PATH)
print(f"Today's output folder: {today_folder_path}")

In [None]:
# Global functions
"""
  Functions that are used throughout the notebook.
"""

async def screen_shot(page, name, show=True):
  """Save a screenshot and the full HTML of `page`, optionally showing the image.

  Args:
    page: Object whose awaitable screenshot(path=...) and content() methods
      are called (the Playwright page in this notebook).
    name: Base name of the output files; "<name>.png" and "<name>.html" are
      written (relative to the current working directory if `name` is relative).
    show: When true, the PNG is rendered inline in the notebook output.
  """
  # Imported locally so this helper does not depend on a later cell having
  # run, nor on the global name `display`, which another cell rebinds.
  from IPython.display import Image, display

  # generate and display image
  png_path = f"{name}.png"
  await page.screenshot(path=png_path)
  if show:
    display(Image(filename=png_path))

  # save complete HTML
  html = await page.content()
  with open(f"{name}.html", "w", encoding="utf-8") as f:
    f.write(html)


def get_cover_letter_path_txt(job_folder_path, provider):
  """Return the path of `provider`'s plain-text cover letter inside `job_folder_path`."""
  file_name = "{}_cover_letter.txt".format(provider)
  return os.path.join(job_folder_path, file_name)

def get_cover_letter_path_pdf(job_folder_path, provider):
  """Return the path of `provider`'s PDF cover letter inside `job_folder_path`."""
  file_name = "{}_cover_letter.pdf".format(provider)
  return os.path.join(job_folder_path, file_name)

def get_resume_with_cover_letter_path_pdf(job_folder_path, provider):
  """Return the path of `provider`'s merged cover-letter-plus-resume PDF inside `job_folder_path`."""
  file_name = "{}_resume_with_cover_letter.pdf".format(provider)
  return os.path.join(job_folder_path, file_name)

In [None]:
# Define LangGraph State
"""
  The state schema below is defined for the graph, but the nodes did not end
  up using it: Google Drive is used as the persistent state instead (which
  worked better for this scenario).
"""

from langgraph.graph import StateGraph, START, END
from typing_extensions import Annotated, TypedDict, List, Optional

class Job(TypedDict):
    """Schema for a single job posting held in the graph state.

    NOTE(review): no node visible in this notebook reads or writes Job values,
    so the field meanings below are inferred from the names only -- confirm
    before relying on them.
    """
    title: str  # presumably the job title
    company: str  # presumably the hiring company's name
    description: str  # presumably the job description text
    url: str  # presumably a link to the posting -- TODO confirm which URL
    isGoodFit: Optional[bool]  # optional flag; looks like a fit verdict -- TODO confirm
    coverLetter: Optional[str]  # optional; presumably the generated cover letter text
    isApproved: Optional[bool]  # optional flag; looks like a human approval marker -- TODO confirm

class State(TypedDict):
    """State schema handed to StateGraph when the graph builder is created."""
    resume: str  # NOTE(review): content (text vs. path) unclear; the initial state built in run_graph omits this key
    jobs: List[Job]  # list of Job records; run_graph starts the graph with an empty list


graph_builder = StateGraph(State)

In [None]:
# LangGraph Node - login - Login to LinkedIn
"""
  You need an existing LinkedIn email and password stored in secrets. This node
  will request a LinkedIn verification PIN from your email if LinkedIn
  requires it.
"""

async def login(state: State):
  """LangGraph node: sign in to LinkedIn using the credentials from secrets.

  Drives the global Playwright `page`: opens the login form, submits
  LINKEDIN_EMAIL / LINKEDIN_PASSWORD, and, if a PIN input is present
  afterwards, asks the user to type the PIN and submits it. A screenshot is
  taken after each navigation or submit step.

  Args:
    state: Graph state; not read by this node.

  Returns:
    None (the node makes no state update).
  """
  print("login node started")

  # Open the sign-in form
  await page.goto('https://www.linkedin.com/login')
  await screen_shot(page, "linkedin_login")
  print("Navigated to LinkedIn login page")

  # Enter email and password
  await page.fill('#username', LINKEDIN_EMAIL)
  await page.fill('#password', LINKEDIN_PASSWORD)

  # Short randomized pause (500-1000) before submitting the form
  await page.wait_for_timeout(random.uniform(500, 1000))
  await page.click('button[type="submit"]')
  await screen_shot(page, "linkedin_login_submit")
  print("Clicked authenticate button")

  # A verification step is recognised by the presence of <input name="pin">
  pin_required = await page.locator('input[name="pin"]').count() > 0
  if not pin_required:
      print("The input element with name='pin' does not exist.")
  else:
      print("The input element with name='pin' exists.")

      # Human in the loop: the user types the code from the email
      pin = input("Enter LinkedIn PIN code from email")
      await page.fill('input[name="pin"]', pin)

      # Submit the PIN form
      await page.click('button[type="submit"]')
      await screen_shot(page, "pin_submit")
      print("Clicked pin button")

  print("login node done")
  return None

In [None]:
# LangGraph Node - fetch_jobs - Fetch job list using LinkedIn job search criteria
"""
  Navigate to job search results page and retrieve a list of jobs. Matching
  jobs are stored in your Google Drive.
"""
import os
from playwright.async_api import async_playwright
import random
from IPython.display import Image

def extract_job_id(url: str) -> "str | None":
    """
    Extract the job ID from a LinkedIn job URL.

    Example URL: /jobs/view/4093827142/?eBP=...
    Returns: '4093827142'

    The ID is the path segment that follows the first 'view' segment. Only the
    path of the URL is inspected, so a 'view' that appears inside the query
    string or fragment is never mistaken for the job path.

    Returns None when `url` is None, empty or not a string, when it cannot be
    parsed, or when no non-empty segment follows 'view'.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url:
        return None

    try:
        # Dropping the query/fragment first keeps '?...' out of the segments
        path = urlsplit(url).path
    except ValueError as e:
        print(f"Error extracting job ID: {e}")
        return None

    segments = path.strip('/').split('/')
    if 'view' not in segments:
        return None

    after_view = segments[segments.index('view') + 1:]
    # An empty ID would later produce file names such as '.html', so it is
    # reported as "not found" rather than returned.
    if not after_view or not after_view[0]:
        return None
    return after_view[0]


async def fetch_jobs(state):
    """LangGraph node: open the job search results and save each new job's details.

    For every job link on the results page whose ID has no marker file in
    PREVIOUS_PATH, the link is clicked and the inner HTML of the job details
    pane is written twice: to "<today_folder_path>/<job_id>_job_details.html"
    and to "<PREVIOUS_PATH>/<job_id>.html" (the "already seen" marker).

    Links whose job ID cannot be extracted are skipped. Otherwise they would
    all be stored under the same placeholder name and, after the first one,
    be treated as already seen.

    Args:
      state: Graph state; not read by this node.

    Returns:
      None (the node makes no state update).
    """
    print("fetch_jobs node started")

    # Navigate to Jobs page with search parameters
    # Using URL parameters for more precise control
    await page.goto(url=JOB_SEARCH_URL, wait_until='domcontentloaded')
    print("Navigated to job search results")

    # save as jobs
    await screen_shot(page, "jobs")

    # Get links
    await page.wait_for_timeout(random.uniform(500, 1000))
    job_links = await page.locator('a.job-card-container__link').all()
    print("Getting each job link. found " + str(len(job_links)) + " links")

    # Click on each job link
    for job_link in job_links:
        job_id = extract_job_id(await job_link.get_attribute('href'))
        print(f"Job ID: {job_id}")

        # Without an ID there is no stable file name for this job
        if not job_id:
            print("Skipping job link because no job ID could be extracted")
            continue

        # if the job id is already in the PREVIOUS folder, skip
        previous_marker_path = f"{PREVIOUS_PATH}/{job_id}.html"
        if os.path.exists(previous_marker_path):
            print(f"Skipping job ID: {job_id} because it already exists")
            continue

        # Before clicking, check for and dismiss any modal overlays:
        try:
            dismiss_button = page.locator('button[aria-label="Dismiss"] >> visible=true')
            if await dismiss_button.is_visible():
                await dismiss_button.click()
                print("Dismissed modal overlay with aria-label 'Dismiss'")
                await page.wait_for_timeout(1000)
        except Exception as e:
            # Best effort: if the overlay cannot be handled, give up on this
            # job and move on to the next link.
            print(f"Error handling modal overlay: {e}")
            continue

        # Now attempt the click on the job link (short randomized pause first)
        await page.wait_for_timeout(random.uniform(500, 1000))
        await job_link.click(timeout=60000)  # 60 second timeout
        print("Clicked job link")
        await screen_shot(page, f"job_link_clicked_{job_id}")

        # get job details
        job_details = await page.locator('div.jobs-details').inner_html()
        print("got job details")

        # Save job details to file
        with open(f"{today_folder_path}/{job_id}_job_details.html", "w", encoding="utf-8") as f:
            f.write(job_details)

        # Add to previous so later runs skip this job
        with open(previous_marker_path, "w", encoding="utf-8") as f:
            f.write(job_details)



In [None]:
# LangGraph Node - extract_job_info
"""
  Convert the HTML job description to JSON using OpenAI.
"""
# https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI

from langchain_openai import ChatOpenAI
from openai import ContentFilterFinishReasonError

# System prompt for extract_job_info; the job's HTML is appended directly
# after this text. The field list is the schema the LLM is asked to fill in
# (every field is separated by a comma).
job_parse_instructions = """Parse the HTML job description and return the job details as JSON
  using the following format:
  {
    job_title: string,
    job_description: string,
    job_location: string,
    job_company: string,
    job_application_url: string,
    job_id: string,
    job_hiring_managers: string,
    job_pay_range: string
  }
  """

async def extract_job_info(state):
  """LangGraph node: convert each saved job-details HTML file to JSON with OpenAI.

  Every file in today_folder_path whose name ends with '_job_details.html'
  is sent to the LLM (job_parse_instructions followed by the HTML, as one
  system message) and the reply is written next to it with a '.json'
  extension. Files that already have a JSON counterpart are skipped.

  Args:
    state: Graph state; not read by this node.

  Returns:
    None (the node makes no state update).
  """
  print("extract_job_info started")

  # Init LLM; response_format asks the model for a JSON object reply
  llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)
  json_llm = llm.bind(response_format={"type": "json_object"})

  # process each file in today's folder named '*_job_details.html'
  for file in os.listdir(today_folder_path):
      if not file.endswith('_job_details.html'):
          continue

      print(f"Processing file: {file}")
      json_path = f'{today_folder_path}/{file.replace(".html", ".json")}'

      # if JSON already extracted, skip
      if os.path.exists(json_path):
          print(f"Skipping job: {json_path} because it already exists")
          continue

      # Read the HTML and close the file before the (slow) LLM call
      with open(f'{today_folder_path}/{file}', 'r', encoding='utf-8') as html_file:
          html_content = html_file.read()

      messages = [
        ("system", job_parse_instructions + html_content)
      ]

      # parse the html by passing it to OpenAI and extract the job details into JSON
      try:
          response = await json_llm.ainvoke(messages)
      except ContentFilterFinishReasonError as e:
          # Leave this job without a JSON file and carry on with the others
          print(f"Content filter error: {e}")
          continue

      print(response.content)

      # save the response beside the HTML file, same name with a .json extension
      with open(json_path, 'w', encoding='utf-8') as json_file:
          json_file.write(response.content)

# uncomment to run this node independently
# await extract_job_info(None)

In [None]:
# LangGraph Node - generate_cover_letters
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

"""
  Generate the following files for each job:
    - cover letter (text version)
    - cover letter (PDF version)
    - resume with cover letter (PDF)
"""

import json
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from PyPDF2 import PdfReader, PdfWriter

# Convert PDF to text
def extract_text_from_pdf(pdf_path):
  """Return the text of every page of the PDF at `pdf_path`, concatenated.

  Pages for which extract_text() returns a falsy value contribute an empty
  string. No separator is inserted between pages.
  """
  with pdfplumber.open(pdf_path) as document:
    page_texts = [pdf_page.extract_text() or '' for pdf_page in document.pages]
  return ''.join(page_texts)

def save_cover_letter_to_txt(cover_letter_text, cover_letter_path):
  """Write `cover_letter_text` to `cover_letter_path` as UTF-8 text.

  The encoding is given explicitly, like the other text files this notebook
  writes, so that non-ASCII characters in the generated letter do not depend
  on the platform's default encoding.
  """
  with open(cover_letter_path, 'w', encoding='utf-8') as f:
    f.write(cover_letter_text)

def save_cover_letter_to_pdf(cover_letter_text, cover_letter_path):
  """Render `cover_letter_text` as a letter-size PDF at `cover_letter_path`.

  Each line of the text becomes its own paragraph (in the sample style sheet's
  "Normal" style) followed by a 12-point spacer.

  The text is escaped first because Paragraph interprets its input as
  markup: a literal '<' or '&' in the letter must not be parsed as a tag or
  entity.
  """
  from xml.sax.saxutils import escape

  styles = getSampleStyleSheet()
  normal_style = styles["Normal"]
  story = []
  for paragraph in cover_letter_text.split('\n'):
      story.append(Paragraph(escape(paragraph), normal_style))
      story.append(Spacer(1, 12))  # Add space between paragraphs

  doc = SimpleDocTemplate(cover_letter_path, pagesize=letter)
  doc.build(story)

def save_resume_with_cover_letter(resume_with_cover_letter_path_pdf, cover_letter_path_pdf):
  """Write one PDF containing the cover letter followed by the resume.

  Args:
    resume_with_cover_letter_path_pdf: Destination path of the merged PDF.
    cover_letter_path_pdf: Path of the already generated cover letter PDF.

  The resume is read from the global RESUME_PATH.
  """
  merger = PdfWriter()
  try:
    # Readers are built from paths only: a second positional argument to
    # PdfReader is `strict`, not a file mode, and passing a path avoids
    # leaving an open file handle behind.
    merger.append(PdfReader(cover_letter_path_pdf))

    # Add resume
    print(f"resume path: {RESUME_PATH}")
    merger.append(PdfReader(RESUME_PATH))

    # Write the merged PDF
    merger.write(resume_with_cover_letter_path_pdf)
  finally:
    # Release the writer even if reading or writing failed
    merger.close()

async def generate_cover_letter_by_provider(provider, llm, resume_text, job_details, job_folder_path):
  """Generate one provider's cover letter for a job and save it in three forms.

  Args:
    provider: Provider label used as the file-name prefix (e.g. 'openai').
    llm: Chat model whose ainvoke() is awaited with the prompt messages.
    resume_text: Resume text inserted into the prompt.
    job_details: Job details inserted into the prompt.
    job_folder_path: Folder that receives the generated files.

  Writes "<provider>_cover_letter.txt", "<provider>_cover_letter.pdf" and
  "<provider>_resume_with_cover_letter.pdf". Does nothing if the .txt file
  already exists.
  """
  print(f"generate_cover_letter_by_provider started for provider {provider}")

  # Output locations for this provider
  txt_path = get_cover_letter_path_txt(job_folder_path, provider)
  pdf_path = get_cover_letter_path_pdf(job_folder_path, provider)
  merged_pdf_path = get_resume_with_cover_letter_path_pdf(job_folder_path, provider)

  # The text file is the marker for "already generated"
  if os.path.exists(txt_path):
    print("Skipping job because it already exists")
    return

  # Prompt: fixed instructions plus the job and the resume
  system_message = ("system", COVER_LETTER_INSTRUCTIONS)
  human_message = ("human", f"Write a cover letter for the following job: {job_details} using the following resume: {resume_text}")

  # Ask the provider's model for the letter
  response = await llm.ainvoke([system_message, human_message])
  cover_letter_text = response.content.strip()
  print(cover_letter_text)

  # Plain text, then PDF, then the PDF merged in front of the resume
  save_cover_letter_to_txt(cover_letter_text, txt_path)
  save_cover_letter_to_pdf(cover_letter_text, pdf_path)
  save_resume_with_cover_letter(merged_pdf_path, pdf_path)

async def generate_cover_letters(state):
  """LangGraph node: generate cover letters for every job JSON in today's folder.

  For each '*.json' file in today_folder_path a sub-folder named after the
  job's "job_id" is created (if missing) and a cover letter is generated with
  each of the three providers: OpenAI, Anthropic and Google.

  Args:
    state: Graph state; not read by this node.

  Returns:
    None (the node makes no state update).
  """
  print("generate_cover_letters node started")

  # Init LLMs
  llm_openai = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)
  llm_anthropic = ChatAnthropic(model="claude-3-5-sonnet-latest", anthropic_api_key=ANTHROPIC_API_KEY)
  llm_google = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", google_api_key=GOOGLE_API_KEY)
  providers = (
    ('openai', llm_openai),
    ('anthropic', llm_anthropic),
    ('google', llm_google),
  )

  # The resume is the same for every job: parse the PDF once, on first use
  resume_text = None

  for file in os.listdir(today_folder_path):
    if not file.endswith('.json'):
      continue

    # Load job details, closing the file before any LLM call is made
    with open(f'{today_folder_path}/{file}', 'r', encoding='utf-8') as f:
      print(f'Processing {file}')
      job_details = json.load(f)

    # The ID comes from LLM-produced JSON and may not be a string
    job_id = str(job_details.get("job_id", "id not found"))

    # Calculate job folder path
    job_folder_path = os.path.join(today_folder_path, job_id)
    os.makedirs(job_folder_path, exist_ok=True)

    # Get resume text instead of binary PDF data
    if resume_text is None:
      resume_text = extract_text_from_pdf(RESUME_PATH)

    for provider, llm in providers:
      await generate_cover_letter_by_provider(provider, llm, resume_text, job_details, job_folder_path)


# uncomment to run this node independently
# await generate_cover_letters(None)

In [None]:
# LangGraph Node - report
"""
  Generate report.html file with job application links for
  each job.
"""
async def report(state):
  """LangGraph node: write report.html listing every processed job.

  For each '*.json' file in today_folder_path that has a matching
  "<job_id>" sub-folder, the report gets one list item with the job's ID,
  company, title, pay range, an "Apply Now" link and links to the three
  providers' cover letter PDFs (relative to the report's own folder).

  All values come from scraped pages / LLM output, so they are HTML-escaped
  before being placed in the page.

  Args:
    state: Graph state; not read by this node.

  Returns:
    None (the node makes no state update).
  """
  from html import escape

  report_file_path = os.path.join(today_folder_path, "report.html")
  html_parts = ["""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Job Application Report</title>
    </head>
    <body>
        <h1>Job Application Report</h1>
        <ul>
    """]

  for file in os.listdir(today_folder_path):
    if not file.endswith('.json'):
      continue

    with open(f'{today_folder_path}/{file}', 'r', encoding='utf-8') as f:
      print(f'Processing {file}')
      job_details = json.load(f)

    # The ID comes from LLM-produced JSON and may not be a string
    job_id = str(job_details.get("job_id", "id not found"))

    # If missing job_id folder then skip (no cover letters were generated)
    job_folder_path = os.path.join(today_folder_path, job_id)
    if not os.path.exists(job_folder_path):
      continue

    # Extract job details, escaped for use in HTML text and attributes
    job_title = escape(str(job_details.get("job_title", "Title not found")))
    job_application_url = escape(str(job_details.get("job_application_url", "#")))
    job_company = escape(str(job_details.get("job_company", "job_company not found")))
    job_pay_range = escape(str(job_details.get("job_pay_range", "job_pay_range not found")))
    job_id_html = escape(job_id)

    # Relative links to the cover letters in the job's sub-folder
    openai_cover_letter_path_pdf = f"{job_id_html}/openai_cover_letter.pdf"
    anthropic_cover_letter_path_pdf = f"{job_id_html}/anthropic_cover_letter.pdf"
    google_cover_letter_path_pdf = f"{job_id_html}/google_cover_letter.pdf"

    # Add to HTML content
    html_parts.append(f"""
            <li>
                <h2>{job_id_html} - {job_company} - {job_title}</h2>
                <p><small>{job_pay_range}</small></p>
                <p><a href="{job_application_url}" target="_blank">Apply Now</a></p>
                <p>Cover Letter PDF:
                  <a href="{openai_cover_letter_path_pdf}" target="_blank">OpenAI</a> /
                  <a href="{anthropic_cover_letter_path_pdf}" target="_blank">Anthropic</a> /
                  <a href="{google_cover_letter_path_pdf}" target="_blank">Google</a>
                </p>
            </li>
        """)

  html_parts.append("""
        </ul>
    </body>
    </html>
    """)

  # Write the HTML content to file
  with open(report_file_path, 'w', encoding='utf-8') as report_file:
      report_file.write("".join(html_parts))
  print(f"Report generated at: {report_file_path}")

# uncomment to run this node independently
# await report(None)

In [None]:
# Build LangGraph graph
from langgraph.checkpoint.memory import MemorySaver

memory = MemorySaver()

# Nodes in pipeline order; each is registered under its function's name.
_pipeline = [
    ("login", login),
    ("fetch_jobs", fetch_jobs),
    ("extract_job_info", extract_job_info),
    ("generate_cover_letters", generate_cover_letters),
    ("report", report),
]

# add nodes
for _node_name, _node_fn in _pipeline:
    graph_builder.add_node(_node_name, _node_fn)

# add edges: a single straight chain START -> login -> ... -> report -> END
_route = [START] + [_name for _name, _ in _pipeline] + [END]
for _source, _target in zip(_route, _route[1:]):
    graph_builder.add_edge(_source, _target)

# Compile with the in-memory checkpointer
graph = graph_builder.compile(checkpointer=memory)


In [None]:
# Show mermaid diagram

from IPython.display import Image, display

display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
# Execute Graph

import asyncio
from playwright.async_api import BrowserContext

async def run_graph():
  """Launch a headful Chromium page and run the compiled graph to completion.

  The browser and page are stored in the globals `browser` and `page`, which
  is where the graph's nodes look for the page. Each update streamed from the
  graph is printed.
  """
  global browser, page

  async with async_playwright() as pw:
    # Browser -> context (3840x2160 viewport) -> page
    browser = await pw.chromium.launch(headless=False)
    screen_size = {"width": 3840, "height": 2160}
    context = await browser.new_context(viewport=screen_size)
    page = await context.new_page()

    # Starting state and run configuration
    initial_state = {"jobs": []}
    config = {"configurable": {"thread_id": "1"}, "page": page}

    # Run the graph, printing every update as it arrives
    async for update in graph.astream(initial_state, config, stream_mode="updates"):
        print(update)

    print("All done!")

# Kick everything off
await run_graph()

