# MarkItDown
Exploring whether MarkItDown can convert the resume into text that better fits the input LLMs expect.

In [None]:
from markitdown._markitdown import PdfConverter

# Convert the resume PDF with MarkItDown's PDF converter.
md = PdfConverter()
resume_markdown = md.convert("Tommy Nguyen Resume 2 Pages.pdf", file_extension = ".pdf")

# Strip bullet glyphs from the converted text.
# The longer pattern ("• " followed by a newline) has to be removed first:
# once every "• " is gone, that pattern can no longer match, so the original
# order left the second replace with nothing to do.
markup_text = resume_markdown.text_content.replace("• \n", "").replace("• ", "")

print(markup_text)

In [None]:
from pypdf import PdfReader

# Extract the resume text page by page with pypdf and strip leftover
# Word review comments before joining the pages into one string.
reader = PdfReader("Tommy Nguyen Resume 2 Pages.pdf")

print(len(reader.pages))

# Lines are dropped entirely when their lower-cased, stripped text CONTAINS
# one of these fragments ...
DROP_IF_CONTAINS = (
    "graduated august 2023...",
    "https://www.indeed.com/career -advice/resumes",
    "how did you mentor them?",
    "letters/how -to-list-publications -on-resume",
)
# ... or is EXACTLY one of these.
DROP_IF_EQUALS = (
    "undergraduate courses.",
    "another subsection as personal.",
)

final_resume_text = ""

for page in reader.pages:
    kept_lines = []

    # Removing all "comments" brought over from Word to PDF when converting.
    # They were not removed beforehand, so the text is preprocessed here.
    for line in page.extract_text().split("\n"):
        comment_start = line.lower().find("comment")
        if comment_start >= 0:
            # Keep only the text before the first "comment" (case-insensitive).
            # The previous slice ended at -1, which left the line's final
            # character dangling after the cut.
            line = line[:comment_start]

        normalized = line.lower().strip()
        if normalized in DROP_IF_EQUALS or any(fragment in normalized for fragment in DROP_IF_CONTAINS):
            continue

        kept_lines.append(line)

    # TODO: There's still a ton of whitespaces between some parts, most likely due to page breaks.
    processed_text = "\n".join(kept_lines)

    # Separate pages with a blank line, but do not emit a leading separator
    # while nothing has been accumulated yet.
    if final_resume_text != "":
        final_resume_text += "\n\n" + processed_text
    else:
        final_resume_text += processed_text

print(f"Final extracted text:\n{final_resume_text}")

# Scraping For Job Postings

In [1]:
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd

# Example search inputs: keywords to look for and the desired job location.
keyword = ["Software Engineer", "Amazon"]
location = "Boston, MA"

# Browser-style User-Agent header. NOTE(review): nothing in the code visible
# here passes this dict to a request -- confirm whether it should be sent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/107.0.0.0 Safari/537.36"
    ),
}

# Percent-encoded replacements for " " and "," when building query strings.
SPACE_REPLACER, COMMA_REPLACER = "%20", "%2C"

class LinkedinScraper():
    """
    Scrapes LinkedIn's public job-search pages and collects postings into a DataFrame.

    Typical use is a single call to "get_jobs", which builds the search URLs,
    reads the advertised result count, pages through the search results to
    collect job ids, and then fetches every posting individually.
    """

    # Page size assumed when paging through search results. It is both the
    # divisor for the number of pages and the step of the "start" parameter.
    JOBS_PER_PAGE = 25

    # Seconds to wait on any single HTTP request before giving up, so a stalled
    # connection cannot hang the whole scrape.
    REQUEST_TIMEOUT = 30

    # Endpoint returning the HTML of one posting, formatted with its job id.
    JOB_POSTING_URL = 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}'

    def __init__(self):
        # Link template stored in the "job-link" column, formatted with a job id.
        self.output_url = "https://www.linkedin.com/jobs/search/?currentJobId={}"

        # Tag/class of the header that wraps the result counter on the search page.
        self.results_context_class = "results-context-header"
        self.results_context_tag_type = "div"

        # Tag/class of the counter element inside that header.
        self.job_count_class = "results-context-header__job-count"
        self.job_count_tag_type = "span"

        # Filled in by "set_keyword_and_locations".
        self.keyword_query = ""
        self.location_query = ""
        self.target_url = ""

    def set_keyword_and_locations(self, keyword : list[str], location : str):
        """
        Sets up urls for GET API requests.

        Keywords are joined with ", " and percent-encoded together with the
        location, so characters such as "&", "+" or "#" cannot break the query
        string. Spaces and commas encode to "%20" and "%2C".

        Args:
            keyword: list of strings containing specific keywords to search for in a job.
            location: Desired location for the job.
        """
        from urllib.parse import quote

        self.keyword_query = quote(", ".join(keyword), safe="")
        self.location_query = quote(location, safe="")
        search_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={self.keyword_query}&location={self.location_query}"
        # The trailing "{}" is filled with the result offset of each page.
        self.target_url = search_url + '&geoId=&currentJobId=&start={}'

    def _fetch_soup(self, url : str):
        """Downloads "url" and returns its body parsed with the html.parser backend."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def get_job_counter(self) -> int:
        """
        Retrieves the number of jobs found for the particular search. Used by "get_jobs" to determine number of iterations.

        Returns:
            The counter shown on the search page with "+" and "," removed.
        Raises:
            Exception: if the counter element is missing or its text is not a plain number.
        """
        job_counter_url = f"https://www.linkedin.com/jobs/search?keywords={self.keyword_query}&location={self.location_query}&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

        soup = self._fetch_soup(job_counter_url)

        try:
            # IndexError: no header found; AttributeError: header has no counter.
            context_header = soup.find_all(self.results_context_tag_type, {"class" : self.results_context_class})[0]
            num_jobs_found = context_header.find(self.job_count_tag_type, {"class" : self.job_count_class}).text
        except (IndexError, AttributeError) as err:
            raise Exception("The 'Job Counter' tag was not found! LinkedIn must have changed it...") from err

        # e.g. "1,000+" -> "1000"; surrounding whitespace would fail isdigit().
        num_jobs_found = num_jobs_found.replace("+", "").replace(",", "").strip()

        if not num_jobs_found or not num_jobs_found.isdigit():
            raise Exception(f"Either the returned job counter is invalid or the search is too broad! Job Counter: {num_jobs_found}")

        return int(num_jobs_found)

    def _collect_job_ids(self, num_jobs : int) -> list[str]:
        """Pages through the search results and returns the job ids found, without duplicates."""
        job_ids = []
        for page in range(math.ceil(num_jobs / self.JOBS_PER_PAGE)):
            # "start" is treated as a result offset, so it advances one full
            # page at a time. Passing the bare page index (0, 1, 2, ...) made
            # consecutive requests overlap almost entirely.
            soup = self._fetch_soup(self.target_url.format(page * self.JOBS_PER_PAGE))
            for list_item in soup.find_all("li"):
                try:
                    urn = list_item.find("div", {"class" : "base-card"}).get('data-entity-urn')
                    job_ids.append(urn.split(":")[3])
                except (AttributeError, IndexError):
                    # Not a job card (or an unexpected urn format): skip it
                    # instead of aborting the whole scrape.
                    continue

        # Keep first-seen order while dropping ids returned by more than one page.
        return list(dict.fromkeys(job_ids))

    def _parse_job_posting(self, soup, job_id : str):
        """
        Builds the row for one posting from its parsed HTML.

        Returns None when the company name cannot be found; every other field
        falls back to None or "N/A" when its element is missing.
        """
        job_info = {}

        try:
            job_info["company"] = soup.find("div", {"class" : "top-card-layout__card"}).find("a").find("img").get('alt')
        except AttributeError:
            # Without a company the posting is skipped altogether.
            return None

        try:
            a_tag = soup.find("div", {"class" : "top-card-layout__entity-info"}).find("a")
            job_info["job-title"] = a_tag.text.strip()
        except AttributeError:
            job_info["job-title"] = None

        try:
            job_info["job-description"] = soup.find("div", {"class" : "description__text description__text--rich"}).text.lstrip("\n")
        except AttributeError:
            job_info["job-description"] = "N/A"

        try:
            job_info["pay-range"] = soup.find("div", {"class" : "compensation__salary-range"}).find("div", {"class" : "salary compensation__salary"}).text
        except AttributeError:
            job_info["pay-range"] = "N/A"

        try:
            # The first criteria item is read as the seniority level.
            job_info["level"] = soup.find("ul", {"class" : "description__job-criteria-list"}).find("li").text.replace("Seniority level", "").strip()
        except AttributeError:
            job_info["level"] = None

        job_info["job-link"] = self.output_url.format(job_id)
        return job_info

    # The default list is only read, never mutated, so sharing it across calls is safe.
    def get_jobs(self, keyword : list[str] = ["Software Engineer"], location : str = "Boston") -> pd.DataFrame:
        """
        Retrieves job listings from LinkedIn based on keyword and location search.

        Args:
            keyword: list of strings containing specific keywords to search for in a job.
            location: Desired location for the job.
        Returns:
            DataFrame: A dataframe containing all relevant jobs to search, with the columns
            "company", "job-title", "job-description", "pay-range", "level" and "job-link".
        Raises:
            Exception: propagated from "get_job_counter" when the result counter cannot be read.
        """
        self.set_keyword_and_locations(keyword, location)

        num_jobs = self.get_job_counter()
        job_ids = self._collect_job_ids(num_jobs)

        jobs = []
        for job_id in job_ids:
            soup = self._fetch_soup(self.JOB_POSTING_URL.format(job_id))
            job_info = self._parse_job_posting(soup, job_id)
            if job_info is not None:
                jobs.append(job_info)

        return pd.DataFrame(jobs)

# Run one search and save the resulting table to "jobs.csv".
linkedinObj = LinkedinScraper()
jobs_table = linkedinObj.get_jobs(["Software Engineer", "Apple", "Python"], "Boston, MA")
jobs_table.to_csv("jobs.csv")