In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, WebDriverException
import time
import pandas as pd
import random

# LinkedIn credentials.
# credentials.txt must contain your own username on its first line and your
# password on its second line; surrounding whitespace is stripped from both.
with open("credentials.txt", "r") as credentials_file:
    linkedin_username, linkedin_password = (
        credentials_file.readline().strip() for _ in range(2)
    )

def setup_driver():
    """Build and return the Chrome WebDriver used for the scraping session.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    Chrome is started with a fixed set of command-line flags plus two
    experimental options related to the automation switch/extension.

    Returns:
        A ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()

    # Command-line switches passed to Chrome, applied in this order.
    launch_flags = (
        "--start-maximized",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--remote-debugging-port=9222",
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)

    # Exclude the "enable-automation" switch and turn off the automation extension.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

def random_sleep(min_time=1, max_time=3):
    """Pause for a random duration drawn uniformly between the two bounds.

    Args:
        min_time: Lower bound of the pause, in seconds.
        max_time: Upper bound of the pause, in seconds.
    """
    delay_seconds = random.uniform(min_time, max_time)
    time.sleep(delay_seconds)

def wait_and_click(driver, by, value, timeout=10):
    """Wait for an element to become clickable, pause briefly, then click it.

    Args:
        driver: The WebDriver to search with.
        by: Locator strategy (a ``By`` constant).
        value: Locator value matching ``by``.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        True if the element was clicked; False if it never became clickable,
        went stale, or the click itself was rejected by the browser.
    """
    # Imported here because these two exceptions are not in the module-level
    # import list; they are what click() raises when the element is covered by
    # an overlay or cannot be interacted with.
    from selenium.common.exceptions import (
        ElementClickInterceptedException,
        ElementNotInteractableException,
    )

    try:
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((by, value))
        )
        random_sleep()
        element.click()
    except (
        TimeoutException,
        StaleElementReferenceException,
        ElementClickInterceptedException,
        ElementNotInteractableException,
    ) as e:
        # Honour the boolean contract: a failed click is reported, not raised.
        print(f"Error clicking element {value}: {e}")
        return False
    return True

def safe_find_element(driver, by, value, timeout=10):
    """Wait for an element to be present in the DOM and return it.

    Args:
        driver: The WebDriver to search with.
        by: Locator strategy (a ``By`` constant).
        value: Locator value matching ``by``.
        timeout: Maximum number of seconds to wait.

    Returns:
        The located element, or None (after printing a message) when the wait
        times out or the element cannot be found.
    """
    locator = (by, value)
    try:
        waiter = WebDriverWait(driver, timeout)
        found = waiter.until(EC.presence_of_element_located(locator))
    except (TimeoutException, NoSuchElementException) as e:
        print(f"Element not found {value}: {e}")
        return None
    return found

def scroll_job_list(driver, pause_time=2, step=500, max_scrolls=100):
    """Scroll the job results list down until it stops moving.

    The list is scrolled ``step`` pixels at a time. After each scroll the
    function waits ``pause_time`` seconds (so lazily loaded cards can render)
    and then reads the list's scroll position and scroll height. Scrolling
    stops once neither value changes, i.e. the bottom has been reached and no
    new content was added.

    Args:
        driver: WebDriver showing a job search results page.
        pause_time: Seconds to wait after every scroll step.
        step: Number of pixels to scroll per step.
        max_scrolls: Upper bound on scroll steps, so a list that keeps growing
            cannot keep this loop running forever.

    Returns:
        None. If the list container is not present a message is printed and
        nothing is scrolled.
    """
    selector = '.jobs-search-results-list'
    scroll_js = (
        "var el = document.querySelector(arguments[0]);"
        "if (el) { el.scrollBy(0, arguments[1]); }"
    )
    measure_js = (
        "var el = document.querySelector(arguments[0]);"
        "return el ? [el.scrollTop, el.scrollHeight] : null;"
    )

    previous = driver.execute_script(measure_js, selector)
    if previous is None:
        print(f"Job list container '{selector}' not found; skipping scroll.")
        return

    for _ in range(max_scrolls):
        driver.execute_script(scroll_js, selector, step)
        time.sleep(pause_time)

        current = driver.execute_script(measure_js, selector)
        # Comparing the height alone would stop after the first step whenever
        # that step did not trigger a lazy load; the position must stall too.
        if current is None or current == previous:
            break
        previous = current



def scrape_jobs_on_page(driver, job_data, job_links):
    """Open each job card on the current results page and record its details.

    For every card whose link has not been seen before, the card is clicked,
    the details pane is read, and a record is appended to ``job_data``. Fields
    that cannot be located are stored as "N/A".

    Args:
        driver: WebDriver showing a job search results page.
        job_data: List that receives one dict per job with the keys
            "Job Title", "Company", "Location" and "Job Description".
            Mutated in place.
        job_links: Set of job links already processed; used to skip
            duplicates and updated in place.

    Returns:
        None. Results are delivered through ``job_data`` and ``job_links``.
    """
    job_cards = driver.find_elements(By.CSS_SELECTOR, '.job-card-container')
    total_jobs_on_page = len(job_cards)

    for index, job_card in enumerate(job_cards):
        # Every interaction with the card is inside the try so that a single
        # stale, link-less or unclickable card is reported and skipped instead
        # of aborting the scrape of the whole page.
        try:
            job_link_element = job_card.find_element(By.CSS_SELECTOR, 'a.job-card-container__link')
            job_link = job_link_element.get_attribute('href')

            if job_link in job_links:
                continue  # Skip already processed job links

            job_card.click()
            random_sleep(1, 2)

            title_element = safe_find_element(driver, By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__job-title')
            company_element = safe_find_element(driver, By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__company-name')
            location_element = safe_find_element(driver, By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__primary-description-container')
            description_element = safe_find_element(driver, By.CLASS_NAME, 'jobs-description__content')

            job_title = title_element.text.strip() if title_element else "N/A"
            company = company_element.text.strip() if company_element else "N/A"
            location = location_element.text.strip() if location_element else "N/A"
            job_description = description_element.text.strip() if description_element else "N/A"

            job_links.add(job_link)
            job_data.append({
                "Job Title": job_title,
                "Company": company,
                "Location": location,
                "Job Description": job_description
            })

            print(f"Extracted: {job_title} at {company} in {location}")

            # Scroll to make the next job card visible
            if index < total_jobs_on_page - 1:
                driver.execute_script("arguments[0].scrollIntoView(true);", job_cards[index + 1])
                random_sleep(0.5, 1.5)

        except Exception as e:
            # Best-effort per card: report the problem and move on to the next one.
            print(f"Error extracting job info: {e}")


def scrape_all_pages(driver, max_jobs=100):
    """Scrape job details page by page until enough jobs have been collected.

    Each iteration scrolls the results list, scrapes the cards on the current
    page, and then clicks the numbered pagination button for the next page.

    Args:
        driver: WebDriver showing the first page of job search results.
        max_jobs: Maximum number of job records to return.

    Returns:
        A list of at most ``max_jobs`` job dicts. Whatever was collected before
        a navigation failure or the last page is still returned.
    """
    job_data = []
    job_links = set()
    current_page = 1

    while len(job_data) < max_jobs:
        print(f"Scraping page {current_page}...")

        # Step 1: Scroll through the entire job list to load all jobs on the page
        scroll_job_list(driver)

        # Step 2: Scrape all job details on the current page
        scrape_jobs_on_page(driver, job_data, job_links)

        # Enough jobs already: do not load another page for nothing.
        if len(job_data) >= max_jobs:
            break

        # Step 3: Navigate to the next page using page numbers
        next_page = current_page + 1
        try:
            next_page_button = safe_find_element(driver, By.XPATH, f"//button[@aria-label='Page {next_page}']")
            if not next_page_button:
                print(f"Reached the last page or no more jobs to scrape at page {current_page}.")
                break
            next_page_button.click()
            random_sleep(5, 7)
        except Exception as e:
            # Deliberately broad: stop paginating but keep the data gathered so far.
            print(f"Error navigating to page {next_page}: {e}")
            break
        current_page = next_page

    # A whole page is scraped at a time, so the last page can overshoot the limit.
    return job_data[:max_jobs]

def navigate_to_jobs_page(driver):
    """Click the "Jobs" link and wait for the page to load.

    Args:
        driver: WebDriver on a page that shows a link with the text "Jobs".

    Returns:
        True when the link was found and clicked; False when it was missing or
        an error occurred (the error is printed).
    """
    try:
        jobs_link = safe_find_element(driver, By.LINK_TEXT, "Jobs")
        if not jobs_link:
            return False
        jobs_link.click()
        random_sleep(5, 7)
    except Exception as e:
        print(f"Error navigating to Jobs page: {e}")
        return False
    return True



def main():
    """Run the whole scrape: log in, search for the requested role, save a CSV.

    Reads the module-level names ``linkedin_username``, ``linkedin_password``
    and ``user_job_title``; they must be defined before this is called.

    Errors are caught and printed rather than raised, and the browser is
    always closed on the way out.
    """
    # One definition of the output path, so the confirmation message always
    # names the file that was actually written.
    output_path = "linkedin_jobs_detailed v1.csv"
    driver = None
    try:
        driver = setup_driver()

        # Step 1: Login to LinkedIn
        driver.get("https://www.linkedin.com/login")
        random_sleep(3, 5)

        username = safe_find_element(driver, By.ID, "username")
        if username:
            username.send_keys(linkedin_username)
            random_sleep()

        password = safe_find_element(driver, By.ID, "password")
        if password:
            password.send_keys(linkedin_password)
            random_sleep()
            password.send_keys(Keys.RETURN)

        random_sleep(5, 7)

        # Step 2: Navigate to the Jobs page
        if not navigate_to_jobs_page(driver):
            raise Exception("Failed to navigate to Jobs page")

        # Step 3: Search for jobs (try the aria-label first, then the placeholder)
        search_input = safe_find_element(driver, By.CSS_SELECTOR, "input[aria-label='Search by title, skill, or company']")
        if not search_input:
            search_input = safe_find_element(driver, By.CSS_SELECTOR, "input[placeholder='Search jobs']")
        if search_input:
            search_input.send_keys(user_job_title)
            random_sleep()
            search_input.send_keys(Keys.RETURN)
            random_sleep(5, 7)
        else:
            print("Could not find search input")
            raise Exception("Search failed")

        # Step 4: Extract job information across all pages
        job_data = scrape_all_pages(driver)

        # Step 5: Save to a CSV file
        if job_data:
            df = pd.DataFrame(job_data)
            df.to_csv(output_path, index=False)
            print(f"Job details have been saved to '{output_path}'.")
        else:
            print("No job data was collected.")

    except WebDriverException as e:
        print(f"WebDriver error occurred: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if driver:
            driver.quit()

if __name__ == "__main__":
    # main() reads the search term from the module-level name user_job_title,
    # so it has to be assigned before main() is called.
    user_job_title = input("Enter your the Role you want: ")  # e.g. Data Scientist
    main()
    
    


Scraping page 1...
Extracted: Lead Data Scientist-Retail at Prometheus Consulting Services. in Bengaluru, Karnataka, India · 3 days ago · Over 100 applicants
Extracted: Machine Learning Specialist at Cloud4C Services in Hyderabad, Telangana, India · 1 week ago · Over 100 applicants
Extracted: Generative AI Engineer at Deloitte in Bengaluru, Karnataka, India · Reposted 8 hours ago · Over 100 applicants
Extracted: Data Scientist at LTIMindtree in Bengaluru, Karnataka, India · 1 minute ago · 0 applicants
Extracted: Senior Data Scientist at S&P Global in Hyderabad, Telangana, India · Reposted 6 days ago · Over 100 applicants
Extracted: Data Scientist -10+Yrs-Immediate joiner to 30days at HCLTech in Bengaluru, Karnataka, India · 2 weeks ago · Over 100 applicants
Extracted: Senior Data Scientist at S&P Global in Bengaluru, Karnataka, India · Reposted 1 week ago · 79 applicants
Extracted: Lead Data Scientist-R-228147 at Mastercard in Pune, Maharashtra, India · 1 week ago · 56 applicants
Extra

In [6]:
# Load the scraped jobs. The scraped "Location" text is a '·'-separated details
# string (place · posting age · applicant count), so it is kept under "Detials"
# and a clean Location column is derived from its first field.
df = pd.read_csv('linkedin_jobs_detailed v1.csv')
df = df.rename(columns={"Location": "Detials"})
df['Deatails_copy'] = df['Detials']
# Strip the padding left next to the separator; otherwise the same place with
# and without a trailing space would be counted as two different locations.
df['Location'] = df['Deatails_copy'].str.split('·').str[0].str.strip()
df.head(5)

Unnamed: 0,Job Title,Company,Detials,Job Description,Deatails_copy,Location
0,Lead Data Scientist-Retail,Prometheus Consulting Services.,"Bengaluru, Karnataka, India · 3 days ago · Ove...","About the job\nHi Folks,\nWe are hiring for an...","Bengaluru, Karnataka, India · 3 days ago · Ove...","Bengaluru, Karnataka, India"
1,Machine Learning Specialist,Cloud4C Services,"Hyderabad, Telangana, India · 1 week ago · Ove...",About the job\nDevOPs / MLOPs engineer\n Key R...,"Hyderabad, Telangana, India · 1 week ago · Ove...","Hyderabad, Telangana, India"
2,Generative AI Engineer,Deloitte,"Bengaluru, Karnataka, India · Reposted 8 hours...","About the job\nHi, We are looking for Candidat...","Bengaluru, Karnataka, India · Reposted 8 hours...","Bengaluru, Karnataka, India"
3,Data Scientist,LTIMindtree,"Bengaluru, Karnataka, India · 1 minute ago · 0...",About the job\nWe are looking for a Data Scien...,"Bengaluru, Karnataka, India · 1 minute ago · 0...","Bengaluru, Karnataka, India"
4,Senior Data Scientist,S&P Global,"Hyderabad, Telangana, India · Reposted 6 days ...",About the job\nAbout The Role\n\nGrade Level (...,"Hyderabad, Telangana, India · Reposted 6 days ...","Hyderabad, Telangana, India"


In [7]:
# Which locations have the most job postings? Shows each location's share of
# all rows (normalize=True), largest share first.
df['Location'].value_counts(normalize=True).sort_values(ascending=False)

Location
Bengaluru, Karnataka, India           0.44
Hyderabad, Telangana, India           0.15
Pune, Maharashtra, India              0.09
Bengaluru East, Karnataka, India      0.06
Mumbai, Maharashtra, India            0.04
Mumbai Metropolitan Region            0.03
Gurugram, Haryana, India              0.02
Ahmedabad, Gujarat, India             0.02
Gurgaon, Haryana, India               0.02
Meghalaya, India                      0.01
Noida, Uttar Pradesh, India           0.01
Arunachal Pradesh, India              0.01
Gujarat, India                        0.01
Madhya Pradesh, India                 0.01
Telangana, India                      0.01
Mizoram, India                        0.01
Bangalore Urban, Karnataka, India     0.01
Lakshadweep, India                    0.01
Bihar, India                          0.01
West Bengal, India                    0.01
India                                 0.01
Kanayannur, Kerala, India             0.01
Name: proportion, dtype: float64

In [8]:
# Which job titles were captured? Shows each title's share of all rows
# (normalize=True), largest share first.
df['Job Title'].value_counts(normalize=True).sort_values(ascending=False)

Job Title
Senior Data Scientist                                                              0.14
Lead Data Scientist                                                                0.11
Data Scientist                                                                     0.07
Data and Applied Scientist II                                                      0.03
Engagement Manager-Analytics                                                       0.03
Principal Specialist, Data Analyst                                                 0.02
Principal Analyst                                                                  0.02
Senior Manager - Data Science                                                      0.02
Gen AI - Manager                                                                   0.02
Manager II, Generative AI                                                          0.02
Lead Data Scientist-R-228147                                                       0.02
Gen AI - Senior Associ

In [1]:
import logging
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama

# Load and preprocess data.
# Rows without a description come back from read_csv as NaN (the scraper writes
# the placeholder "N/A", which pandas treats as missing by default). Joining a
# float NaN into text raises TypeError, so those rows are dropped here.
df = pd.read_csv('linkedin_jobs_detailed v1.csv')
job_descriptions = df['Job Description'].dropna().astype(str).tolist()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optimize chunk size and overlap
# NOTE(review): chunk_size is measured in characters while the model below is
# created with num_ctx=4096 tokens, so an 80000-character chunk most likely does
# not fit in the context window and part of it may be ignored - confirm and
# retune one of the two values.
chunk_size = 80000  # Adjust based on your needs
chunk_overlap = 100  # Adjust overlap as needed
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

chunks = text_splitter.split_text('\n'.join(job_descriptions))

# Define a prompt template for extracting skillsets
extraction_prompt = PromptTemplate(
    input_variables=["chunk"],
    template="""Analyze the following text chunk and extract the key skillsets required: {chunk}

    Below points should be focused on when extracting key skillsets:
        - Focus on technical skillsets more
        - Extract relevant programming language or libraries or technology stack that is necessary
        - Don't repeat skillsets and be brief
    """
)

# Define the LLM
llm = Ollama(model="llama31_storm",num_ctx=4096)

# Process each chunk with the LLM and store responses
responses = []
logger.info("Starting extraction from %d chunks.", len(chunks))
for i, chunk in enumerate(chunks):
    logger.info("Processing chunk %d/%d.", i + 1, len(chunks))
    prompt = extraction_prompt.format(chunk=chunk)
    response = llm.invoke(input=prompt)  # Using `input` argument instead of `prompt`
    responses.append(response)
logger.info("Completed extraction from all chunks.")

# Combine responses into a single string for the ranking LLM
responses_str = "\n".join(responses)

# Define a prompt template for ranking and commenting
ranking_prompt = PromptTemplate(
    input_variables=["responses"],
    template=(
       """Given the following list of skillset extractions, rank the top most important skillsets
        and provide a brief commentary on why these are critical for landing a job in this field:
        {responses}

        Below points should be adhered to when data is presented:
        - Focus on technical skillsets 
        - If any non-technical skillset is repeated make it brief
        - Maximum number of important skillsets can be 10, it can be lower as well.
        - Focus on latest trend skillsets if present
        """
    )
)

# Second pass: the same model ranks the per-chunk extractions
logger.info("Starting ranking and commenting with the second LLM.")
ranking_input = ranking_prompt.format(responses=responses_str)
final_output = llm.invoke(input=ranking_input)  # Using `input` argument instead of `prompt`
logger.info("Completed ranking and commenting with the second LLM.")

# Print the final output
print(final_output)


INFO:__main__:Starting extraction from 6 chunks.
INFO:__main__:Processing chunk 1/6.
INFO:__main__:Processing chunk 2/6.
INFO:__main__:Processing chunk 3/6.
INFO:__main__:Processing chunk 4/6.
INFO:__main__:Processing chunk 5/6.
INFO:__main__:Processing chunk 6/6.
INFO:__main__:Completed extraction from all chunks.
INFO:__main__:Starting ranking and commenting with the second LLM.
INFO:__main__:Completed ranking and commenting with the second LLM.


Based on the provided job descriptions and extracted key skill sets, here are the top 10 most important technical skillsets for a career in data science and AI:

1. **Python Libraries**: Proficiency in Pandas, NumPy, SciPy for data analysis and manipulation.
2. **Machine Learning**: Knowledge of building and optimizing machine learning models to predict industry trends and supply chain dynamics.
3. **Data Engineering**: Experience with developing and deploying predictive models to solve complex business problems.
4. **Generative AI (GenAI)**: Leveraging GenAI to enhance data analysis and generate insightful reports.
5. **Deep Learning Frameworks**: Experience with deep learning frameworks like PyTorch or TensorFlow.
6. **Large Language Models (LLMs)**: Demonstrated experience with LLMs such as GPT, BERT, T5, etc.
7. **Cloud Platforms**: Familiarity with cloud platforms like AWS, Azure, Google Cloud Platform.
8. **Distributed Training Tools**: Experience with distributed training tools 