# Job Search Project

In [47]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import re
from collections import defaultdict

def extract(baseurl, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        baseurl: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        raises a network error or the server answers with a status code
        other than 200. A message is printed in both failure cases.
    """
    # The request identifies itself with a desktop Chrome User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(baseurl, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the "None means failure" contract instead of crashing callers.
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    return BeautifulSoup(response.text, 'html.parser')

def _text_or_na(tag):
    """Return the text of a parsed tag, or 'NA' when the tag was not found."""
    return 'NA' if tag is None else tag.text


def _fetch_long_description(url):
    """Fetch a job's detail page and return its description text, or 'NA'."""
    details_soup = extract(url)
    if not details_soup:
        return 'NA'

    # NOTE(review): this looks like a build-generated CSS class name and may
    # change whenever the site is redeployed -- confirm it still matches.
    container = details_soup.find('div', {'class': 'xvu5580 _1o93qk60'})
    if not container:
        return 'NA'

    # Only <p> and <li> are collected: a <ul>'s text already contains the text
    # of its <li> children, so including <ul> would duplicate every list item.
    parts = []
    for element in container.find_all(['p', 'li'], recursive=True):
        text = element.get_text(separator='\n', strip=True)
        if text:
            parts.append(text)
    return '\n'.join(parts)


def get_job_summary(soup, i):
    """Collect one record per job card found on a search-results page.

    Args:
        soup: Parsed search-results page, or ``None`` if it could not be
            fetched.
        i: Running job counter; the first job found gets ID ``i + 1``.

    Returns:
        A ``(joblist, i)`` tuple: the list of job dicts (keys ``jobID``,
        ``description``, ``Long_Description``, ``title``, ``salary``,
        ``company``, ``location``, ``class``, ``days_before``, ``link``) and
        the updated counter. Fields that are missing from a card are 'NA'.
    """
    if soup is None:
        return [], i

    joblist = []
    for item in soup.find_all('article', {'data-automation': "normalJob"}):
        i += 1

        title_tag = item.find('a', {'data-automation': 'jobTitle'})
        href = title_tag.get('href') if title_tag is not None else None

        if href:
            link = 'https://www.seek.com.au' + href
            long_description = _fetch_long_description(link)
        else:
            # No detail link on the card: do not fetch (and score) the site
            # home page as if it were this job's description.
            link = 'NA'
            long_description = 'NA'

        joblist.append({
            'jobID': i,
            'description': _text_or_na(
                item.find('span', {'data-automation': 'jobShortDescription'})),
            'Long_Description': long_description,
            'title': _text_or_na(title_tag),
            'salary': _text_or_na(
                item.find('span', {'data-automation': 'jobSalary'})),
            'company': _text_or_na(
                item.find('a', {'data-automation': 'jobCompany'})),
            'location': _text_or_na(
                item.find('a', {'data-automation': 'jobLocation'})),
            'class': _text_or_na(
                item.find('a', {'data-automation': 'jobClassification'})),
            'days_before': _text_or_na(
                item.find('span', {"data-automation": "jobListingDate"})),
            'link': link,
        })
    return joblist, i

def get_next_page_url(soup):
    """Return the absolute URL of the next results page, if there is one.

    Args:
        soup: Parsed search-results page, or ``None`` if it could not be
            fetched.

    Returns:
        The URL taken from the ``<a aria-label="Next">`` link, prefixed with
        the site origin when it does not already start with ``http``.
        ``None`` (after printing a message) when ``soup`` is ``None``, the
        link is absent, or the link has no ``href``.
    """
    # A failed fetch yields soup=None; treat it as "no further pages" rather
    # than raising AttributeError on soup.find.
    next_button = None
    if soup is not None:
        next_button = soup.find('a', {'aria-label': 'Next'})

    href = next_button.get('href') if next_button else None
    if not href:
        print("No next page found.")
        return None

    if not href.startswith('http'):
        href = 'https://www.seek.com.au' + href
    return href

def preprocess(text):
    """Lower-case *text* and collapse each run of non-word characters to one space."""
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

def match_keywords(description, keywords):
    """Count, per category, how many keywords occur in a job description.

    Both the description and every keyword are normalised the same way
    (lower-cased, runs of non-word characters collapsed to a single space), so
    keywords such as 'ETL' or '1-2 years' can match text that has already been
    preprocessed. Keywords match only as whole words or phrases, so 'ai' does
    not match inside 'maintain' and 'java' does not match 'javascript'.

    Args:
        description: Job description text (raw or already preprocessed).
        keywords: Mapping of category name to an iterable of keyword strings.

    Returns:
        A ``defaultdict(int)`` mapping each category with at least one hit to
        the number of its keywords found; other categories read as 0.
    """
    score = defaultdict(int)
    haystack = re.sub(r'\W+', ' ', description.lower())
    for category, words in keywords.items():
        for word in words:
            needle = re.sub(r'\W+', ' ', word.lower()).strip()
            if not needle:
                continue
            # Lookarounds enforce word boundaries on both sides of the phrase.
            pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
            if re.search(pattern, haystack):
                score[category] += 1
    return score

# Define your keywords.
# Descriptions are lower-cased and have every run of non-word characters
# replaced by a single space before matching, so keywords are written in that
# same form (e.g. 'etl', not 'ETL'; '1 2 years', not '1-2 years'); otherwise
# they could never match.
keywords = {
    'must_have': ['data science', 'python', 'sql', 'bachelor', 'data engineering'],
    'nice_to_have': ['student', 'tableau', 'java', 'ai', 'above 80', 'distinction', 'etl'],
    'time_relevant': ['2025', 'february', 'january'],
    'experience_relevant': ['1 2 years', 'no experience', '2 years', '1 year'],
}

# Main function to run the scraping and processing
def main(
    baseurl="https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25",
    output_path='ranked_report.csv',
):
    """Scrape every results page, score each job and write a ranked CSV.

    Args:
        baseurl: URL of the first search-results page to crawl.
        output_path: Path of the CSV report to write.

    The crawl follows "Next" links until none is found or a page fails to
    load. Jobs are ranked by their keyword scores, compared in the order
    must_have, nice_to_have, time_relevant, experience_relevant (highest
    first). The ``scores`` and ``Long_Description`` columns are left out of
    the report. Nothing is written when no jobs were collected.
    """
    joblist = []
    i = 0

    while baseurl:
        print(f'Fetching data from: {baseurl}')
        soup = extract(baseurl)
        if soup is None:
            # The page could not be loaded, so there is no "Next" link to
            # follow; stop crawling and rank whatever was collected so far.
            break
        jobs, i = get_job_summary(soup, i)
        joblist.extend(jobs)
        baseurl = get_next_page_url(soup)
        if baseurl:
            # Pause between result pages only; no need to wait after the last.
            sleep(1)

    if not joblist:
        # An empty DataFrame has no 'Long_Description' column, so the
        # processing below would raise KeyError.
        print("No jobs found; nothing to rank.")
        return

    # Convert to DataFrame
    df = pd.DataFrame(joblist)

    # Preprocess the 'Long_Description' and pair with 'jobID'
    df['Long_Description'] = df['Long_Description'].apply(preprocess)

    # Evaluate job descriptions and add scores to the DataFrame
    df['scores'] = df['Long_Description'].apply(lambda desc: match_keywords(desc, keywords))

    # Rank the jobs based on the scores; tuples compare element by element,
    # so earlier categories take priority over later ones.
    df['rank'] = df['scores'].apply(lambda x: (x['must_have'], x['nice_to_have'], x['time_relevant'], x['experience_relevant']))
    df_ranked = df.sort_values(by='rank', ascending=False)
    df_result = df_ranked.drop(['scores', 'Long_Description'], axis=1)

    # Save the ranked DataFrame to CSV
    df_result.to_csv(output_path, index=False)

if __name__ == "__main__":
    main()


Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25
Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25&page=2
Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25&page=3
Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25&page=4
Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25&page=5
Fetching data from: https://www.seek.com.au/Junior-data-analyst-jobs/in-Newington-NSW-2127?daterange=7&distance=25&page=6
No next page found.
