# GradConnection

In [52]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json

def _remove_think_tags(text):
    """Return *text* with every ``<think>...</think>`` span removed and whitespace trimmed."""
    return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()


def _cover_letter_path(directory, company_name, title):
    """Return ``directory/<company_name>_<title>.txt`` with unsafe filename characters dropped."""
    import os

    # Sanitise only the stem: filtering after appending ".txt" would also
    # strip the dot and leave the file without an extension.
    stem = f"{company_name}_{title}"
    stem = "".join(ch for ch in stem if ch.isalnum() or ch in (' ', '_'))
    return os.path.join(directory, f"{stem}.txt")


def _generate_cover_letter(company_name, title, coverletter_context):
    """Request a cover letter from the local Ollama API and return the cleaned text."""
    api = 'http://localhost:11434/api/generate'
    prompt = f'My name is Richard Shao, I am a {coverletter_context["year_level"]} student majoring in {coverletter_context["major"]} with a minor in {coverletter_context["minor"]}.I have experience in {coverletter_context["ability"]}. One of the relevant event to the company program is {coverletter_context["event"]}. The company I am applying for is {company_name} and the program is {title}. Please write a cover letter for the company {company_name} and the program {title}. Make sure that the cover letter is relevant to the context, and fits the company culture. Please keep the cover letter to one page with is around 500 words. There is no need to mention anything else other than your name on the top of a cover letter. Please double check that the company name is correct. Start the response with: Dear Hiring Manager,and end with Richard Shao'

    data = {
        'model': 'deepseek-r1:14b',
        'prompt': prompt,
        'stream': False
    }

    # No timeout here on purpose: a non-streamed generation only answers once
    # the whole letter has been produced.
    api_response = requests.post(
        api,
        headers={'Content-Type': 'application/json'},
        data=json.dumps(data),
    )
    return _remove_think_tags(api_response.json()["response"])


def grad_connection_scrape(job_level,discipline,coverletter_context):
    """Scrape GradConnection listings and write one generated cover letter per job.

    Pages of ``https://au.gradconnection.com/{job_level}/{discipline}/australia/``
    are fetched until a request does not return status 200 or a page has no
    ``div.box-name`` listings. For every listing whose link matches
    ``employers/<company>/jobs`` (except the excluded posters) a cover letter
    is generated, saved under ``coverletters_grad/`` and the job is recorded.
    The recorded jobs are finally written to
    ``{discipline}_{job_level}_jobs.xlsx``.

    Args:
        job_level: URL path segment for the job level, e.g. ``'graduate-jobs'``.
        discipline: URL path segment for the discipline,
            e.g. ``'data-science-and-analytics'``.
        coverletter_context: Mapping providing the ``"year_level"``,
            ``"major"``, ``"minor"``, ``"ability"`` and ``"event"`` values
            that are interpolated into the prompt.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    import os
    from urllib.parse import urljoin

    # Headers for the site requests only. They are kept under their own name
    # so the JSON headers used for the Ollama call can never replace them on
    # later page fetches.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    base_url = f'https://au.gradconnection.com/{job_level}/{discipline}/australia/'
    # The scheme is required for the stored links to be usable URLs.
    site_root = 'https://au.gradconnection.com'
    # Posters to skip (the original note suggests these are paid internship
    # programs rather than employers).
    excluded_companies = {"readygrad", "gradconnection"}
    directory = "coverletters_grad"
    page_num = 1
    jobs_list = []

    while True:
        # The first page has no query string; later pages use ?page=N
        url = base_url if page_num == 1 else f"{base_url}?page={page_num}"

        page = requests.get(url, headers=browser_headers, timeout=30)

        # Stop paging if the request failed
        if page.status_code != 200:
            print(f"Failed to retrieve page {page_num}. Status code: {page.status_code}")
            break

        soup = BeautifulSoup(page.content, 'html.parser')
        job_listings = soup.find_all('div', class_='box-name')

        # Stop paging once a page has no listings
        if not job_listings:
            print("No more job listings found.")
            break

        print(f"Scraping page {page_num}...")

        for job in job_listings:
            heading = job.find('h3')
            anchor = job.find('a')
            # Skip malformed listings instead of failing on a missing tag
            if heading is None or anchor is None or not anchor.get('href'):
                continue

            title = heading.text
            url_extra = anchor['href']

            # The company slug is taken from the link path
            company_match = re.search(r'employers/([^/]+)/jobs', url_extra)
            if not company_match:
                continue
            company_name = company_match.group(1)
            if company_name in excluded_companies:
                continue

            print("company_name: ", company_name)
            print("title: ", title)

            result = _generate_cover_letter(company_name, title, coverletter_context)

            # Save the letter to a text file named after the company and title
            os.makedirs(directory, exist_ok=True)
            file_path = _cover_letter_path(directory, company_name, title)
            # Explicit UTF-8: titles and generated text may contain characters
            # that the platform default encoding cannot represent.
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result)

            jobs_list.append({
                'Program Title': title,
                'Company': company_name,
                'Link': urljoin(site_root, url_extra)
            })

        # Move to the next page, pausing one second between page requests
        page_num += 1
        time.sleep(1)

    # Export every recorded job to an Excel workbook
    df_jobs = pd.DataFrame(jobs_list)
    df_jobs.to_excel(f'{discipline}_{job_level}_jobs.xlsx', index=False)

    print("\nScraping completed. Found {} relevant jobs.".format(len(df_jobs)))
    return


In [53]:
# Job level to search; choose from: graduate-jobs, internships, entry-level-jobs
job_level = 'graduate-jobs'

# Discipline to search, e.g. "data-science-and-analytics" or "computer-science"
discipline = "data-science-and-analytics"

# Free-text pieces that are interpolated into the cover letter prompt
ability = (
    "different programing languages including R,python,java,C,SQL. "
    "Differnet packages include numpy, pandas, matplotlib, ggplot2,"
    "pytorch,scikit-learn,spark"
)
event = (
    "help processing data from various news sources, "
    "and help to build a model to extract emotion and company name "
    "from the news as well as putting a label to the news "
)

# Cover letter context; company_name and job_program are left empty here
coverletter_context = dict(
    year_level="first year master",
    major="Probability and Statisitcs",
    minor="Data Science",
    company_name="",
    job_program="",
    ability=ability,
    event=event,
)

# Run the GradConnection scrape with the settings chosen above
grad_connection_scrape(
    job_level=job_level,
    discipline=discipline,
    coverletter_context=coverletter_context,
)


Scraping page 1...
company_name:  the-alternative
title:  The Alternative
company_name:  bcg
title:  Boston Consulting Group | Graduate Associate – Consulting careers (Maths & Data)
company_name:  anao
title:  Data Analyst and IT Auditor Graduate Stream
company_name:  fdm-group
title:  Technical Analyst Graduate Program
company_name:  fdm-group
title:  Expression of interest Gold Coast: Graduate Software Engineering Program – Major Australian bank
company_name:  fdm-group
title:  Expression of Interest: Project Grace Graduate Program (Gold Coast)
company_name:  fdm-group
title:  Data & Analytics Graduate Program
company_name:  fdm-group
title:  Software Engineering Graduate Program
company_name:  optiver
title:  FutureFocus – Technology 2025
company_name:  optiver
title:  FutureFocus – Trading & Research 2025
company_name:  fdm-group
title:  Tech Unboxed Series: Skills, Careers & AI Hacks
company_name:  amazon
title:  Amazon WoW presents, MARCH FORWARD : Accelerating Equality Together!

# Seek

https://www.seek.com.au/internship-machine-learning-jobs/in-All-Australia?classification=1223%2C6281

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# json and os are used below but are not part of this cell's import list
import json
import os

# Seek search keyword, built from the job level chosen earlier
# (graduate-jobs, internships, entry-level-jobs)
seek_keyword = f'{job_level}-machine-learning'

# Headers that mimic a browser visit; used only for the Seek page requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Local Ollama endpoint and its own headers. These are kept under a separate
# name so they never replace the browser headers used for page 2 onwards.
api = 'http://localhost:11434/api/generate'
api_headers = {'Content-Type': 'application/json'}

# Where the generated cover letters are written
directory = "coverletters_seek"
allowed_fn_chars = [' ','_']

# Collected job records
jobs_list = []


# choose your discipline (see seek website for details)
discipline = "1223%2C6281" # All ICT and Science&Technology
word_discipline = "science_and_technology"
base_url = f'https://www.seek.com.au/{seek_keyword}-jobs/in-All-Australia?classification={discipline}'
local_url = 'https://www.seek.com.au'
page_num = 1


def remove_think_tags(response):
    """Return *response* with every ``<think>...</think>`` span removed and whitespace trimmed."""
    cleaned_response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    return cleaned_response.strip()


while True:
    # The base URL already has a query string, so later pages append &page=N
    if page_num == 1:
        url = f"{base_url}"
    else:
        url = f"{base_url}&page={page_num}"

    # Send a GET request
    print(f"Scraping page {page_num}...")
    response = requests.get(url, headers=headers, timeout=30)

    # Break the loop if the request failed
    if response.status_code != 200:
        print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
        break

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all job listings
    job_listings = soup.find_all('article')
    # Break the loop if no jobs found
    if not job_listings:
        print("No more job listings found.")
        break

    print(f"Analysing page {page_num}...")

    # Extract details for each job
    for job in job_listings:

        title = job.get('aria-label')
        link = job.find('a')
        # Skip articles that lack a title or a link instead of failing on them
        if title is None or link is None or not link.get('href'):
            continue
        url_extra = link['href']

        url_new = local_url+url_extra

        company = job.find('a', {'data-type': 'company'})
        company_name = company.text if company else None

        # Listings without a company name are ignored
        if not company_name:
            continue

        # Skip these posters (the original note suggests they are paid
        # internship programs)
        if company_name == "readygrad" or company_name == "gradconnection":
            continue

        prompt = f'My name is Richard Shao, I am a {coverletter_context["year_level"]} student majoring in {coverletter_context["major"]} with a minor in {coverletter_context["minor"]}.I have experience in {coverletter_context["ability"]}. One of the relevant event to the company program is {coverletter_context["event"]}. The company I am applying for is {company_name} and the program is {title}. Please write a cover letter for the company {company_name} and the program {title}. Make sure that the cover letter is relevant to the context, and fits the company culture. Please keep the cover letter to one page with is around 500 words. There is no need to mention anything else other than your name on the top of a cover letter. Please double check that the company name is correct. Start the response with: Dear Hiring Manager,and end with Richard Shao'

        data = {
            'model': 'deepseek-r1:14b',
            'prompt': prompt,
            'stream': False
        }

        # No timeout on the generation call: a non-streamed request only
        # answers once the whole letter has been produced.
        llm_response = requests.post(api, headers=api_headers, data=json.dumps(data))
        result = remove_think_tags(llm_response.json()["response"])

        # Sanitise the stem first and add the extension afterwards; filtering
        # the full name would also strip the dot of ".txt".
        stem = f"{company_name}_{title}"
        stem = "".join(x for x in stem if (x.isalnum() or x in allowed_fn_chars))
        filename = f"{stem}.txt"

        file_path = os.path.join(directory, filename)

        # Create the directory if it doesn't exist
        os.makedirs(directory, exist_ok=True)

        # Export the letter to a text file named by company name and job title.
        # Explicit UTF-8 because titles and generated text may contain
        # characters the platform default encoding cannot represent.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(result)
        jobs_list.append({
            'Program Title': title,
            'Company': company_name,
            'Link': url_new,
        })

    # Increment page number and wait 5 seconds between page requests
    page_num += 1
    time.sleep(5)


# Collect the recorded jobs into df_jobs
df_jobs = pd.DataFrame(jobs_list)


# Display results
print("\nScraping completed. Found {} relevant jobs.".format(len(df_jobs)))
print("\nSample of scraped jobs:")
print(df_jobs.head())


# Save to an Excel workbook
df_jobs.to_excel(f'{word_discipline}_{job_level}.xlsx', index=False)

Scraping page 1...
Analysing page 1...


In [7]:
# Write df_jobs to datascience_<job_level>.xlsx, leaving out the index column
df_jobs.to_excel(excel_writer=f'datascience_{job_level}.xlsx', index=False)