In [3]:
import subprocess
import time
from datetime import datetime
import os

import pathlib 
from decouple import Config, RepositoryEnv
import json
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials


def run_ps_cmd(cmds: list[list[str]], max_retries: int=3, retry_delay:int =1, log: bool=True):
    all_outputs = []
    for cmd in cmds:
        retries = 0
        while retries < max_retries:
            try:
                full_command = ['powershell', '-Command'] + cmd
                cmd_str = " ".join(cmd)
                if log: print(f"Running command: {cmd_str}\n")
                result = subprocess.run(full_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
                combined_output = result.stdout + result.stderr
                output = combined_output.strip()
                if log: print(f"Output: {output}\n")
                all_outputs.append(output)
                break  # Move to the next command after successful execution
            except subprocess.CalledProcessError as e:
                print(f"Failed to run PowerShell command (Attempt {retries + 1}/{max_retries}): Exit Code {e.returncode}\n")
                retries += 1
                if retries < max_retries:
                    print(f"Retrying in {retry_delay} second(s)...\n")
                    time.sleep(retry_delay)
                else:
                    raise  # If all retries fail, raise the exception
    return all_outputs

def auth(email: str):
    output = run_ps_cmd([['gcloud', 'auth', 'list', '--format=json']], log=False)
    output_json = json.loads(output[0].replace("\n", ""))
    for account in output_json:
        if account["account"] == email:
            if account["status"] == "ACTIVE":
                print("Authenticated as "+email)
                return
            else:
                run_ps_cmd([['gcloud', 'auth', 'login', email]], log=False)
                print("Authenticated as "+email)
                return
    run_ps_cmd([['gcloud', 'auth', 'login']])
    
def init(email: str=None, prefix: str="web-scraping", base_path: str=None):

    if email == None:        
        # get variable from .env file in the project's root directory
        BASE_DIR = pathlib.Path(__file__).parent
        ENV_PATH = BASE_DIR / '.env'
        env_config = Config(RepositoryEnv(ENV_PATH))
        email = env_config.get('email')

    if base_path == None:        
        base_path = os.path.join(os.getcwd(), "sa_keys", email)
        if not os.path.exists(base_path):
            os.makedirs(base_path)
    
    auth(email)
    
    proj_name = prefix # proj_name
    time_iso_now = datetime.utcnow().isoformat() # current ISO time
    time_iso_now = time_iso_now.replace(".", "").replace("T", "").replace(":", "").replace("-", "") # remove special characters from ISO time
    proj_id = f"{proj_name}-{time_iso_now}"[:30] # project id
    
    # Create the commands
    pre = ['gcloud']
    proj_id_arg = f"--project={proj_id}" # project argument
    key_path = os.path.join(base_path, f"{proj_id}.json")
    cmd_create_proj = pre+['projects', 'create', proj_id, f"--name={proj_name}"]
    cmd_en_yt_api = pre+['services', 'enable', 'youtube.googleapis.com', proj_id_arg]
    cmd_create_sa = pre+['iam', 'service-accounts', 'create', proj_name, proj_id_arg]
    cmd_grant_roles_sa = pre+['projects', 'add-iam-policy-binding', proj_id, f'--member=serviceAccount:{proj_name}@{proj_id}.iam.gserviceaccount.com', '--role=roles/owner']
    cmd_create_api_key = pre+['iam', 'service-accounts', 'keys', 'create', f'"{key_path}"', f'--iam-account={proj_name}@{proj_id}.iam.gserviceaccount.com', proj_id_arg]
    
    # run the commands
    run_ps_cmd([cmd_create_proj, cmd_en_yt_api, cmd_create_sa, cmd_grant_roles_sa, cmd_create_api_key])

if __name__ == "__main__":
    init()

In [4]:


# (New-Object Net.WebClient).DownloadFile("https://dl.google.com/dl/cloudsdk/channels/rapid/GoogleCloudSDKInstaller.exe", "$env:Temp\GoogleCloudSDKInstaller.exe")

# & $env:Temp\GoogleCloudSDKInstaller.exe
    

NameError: name '__file__' is not defined

In [2]:

import time
import math

import json
import requests
from bs4 import BeautifulSoup



def get_total_jobs_count(job_search_url):
  time.sleep(0.5)
  response = requests.get(job_search_url)
  
  if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find the span element with data-automation="totalJobsCount"
    total_jobs_element = soup.find("span", {"data-automation": "totalJobsCount"})
    
    if total_jobs_element:
      # Extract the job count text
      total_jobs_count = total_jobs_element.get_text()
      return int(total_jobs_count)
    print("Could not parse total jobs count")
  
  else:
    print("Failed to fetch the web page.")
    
  return 0

def get_urls(base_urls):
  # Salary ranges
  salary_limits = ["30", "40", "50", "60", "70", "80", "100", "120", "150", "200", "250", "350"]
  salary_limits = [limit + "000" for limit in salary_limits]
  salary_limits.append("")
  salary_ranges = [(salary_limits[i], salary_limits[i + 1]) for i in range(len(salary_limits) - 1)]

  # urls with salary ranges
  urls_salary_ranges = []
  for base_url in base_urls:
    for salary_range in salary_ranges:
      # Add salary parameters to url
      urls_salary_ranges.append(base_url + f"?salaryrange={salary_range[0]}-{salary_range[1]}&salarytype=annual")

  # urls with start page
  urls = []  
  for url in urls_salary_ranges:
    # Get max pages per run
    JOBS_PER_PAGE = 22
    SCRAP_PAGES_PER_RUN = 200
    MAX_PAGES_PER_RUN = math.floor(SCRAP_PAGES_PER_RUN/(JOBS_PER_PAGE+1))
    
    # Get number of runs
    number_of_jobs = get_total_jobs_count(url)
    number_of_runs = math.ceil(number_of_jobs/(JOBS_PER_PAGE+1)/MAX_PAGES_PER_RUN)
    
    for run in range(number_of_runs):
      start_page = run * MAX_PAGES_PER_RUN + 1
      # Add page parameter
      urls.append(url + f"&page={start_page}")
    
  return urls


# base_urls = [
#   "https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts",
#   "https://www.seek.com.au/jobs-in-information-communication-technology/developers-programmers"
# ]

# urls = get_urls(base_urls)

# print(len(urls))

# for url in urls:
#   print(url)

# url = "https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?page=6&salaryrange=100000-&salarytype=annual"
# total_jobs_count = get_total_jobs_count(url)
# print(total_jobs_count)

# params = {
#   "api_key": "tobsMf-25k1y"
# }
# r = requests.get('https://www.parsehub.com/api/v2/runs/tuTNuCbASVTm', params=params)
# print(json.loads(r.text)["status"])

# print(salary_limits)


41
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=30000-40000&salarytype=annual&page=1
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=40000-50000&salarytype=annual&page=1
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=50000-60000&salarytype=annual&page=1
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=60000-70000&salarytype=annual&page=1
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=70000-80000&salarytype=annual&page=1
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=70000-80000&salarytype=annual&page=9
https://www.seek.com.au/jobs-in-information-communication-technology/business-systems-analysts?salaryrange=80000-100000&salarytype=an

In [3]:
original_list = [i for i in range(350)]  # Example original list


print(smaller_lists)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,

In [None]:
import requests

params = {
  "api_key": "tobsMf-25k1y",
  "start_url": "http://www.example.com",
  "start_template": "main_template",
  "start_value_override": "{\"query\": \"San Francisco\"}",
  "send_email": "1"
}
r = requests.post("https://www.parsehub.com/api/v2/projects/{PROJECT_TOKEN}/run", data=params)

In [87]:
from extract import load_json_files


path = r"D:\personal\seek scrapping\data\JSON\IT\DB"
job_post_list = load_json_files(path)


In [3]:
import requests

url = 'http://localhost:5000/trigger-run'  # Replace with your API endpoint

# Sample payload/data to send in the POST request
data = {
    'key1': 'value1',
    'key2': 'value2'
}

# Make the POST request with data
response = requests.post(url, json=data)  # Use `json=data` for sending JSON data

# Check the response
if response.status_code == 200:  # Successful response
    print('Request successful!')
    print('Response:', response.json())  # Print response data
else:
    print('Request failed with status code:', response.status_code)
    print('Response:', response.text)  # Print error message or response content


Request failed with status code: 500
Response: <!doctype html>
<html lang=en>
  <head>
    <title>sqlalchemy.exc.ProgrammingError: (psycopg2.errors.UndefinedColumn) column job_listings.id does not exist
LINE 1: SELECT job_listings.id AS job_listings_id, job_listings.job_...
               ^

[SQL: SELECT job_listings.id AS job_listings_id, job_listings.job_url AS job_listings_job_url, job_listings.title AS job_listings_title, job_listings.advertiser AS job_listings_advertiser, job_listings.location AS job_listings_location, job_listings.category AS job_listings_category, job_listings.job_type AS job_listings_job_type, job_listings.salary AS job_listings_salary, job_listings.time_posted AS job_listings_time_posted, job_listings.description AS job_listings_description, job_listings.employer_questions AS job_listings_employer_questions, job_listings.title_words AS job_listings_title_words, job_listings.description_words AS job_listings_description_words, job_listings.employer_questions_wo