# GigHunter - Daily post Remotive jobs 

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import time
import naas_drivers

In [2]:
# INDEED_URL = "https://fr.indeed.com/jobs?q=developpeur&l=Strasbourg+%2867%29"
REMOTIVE_URL = "https://remotive.io/"
REMOTIVE_API = "https://remotive.io/api/remote-jobs/"
REMOTIVE_DATETIME = "%Y-%m-%dT%H:%M:%S"
NAAS_DATETIME = "%Y/%m/%d %H:%M:%S"

CATEGORIE_ROUTE = "categories"

## Get all job categories from Remotive

In [3]:
def get_job_categories(category_route):
    res = requests.get(category_route)
    ret = []
    for category in res.json()['jobs']:
        ret.append(category['slug'])
    return ret

# categories = get_job_categories(REMOTIVE_API + CATEGORIE_ROUTE)
categories = ['data'] # Fetch only jobs with category 'data'

In [57]:
# url = REMOTIVE_API + '?category=data'
# # requests.get(url).json()['jobs']
# # link="https://remotive.io/api/remote-jobs?category=data"
# name = requests.get(url).json()['jobs'][2]['company_name']
# company_info = "https://remotive.io/api/remote-jobs?company_name=Talentuch"
# requests.get(company_info).json()

 'job-count': 0,
 'jobs': []}

## Functions to get jobs from Remotive

In [4]:
def get_remotive_jobs_since(jobs, date):
    ret = []
    for job in jobs:
        publication_date = datetime.strptime(job['publication_date'], REMOTIVE_DATETIME).timestamp()
        if publication_date > date:
            ret.append({
                'URL': job['url'],
                'JOBS': job['title'],
                'COMPANY': job['company_name'],
                'PUBLISH_DATE': datetime.fromtimestamp(publication_date).strftime(NAAS_DATETIME)
            })
    return ret

def get_category_jobs_since(api, category, date, limit):
    url = f"{api}?category={category}&limit={limit}"
    res = requests.get(url)
    if res.json()['jobs']:
        publication_date = datetime.strptime(res.json()['jobs'][-1]['publication_date'], REMOTIVE_DATETIME).timestamp()
        if len(res.json()['jobs']) < limit or date > publication_date:
            print(f"Jobs from catgory {category} fetched ✅")
            return get_remotive_jobs_since(res.json()['jobs'], date)
        else:
            return get_category_jobs_since(api, category, date, limit + 5)
    return []

def get_jobs_since(api, categories, date):
    jobs = []
    for category in categories:
        jobs += get_category_jobs_since(api, category, date, 5)
    print(f'- All job since {datetime.fromtimestamp(date)} have been fetched -')
    return jobs

## Get all jobs posted after timestamp_date

All jobs posted after the timestamp stored in 'timestamp_date' will be fetched.
In summary, we can set the value, in seconds, of 'search_data_from' to fetch all jobs posted since this duration

In [5]:
##### Search jobs past 20 days #####
search_jobs_from = 20 * 24 * 60 * 60 # 24 hours in seconds
timestamp_date = time.time() - search_jobs_from

jobs = get_jobs_since(REMOTIVE_API, categories, timestamp_date)

Jobs from catgory data fetched ✅
- All job since 2022-01-14 17:12:14.083598 have been fetched -


## Display found jobs

In [6]:
for job in jobs:
    print(job['JOBS'])

Java Developer (Data)
Data and Research Analyst
Data Analyst
Data Quality Analyst, Healthcare
Lead Instructor - Data Analytics
Analytics Manager, Data
Data Science
Data Analytics Engineer
Senior Database Engineer/Data Scientist
Data Engineer
Senior Analyst
Associate Analyst, TruGuard
(Senior) Data Scientist
Data Scientist
Data Engineer
Data Analyst
Analytics Lead
Data Analyst
Data Engineer
Clinical Data Manager I
(Senior) Database Administrator
Instructor, Data Analytics
(Associate) Database Administrator
(Senior) Database Administrator
Data Engineer/Data Architect
Data Science, Senior Manager
Data Engineering Lead
Data Scientist


## Setup sheet log data

In [7]:
spreadsheet_id = "1EBefhkbmqaXMZLRCiafabf68qwhYKiEOayT1BRT34tU"
sheet_name = "SLACK_CHANNEL_POSTS"

## Get the sheet log of jobs

In [8]:
df = naas_drivers.gsheet.connect(spreadsheet_id).get(
    sheet_name=sheet_name
)
df

Unnamed: 0,URL,JOBS,COMPANY,PUBLISH_DATE
0,https://remote.co/job/data-support-specialist-...,Data Support Specialist III,Compassion International,21/03/2021 13:42:00
1,https://remote.co/job/senior-data-analyst-7/,Senior Data Analyst,Varsity Tutors,21/03/2021 13:45:00
2,https://remote.co/job/data-scientist-marketing...,"Data Scientist, Marketing Capital Allocation",Root Insurance,21/03/2021 13:46:00
3,https://remote.co/job/business-data-strategist/,Business Data Strategist,Quantum Metric,21/03/2021 13:47:00
4,https://remote.co/job/senior-data-scientist-36/,Senior Data Scientist,"League, Inc.",21/03/2021 13:49:00
...,...,...,...,...
358,https://remotive.io/remote-jobs/data/director-...,director of data science,soona,2021/08/04 01:40:59
359,https://remotive.io/remote-jobs/data/data-scie...,Data Scientist,STILT,2021/08/05 01:41:56
360,https://remotive.io/remote-jobs/data/senior-da...,Senior Data Scientist (NLP),Parenthetic,2021/08/06 01:46:37
361,https://remotive.io/remote-jobs/data/senior-da...,Senior Data Analyst,Ginger,2021/08/05 20:21:48


## Remove duplicate jobs

In [9]:
jobs_to_remove = []

for index, job in enumerate(jobs):
    for url in df['URL']:
        if job['URL'] == url:
            jobs_to_remove.append(index)
            
for index in reversed(jobs_to_remove):
    print(f"'{jobs[index]['JOBS']}' already in stored in sheet ❌")
    del jobs[index]

'Analytics Manager, Data' already in stored in sheet ❌


## Add new jobs on the sheet log

In [10]:
naas_drivers.gsheet.connect(spreadsheet_id).send(
    sheet_name=sheet_name,
    data=jobs)

{'insertedRow': 27}

## Setup slack channel configuration

In [25]:
# token = "xoxb-148104********-190666******-BVrrrVN7e6eMCm6******"
token =   "gQC3N88wFIIXxiyDdrrPYR14"
SLACK_CHANNEL = "05_work"
# assert len("BVrrrVN7e6eMCm6cU0vZ6BYM") == len(token)

## Send all jobs link to the slack channel

In [26]:
for job in jobs:
    naas_drivers.slack.connect(token).send(SLACK_CHANNEL, f"<{job['URL']}>")

Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth
Got an error: invalid_auth


## Set the Scheduler

In [13]:
import naas

naas.scheduler.add(recurrence="0 9 * * *")

recurrence is deprecated use cron arg instead
👌 Well done! Your Notebook has been sent to production.

⏰ It will be scheduled "At 09:00 every day" (more on the syntax on https://crontab.guru/).

Ps: to remove the "Scheduler", just replace .add by .delete


In [14]:
# naas.scheduler.delete()