Skip to content

Commit

Permalink
Added pytests for North American coverage + more housekeeping
Browse files Browse the repository at this point in the history
  • Loading branch information
studentbrad committed Jan 13, 2020
1 parent 910f0bc commit d259310
Show file tree
Hide file tree
Showing 20 changed files with 78,411 additions and 112 deletions.
6 changes: 5 additions & 1 deletion .idea/JobFunnel.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ verify_ssl = true

[packages]
jobfunnel = {path = ".",editable = true}
nltk = "==3.4.5"

[requires]
python_version = "3.6"
74 changes: 72 additions & 2 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 19 additions & 24 deletions demo/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
# All paths are relative to this file.
# all paths are relative to this file

# Paths.
# place the search right next to this file
# paths
output_path: './'

# Providers from which to search (case insensitive)
# providers from which to search (case insensitive)
providers:
- 'Indeed'
- 'Monster'
- 'GlassDoor' # This used to take ~10x longer to run than the other providers

- 'GlassDoor'

# Filters.
# filters
search_terms:
region:
province: 'ON'
Expand All @@ -21,32 +19,29 @@ search_terms:

keywords:
- 'Python'
- 'Java'

black_list:
- 'Infox Consulting'
- 'Terminal'

# Logging level options are: critical, error, warning, info, debug, notset
# logging level options are: critical, error, warning, info, debug, notset
log_level: 'info'

# Saves duplicates removed by tfidf filter to duplicate_list.csv
# saves duplicates removed by tfidf filter to duplicate_list.csv
save_duplicates: False

# Turn on or off delaying
set_delay: True
# turn on or off delaying
set_delay: True

# Delaying algorithm configuration
# delaying algorithm configuration
delay_config:
# Functions used for delaying algorithm, options are: constant, linear, sigmoid
# functions used for delaying algorithm, options are: constant, linear, sigmoid
function: 'linear'
# Maximum delay/upper bound for converging random delay
delay: 30
# Minimum delay/lower bound for random delay
min_delay: 15
# Random delay
random: True
# Converging random delay, only used if 'random' is set to True
converge: True


# maximum delay/upper bound for converging random delay
delay: 10
# minimum delay/lower bound for random delay
min_delay: 1
# random delay
random: True
# converging random delay, only used if 'random' is set to True
converge: True
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.0.1'
__version__ = '2.0.2'
6 changes: 4 additions & 2 deletions jobfunnel/__main__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#!python
"""main script, scrapes data off several listings, pickles it,
and applies search filters"""
from typing import Union

from .config.parser import parse_config

from .jobfunnel import JobFunnel
from .indeed import Indeed
from .monster import Monster
from .glassdoor import GlassDoor

providers = {'indeed': Indeed, 'monster': Monster, 'glassdoor': GlassDoor}
PROVIDERS = {'indeed': Indeed, 'monster': Monster, 'glassdoor': GlassDoor}


def main():
Expand All @@ -29,7 +31,7 @@ def main():
jf.load_pickle(config)
else:
for p in config['providers']:
provider = providers[p](config)
provider: Union[GlassDoor, Monster, Indeed] = PROVIDERS[p](config)
provider_id = provider.__class__.__name__
try:
provider.scrape()
Expand Down
20 changes: 13 additions & 7 deletions jobfunnel/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,10 @@
'debug': logging.DEBUG, 'notset': logging.NOTSET}


def _parse_cli():
"""Parse the command line arguments.
def parse_cli():
""" Parse the command line arguments.
"""

parser = argparse.ArgumentParser(
'CLI options take precedence over settings in the yaml file'
'empty arguments are replaced by settings in the default yaml file')
Expand Down Expand Up @@ -122,7 +121,7 @@ def _parse_cli():


def parse_config():
"""Parse the JobFunnel configuration settings.
""" Parse the JobFunnel configuration settings.
"""
# find the jobfunnel root dir
Expand All @@ -134,7 +133,7 @@ def parse_config():
default_yaml = yaml.safe_load(open(default_yaml_path, 'r'))

# parse the command line arguments
cli = _parse_cli()
cli = parse_cli()

# parse the settings file for the line arguments
given_yaml = None
Expand Down Expand Up @@ -182,6 +181,13 @@ def parse_config():
if cli.keywords is not None:
config['search_terms']['keywords'] = cli.keywords

# search term state is inserted as province if province does not already exist
if 'state' in config['search_terms']['region']:
if (config['search_terms']['region']['state'] is not None) and \
(config['search_terms']['region']['province'] is None):
config['search_terms']['region']['province'] = \
config['search_terms']['region']['state']

# parse the blacklist
config['black_list'] = default_yaml['black_list']
if given_yaml_path is not None:
Expand Down Expand Up @@ -230,7 +236,7 @@ def parse_config():
if given_yaml_path is not None:
config['delay_config'] = given_yaml['delay_config']

# Cli options for delaying configuration
# cli options for delaying configuration
if cli.function is not None:
config['delay_config']['function'] = cli.function
if cli.delay is not None:
Expand All @@ -242,7 +248,7 @@ def parse_config():
if cli.converge is not None:
config['delay_config']['converge'] = cli.converge

# Converts function name to lower case in config
# converts function name to lower case in config
config['delay_config']['function'] = \
config['delay_config']['function'].lower()
else:
Expand Down
38 changes: 19 additions & 19 deletions jobfunnel/config/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# This is the default settings file. Do not edit.

# All paths are relative to this file.
# all paths are relative to this file

# Paths.
# paths
output_path: 'search'

# Providers from which to search (case insensitive)
# providers from which to search (case insensitive)
providers:
- 'Indeed'
- 'Monster'
- 'GlassDoor' # This used to take ~10x longer to run than the other providers
- 'GlassDoor'

# Filters.
# filters
search_terms:
region:
province: 'ON'
Expand All @@ -22,28 +22,28 @@ search_terms:
keywords:
- 'Python'

# Black-listed company names
# black-listed company names
black_list:
- 'Infox Consulting'

# Logging level options are: critical, error, warning, info, debug, notset
# logging level options are: critical, error, warning, info, debug, notset
log_level: 'info'

# Saves duplicates removed by tfidf filter to duplicate_list.csv
# saves duplicates removed by tfidf filter to duplicate_list.csv
save_duplicates: False

# Turn on or off delaying
set_delay: True
# turn on or off delaying
set_delay: True

# Delaying algorithm configuration
# delaying algorithm configuration
delay_config:
# Functions used for delaying algorithm, options are: constant, linear, sigmoid
# functions used for delaying algorithm, options are: constant, linear, sigmoid
function: 'linear'
# Maximum delay/upper bound for converging random delay
delay: 15
# Minimum delay/lower bound for random delay
min_delay: 1
# Random delay
# maximum delay/upper bound for converging random delay
delay: 10
# minimum delay/lower bound for random delay
min_delay: 1
# random delay
random: False
# Converging random delay, only used if 'random' is set to True
converge: False
# converging random delay, only used if 'random' is set to True
converge: False
Loading

0 comments on commit d259310

Please sign in to comment.