Skip to content

Commit

Permalink
Fixes #2 initial POC
Browse files Browse the repository at this point in the history
  • Loading branch information
TheFriendlyCoder committed Sep 12, 2020
1 parent 7e2d0c1 commit f9f2652
Show file tree
Hide file tree
Showing 19 changed files with 790 additions and 51 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@ target/
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
Expand Down
6 changes: 3 additions & 3 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,13 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme
[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=7
min-similarity-lines=10

# Ignore comments when computing similarities.
ignore-comments=yes
ignore-comments=no

# Ignore docstrings when computing similarities.
ignore-docstrings=yes
ignore-docstrings=no

# Ignore imports when computing similarities.
ignore-imports=no
Expand Down
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.8.0
37 changes: 13 additions & 24 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
language: python

# Test against the oldest and newest supported version of Python only
# NOTE: there appears to only be 1 version of pypy available on Travis
# right now, so we just test against that one version
python:
- 3.8

Expand All @@ -11,36 +8,28 @@ install:
# https://github.com/z4r/python-coveralls/issues/73
- pip install tox tox-factor wheel python-coveralls "coverage<5.0"

script:
- echo $TRAVIS_PYTHON_VERSION
# Construct a correct tox python version
# For PyPy the TravisCI env var mapping should be: pypy3.5 -> pypy3
# For CPython the TravisCI env var mapping should be: 3.8 -> py38
- '[ "$TRAVIS_PYTHON_VERSION" = "pypy3.5" ] && export PYVER=`echo $TRAVIS_PYTHON_VERSION | tr "." "\n" | head -n 1` || export PYVER=py`echo $TRAVIS_PYTHON_VERSION | sed "s/\.//" | sed "s/^py//"`'
- echo $PYVER
- tox -e $PYVER-lint
- tox -e $PYVER-test -- --block-network
- tox -e $PYVER-docs
# Only publish coverage metrics for the latest supported Python version
- '[ "$TRAVIS_PYTHON_VERSION" = "3.8" ] && coveralls || echo Skipping Coveralls'

# Deploy to test.pypi.org for branches
# Deploy to pypi.org for tags
# NOTE: You cannot replace builds published to pypi, even if you delete one
# so you must make sure your versions are always unique
# NOTE: We also restrict publishing of packages using the latest supported
# Python version so we don't publish redundant packages
jobs:
include:
- stage: test
script:
# If we are building from a cron trigger, lets test against live data.
      # This ensures that we run periodic tests with the latest school website data
# and can detect and fix any changes to the content as quickly as possible
- echo Build trigger type is $TRAVIS_EVENT_TYPE
- '[ "$TRAVIS_EVENT_TYPE" == "cron" ] && export TEST_PARMS="-- --live" || :'
- tox -f lint
- tox -f test $TEST_PARMS
- tox -f docs
- coveralls

- stage: deploy-release
python: 3.8
script:
- pip install twine
- python setup.py bdist_wheel
- twine upload dist/*.whl -u $DEPLOY_USER -p $DEPLOY_PASS
if: tag IS true

- stage: deploy-snapshot
python: 3.8
script:
- pip install twine
- python setup.py bdist_wheel
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ Overview

This library parses HTML encoded data from various grade-school websites and extracts information about school closures and late bus announcements. Currently supported school websites are listed below:

* `Francophone School Districts, New Brunswick, Canada <https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&amp;vtbl=1>`_
* `Francophone School Districts, New Brunswick, Canada <https://francophonesud.nbed.nb.ca/retards-et-fermetures>`_
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,4 @@ def add_intersphinx_aliases_to_inv(app):
def setup(app):
    """Custom Sphinx extension manager entry point method.

    Registers the ``intersphinx_aliases`` config value and hooks the alias
    rewriter into the build so aliased intersphinx targets resolve.

    Args:
        app: the Sphinx application object being configured
    """
    # 'env' rebuild scope: changing the alias map invalidates the environment
    app.add_config_value('intersphinx_aliases', {}, 'env')
    # Rewrite the intersphinx inventory once the builder is initialized
    app.connect('builder-inited', add_intersphinx_aliases_to_inv)
app.connect('builder-inited', add_intersphinx_aliases_to_inv)
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Jinja2==2.11.2
MarkupSafe==1.1.1
packaging==20.4
pbr==5.5.0
Pygments==2.6.1
Pygments==2.7.0
pyparsing==2.4.7
pytz==2020.1
requests==2.24.0
Expand Down
4 changes: 2 additions & 2 deletions project.prop
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
"NAME" : "school_scraper",
"REPO" : "school-scraper",
"DEPENDENCIES" : [
"requests",
"pandas",
"lxml",
"tabulate"
"tabulate",
],
"DEV_DEPENDENCIES" : [
"pytest",
Expand All @@ -25,6 +24,7 @@
# pinning coverage package until bugs with coveralls plugin is fixed
# https://github.com/z4r/python-coveralls/issues/73
"coverage<5.0",
"requests",
],
"DESCRIPTION" : "Web scraper that parses school closure and late bus announcements from various school websites",
"KEYWORDS" : "web scraper school closure late bus",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pandas==1.1.2
pbr==5.5.0
pluggy==0.13.1
py==1.9.0
Pygments==2.6.1
Pygments==2.7.0
pylint==2.6.0
pyparsing==2.4.7
pytest==6.0.2
Expand Down
60 changes: 60 additions & 0 deletions src/school_scraper/fsddistrict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Primitives for reading francophone school district information"""
from .fsdschool import FSDSchool


class FSDDistrict:
    """Abstraction around school district data parsed from the district website
    """

    # Column in the scraped table holding the district / region name
    DISTRICT_FIELD = "Région"

    def __init__(self, df):
        """
        Args:
            df: Pandas data frame describing the district
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @property
    def name(self):
        """str: Name of the district"""
        # Some districts share a name that differs only by character casing,
        # so the comparison and the returned value are lower-cased
        unique_names = self._df[self.DISTRICT_FIELD].str.lower().unique()
        assert len(unique_names) == 1
        return unique_names[0]

    @property
    def schools(self):
        """list (FSDSchool): 1 or more schools associated with this district"""
        return [FSDSchool(row) for _, row in self._df.iterrows()]

    @property
    def school_names(self):
        """list (str): list of school names associated with this district"""
        return [school.name for school in self.schools]

    def get_school(self, name):
        """Gets a specific school from the district
        Args:
            name (str):
                name of the school to get data for
        Returns:
            FSDSchool:
                reference to the school with the given name, or None if no
                such school exists
        """
        # Case-insensitive lookup; hoist the lower-casing of the target name
        target = name.lower()
        for school in self.schools:
            if school.name.lower() == target:
                return school
        return None
44 changes: 44 additions & 0 deletions src/school_scraper/fsdschool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Abstraction around a francophone school district school"""


class FSDSchool:
    """Abstraction around school-specific data parsed from the school
    district's website"""

    # Column names in the table scraped from the district website
    SCHOOL_FIELD = "Nom de l'école"
    OPEN_FIELD = "École"
    BUS_FIELD = "Autobus"
    MESSAGE_FIELD = "Messages"

    def __init__(self, df):
        """
        Args:
            df: Pandas dataframe row containing school data parsed from the
                website
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @property
    def name(self):
        """str: name of the school"""
        return self._df[self.SCHOOL_FIELD]

    @property
    def messages(self):
        """str: status messages associated with the school"""
        return self._df[self.MESSAGE_FIELD]

    @property
    def is_open(self):
        """bool: True if school is open, False if not"""
        # NOTE: access class constants via self for consistency with the
        # other properties (original mixed self.X and FSDSchool.X access)
        return self._df[self.OPEN_FIELD] == "Ouvert"

    @property
    def has_late_busses(self):
        """bool: True if 1 or more buses are late, False if all are running
        on time"""
        # The website reports "À l’heure" (on time) when no buses are late
        return self._df[self.BUS_FIELD] != "À l’heure"
138 changes: 138 additions & 0 deletions src/school_scraper/fsdscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Scraper for New Brunswick francophone school district website"""
import logging
import requests
import pandas as pd
from .fsdschool import FSDSchool
from .fsddistrict import FSDDistrict


class FSDScraper:
    """Interface for parsing HTML data loaded from the New Brunswick
    francophone school district website"""

    # Public URL serving the closure / late-bus schedule table
    SCHEDULE_URL = \
        "https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&amp;vtbl=1"

    def __init__(self, html):
        """
        Args:
            html (str):
                HTML data loaded from the website. Is expected to contain a
                single HTML table containing rows describing each school in
                each school district
        """
        # Reject unusable input early; validate() logs the parsing details
        # NOTE(review): assert is stripped under "python -O" — consider
        # raising ValueError instead (kept for backward compatibility)
        assert FSDScraper.validate(html)

        # Force the message column to str so empty cells don't become NaN
        temp = pd.read_html(
            html, header=0, converters={FSDSchool.MESSAGE_FIELD: str})

        self._data = temp[0]
        self._data.fillna("", inplace=True)

    def __str__(self):
        return self._data.to_markdown()

    @staticmethod
    def validate(html):
        """Checks to see if HTML loaded from the website is parseable
        Args:
            html (str):
                HTML data loaded from the district website
        Returns:
            bool:
                True if the HTML content was parseable, False if not. Details
                of any parsing errors are reported to the logger.
        """
        log = logging.getLogger(__name__)
        try:
            temp = pd.read_html(
                html, header=0, converters={FSDSchool.MESSAGE_FIELD: str},
                flavor="lxml")
        except ValueError as err:
            log.error("Error parsing HTML input:")
            log.error(err)
            log.debug(html)
            return False

        if len(temp) != 1:
            # Lazy %-style args so formatting only happens when the message
            # is actually emitted
            log.error("Expected 1 HTML table in the source data but "
                      "found %s instead", len(temp))
            return False
        data = temp[0]
        data.fillna("", inplace=True)

        log.debug("Parsed HTML data table:")
        log.debug(data.to_markdown())

        # Every school needs a unique, non-empty name; a set gives O(1)
        # duplicate detection instead of a linear list scan per row
        school_names = set()
        for cur_school in data[FSDSchool.SCHOOL_FIELD]:
            if cur_school == "":
                log.error("Detected row with no valid school name")
                return False

            if cur_school in school_names:
                log.error("Multiple schools with the same name "
                          "detected: %s", cur_school)
                return False
            school_names.add(cur_school)

        # Every row must also carry a district name
        for cur_district in data[FSDDistrict.DISTRICT_FIELD]:
            if cur_district == "":
                log.error("Detected row with no valid district name")
                return False

        return True

    @property
    def districts(self):
        """list (FSDDistrict): 0 or more districts parsed from the HTML content
        """
        # Hoist the lower-cased name column so it is computed once rather
        # than once per district
        lower_names = self._data[FSDDistrict.DISTRICT_FIELD].str.lower()
        return [FSDDistrict(self._data[lower_names == cur_name])
                for cur_name in lower_names.unique()]

    def get_district(self, name):
        """Gets a specific district from the HTML content
        Args:
            name (str):
                the name of the district to locate
        Returns:
            FSDDistrict:
                Reference to the district details for the named district, or
                None if no district with the given name exists
        """
        for cur_district in self.districts:
            if cur_district.name.lower() == name.lower():
                return cur_district
        return None

    @property
    def district_names(self):
        """list (str): list of names of all districts parsed from the HTML"""
        return [cur_district.name for cur_district in self.districts]

    @property
    def school_names(self):
        """list (str): list of unique names of all schools in all districts"""
        # Uniqueness across districts is guaranteed by validate()
        return [cur_school.name
                for cur_district in self.districts
                for cur_school in cur_district.schools]


if __name__ == "__main__":  # pragma: no cover
    # Ad-hoc smoke test: download the live schedule page, report whether it
    # validates, then dump the parsed table to stdout
    text = requests.get(FSDScraper.SCHEDULE_URL).text
    print(FSDScraper.validate(text))
    obj = FSDScraper(text)
    print(obj)

0 comments on commit f9f2652

Please sign in to comment.