Skip to content

Commit

Permalink
Fixes #2 initial POC
Browse files Browse the repository at this point in the history
  • Loading branch information
TheFriendlyCoder committed Sep 12, 2020
1 parent 7e2d0c1 commit f9f2652
Show file tree
Hide file tree
Showing 19 changed files with 790 additions and 51 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@ target/
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
Expand Down
6 changes: 3 additions & 3 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,13 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme
[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=7
min-similarity-lines=10

# Ignore comments when computing similarities.
ignore-comments=yes
ignore-comments=no

# Ignore docstrings when computing similarities.
ignore-docstrings=yes
ignore-docstrings=no

# Ignore imports when computing similarities.
ignore-imports=no
Expand Down
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.8.0
37 changes: 13 additions & 24 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
language: python

# Test against the oldest and newest supported version of Python only
# NOTE: there appears to only be 1 version of pypy available on Travis
# right now, so we just test against that one version
python:
- 3.8

Expand All @@ -11,36 +8,28 @@ install:
# https://github.com/z4r/python-coveralls/issues/73
- pip install tox tox-factor wheel python-coveralls "coverage<5.0"

script:
- echo $TRAVIS_PYTHON_VERSION
# Construct a correct tox python version
# For PyPy the TravisCI env var mapping should be: pypy3.5 -> pypy3
# For CPython the TravisCI env var mapping should be: 3.8 -> py38
- '[ "$TRAVIS_PYTHON_VERSION" = "pypy3.5" ] && export PYVER=`echo $TRAVIS_PYTHON_VERSION | tr "." "\n" | head -n 1` || export PYVER=py`echo $TRAVIS_PYTHON_VERSION | sed "s/\.//" | sed "s/^py//"`'
- echo $PYVER
- tox -e $PYVER-lint
- tox -e $PYVER-test -- --block-network
- tox -e $PYVER-docs
# Only publish coverage metrics for the latest supported Python version
- '[ "$TRAVIS_PYTHON_VERSION" = "3.8" ] && coveralls || echo Skipping Coveralls'

# Deploy to test.pypi.org for branches
# Deploy to pypi.org for tags
# NOTE: You cannot replace builds published to pypi, even if you delete one
# so you must make sure your versions are always unique
# NOTE: We also restrict publishing of packages using the latest supported
# Python version so we don't publish redundant packages
jobs:
include:
- stage: test
script:
# If we are building from a cron trigger, lets test against live data.
      # This ensures that we run periodic tests with the latest school website data
# and can detect and fix any changes to the content as quickly as possible
- echo Build trigger type is $TRAVIS_EVENT_TYPE
- '[ "$TRAVIS_EVENT_TYPE" == "cron" ] && export TEST_PARMS="-- --live" || :'
- tox -f lint
- tox -f test $TEST_PARMS
- tox -f docs
- coveralls

- stage: deploy-release
python: 3.8
script:
- pip install twine
- python setup.py bdist_wheel
- twine upload dist/*.whl -u $DEPLOY_USER -p $DEPLOY_PASS
if: tag IS true

- stage: deploy-snapshot
python: 3.8
script:
- pip install twine
- python setup.py bdist_wheel
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ Overview

This library parses HTML encoded data from various grade-school websites and extracts information about school closures and late bus announcements. Currently supported school websites are listed below:

* `Francophone School Districts, New Brunswick, Canada <https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&amp;vtbl=1>`_
* `Francophone School Districts, New Brunswick, Canada <https://francophonesud.nbed.nb.ca/retards-et-fermetures>`_
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,4 @@ def add_intersphinx_aliases_to_inv(app):
def setup(app):
    """Custom Sphinx extension manager entry point method.

    Registers the ``intersphinx_aliases`` config value and hooks the alias
    rewriter into the build so aliased intersphinx targets resolve.

    Args:
        app: the Sphinx application object being configured
    """
    # 'env' rebuild scope: changing the alias map invalidates the environment
    app.add_config_value('intersphinx_aliases', {}, 'env')
    # Rewrite the intersphinx inventory once the builder is initialized
    app.connect('builder-inited', add_intersphinx_aliases_to_inv)
app.connect('builder-inited', add_intersphinx_aliases_to_inv)
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Jinja2==2.11.2
MarkupSafe==1.1.1
packaging==20.4
pbr==5.5.0
Pygments==2.6.1
Pygments==2.7.0
pyparsing==2.4.7
pytz==2020.1
requests==2.24.0
Expand Down
4 changes: 2 additions & 2 deletions project.prop
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
"NAME" : "school_scraper",
"REPO" : "school-scraper",
"DEPENDENCIES" : [
"requests",
"pandas",
"lxml",
"tabulate"
"tabulate",
],
"DEV_DEPENDENCIES" : [
"pytest",
Expand All @@ -25,6 +24,7 @@
# pinning coverage package until bugs with coveralls plugin is fixed
# https://github.com/z4r/python-coveralls/issues/73
"coverage<5.0",
"requests",
],
"DESCRIPTION" : "Web scraper that parses school closure and late bus announcements from various school websites",
"KEYWORDS" : "web scraper school closure late bus",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pandas==1.1.2
pbr==5.5.0
pluggy==0.13.1
py==1.9.0
Pygments==2.6.1
Pygments==2.7.0
pylint==2.6.0
pyparsing==2.4.7
pytest==6.0.2
Expand Down
60 changes: 60 additions & 0 deletions src/school_scraper/fsddistrict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Primitives for reading francophone school district information"""
from .fsdschool import FSDSchool


class FSDDistrict:
    """Abstraction around school district data parsed from the district website
    """

    # Column in the scraped table holding the district / region name
    DISTRICT_FIELD = "Région"

    def __init__(self, df):
        """
        Args:
            df: Pandas data frame describing the district
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @property
    def name(self):
        """str: Name of the district"""
        # Some districts share a name that differs only by character casing,
        # so the comparison and the returned value are lower-cased
        unique_names = self._df[self.DISTRICT_FIELD].str.lower().unique()
        assert len(unique_names) == 1
        return unique_names[0]

    @property
    def schools(self):
        """list (FSDSchool): 1 or more schools associated with this district"""
        return [FSDSchool(row) for _, row in self._df.iterrows()]

    @property
    def school_names(self):
        """list (str): list of school names associated with this district"""
        return [school.name for school in self.schools]

    def get_school(self, name):
        """Gets a specific school from the district
        Args:
            name (str):
                name of the school to get data for
        Returns:
            FSDSchool:
                reference to the school with the given name, or None if no
                such school exists
        """
        # Case-insensitive lookup; hoist the lower-casing of the target name
        target = name.lower()
        for school in self.schools:
            if school.name.lower() == target:
                return school
        return None
44 changes: 44 additions & 0 deletions src/school_scraper/fsdschool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Abstraction around a francophone school district school"""


class FSDSchool:
    """Abstraction around school-specific data parsed from the school
    district's website"""

    # Column names in the table scraped from the district website
    SCHOOL_FIELD = "Nom de l'école"
    OPEN_FIELD = "École"
    BUS_FIELD = "Autobus"
    MESSAGE_FIELD = "Messages"

    def __init__(self, df):
        """
        Args:
            df: Pandas dataframe row containing school data parsed from the
                website
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @property
    def name(self):
        """str: name of the school"""
        return self._df[self.SCHOOL_FIELD]

    @property
    def messages(self):
        """str: status messages associated with the school"""
        return self._df[self.MESSAGE_FIELD]

    @property
    def is_open(self):
        """bool: True if school is open, False if not"""
        # NOTE: access class constants via self for consistency with the
        # other properties (original mixed self.X and FSDSchool.X access)
        return self._df[self.OPEN_FIELD] == "Ouvert"

    @property
    def has_late_busses(self):
        """bool: True if 1 or more buses are late, False if all are running
        on time"""
        # The website reports "À l’heure" (on time) when no buses are late
        return self._df[self.BUS_FIELD] != "À l’heure"
138 changes: 138 additions & 0 deletions src/school_scraper/fsdscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Scraper for New Brunswick francophone school district website"""
import logging
import requests
import pandas as pd
from .fsdschool import FSDSchool
from .fsddistrict import FSDDistrict


class FSDScraper:
    """Interface for parsing HTML data loaded from the New Brunswick
    francophone school district website"""

    # Public URL serving the closure / late-bus schedule table
    SCHEDULE_URL = \
        "https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&amp;vtbl=1"

    def __init__(self, html):
        """
        Args:
            html (str):
                HTML data loaded from the website. Is expected to contain a
                single HTML table containing rows describing each school in
                each school district
        """
        # Reject unusable input early; validate() logs the parsing details
        # NOTE(review): assert is stripped under "python -O" — consider
        # raising ValueError instead (kept for backward compatibility)
        assert FSDScraper.validate(html)

        # Force the message column to str so empty cells don't become NaN
        temp = pd.read_html(
            html, header=0, converters={FSDSchool.MESSAGE_FIELD: str})

        self._data = temp[0]
        self._data.fillna("", inplace=True)

    def __str__(self):
        return self._data.to_markdown()

    @staticmethod
    def validate(html):
        """Checks to see if HTML loaded from the website is parseable
        Args:
            html (str):
                HTML data loaded from the district website
        Returns:
            bool:
                True if the HTML content was parseable, False if not. Details
                of any parsing errors are reported to the logger.
        """
        log = logging.getLogger(__name__)
        try:
            temp = pd.read_html(
                html, header=0, converters={FSDSchool.MESSAGE_FIELD: str},
                flavor="lxml")
        except ValueError as err:
            log.error("Error parsing HTML input:")
            log.error(err)
            log.debug(html)
            return False

        if len(temp) != 1:
            # Lazy %-style args so formatting only happens when the message
            # is actually emitted
            log.error("Expected 1 HTML table in the source data but "
                      "found %s instead", len(temp))
            return False
        data = temp[0]
        data.fillna("", inplace=True)

        log.debug("Parsed HTML data table:")
        log.debug(data.to_markdown())

        # Every school needs a unique, non-empty name; a set gives O(1)
        # duplicate detection instead of a linear list scan per row
        school_names = set()
        for cur_school in data[FSDSchool.SCHOOL_FIELD]:
            if cur_school == "":
                log.error("Detected row with no valid school name")
                return False

            if cur_school in school_names:
                log.error("Multiple schools with the same name "
                          "detected: %s", cur_school)
                return False
            school_names.add(cur_school)

        # Every row must also carry a district name
        for cur_district in data[FSDDistrict.DISTRICT_FIELD]:
            if cur_district == "":
                log.error("Detected row with no valid district name")
                return False

        return True

    @property
    def districts(self):
        """list (FSDDistrict): 0 or more districts parsed from the HTML content
        """
        # Hoist the lower-cased name column so it is computed once rather
        # than once per district
        lower_names = self._data[FSDDistrict.DISTRICT_FIELD].str.lower()
        return [FSDDistrict(self._data[lower_names == cur_name])
                for cur_name in lower_names.unique()]

    def get_district(self, name):
        """Gets a specific district from the HTML content
        Args:
            name (str):
                the name of the district to locate
        Returns:
            FSDDistrict:
                Reference to the district details for the named district, or
                None if no district with the given name exists
        """
        for cur_district in self.districts:
            if cur_district.name.lower() == name.lower():
                return cur_district
        return None

    @property
    def district_names(self):
        """list (str): list of names of all districts parsed from the HTML"""
        return [cur_district.name for cur_district in self.districts]

    @property
    def school_names(self):
        """list (str): list of unique names of all schools in all districts"""
        # Uniqueness across districts is guaranteed by validate()
        return [cur_school.name
                for cur_district in self.districts
                for cur_school in cur_district.schools]


if __name__ == "__main__":  # pragma: no cover
    # Ad-hoc smoke test: download the live schedule page, report whether it
    # validates, then dump the parsed table to stdout
    text = requests.get(FSDScraper.SCHEDULE_URL).text
    print(FSDScraper.validate(text))
    obj = FSDScraper(text)
    print(obj)

0 comments on commit f9f2652

Please sign in to comment.