-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from TheFriendlyCoder/poc
initial poc
- Loading branch information
Showing
19 changed files
with
791 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
3.8.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
"""Primitives for reading francophone school district information""" | ||
from .fsdschool import FSDSchool | ||
|
||
|
||
class FSDDistrict:
    """Abstraction around school district data parsed from the district website

    Wraps the rows of the parsed table that belong to one district and
    exposes each row as an FSDSchool.
    """
    DISTRICT_FIELD = "Région"

    def __init__(self, df):
        """
        Args:
            df: Pandas data frame describing the district. Each row is
                handed to FSDSchool as one school, and all rows are expected
                to carry the same district name (ignoring character casing)
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @property
    def name(self):
        """str: Lower-cased name of the district"""
        # NOTE: Some districts have the same name but in different character
        #       casing so we just return a lower-cased representation
        unique_names = self._df[self.DISTRICT_FIELD].str.lower().unique()
        # Internal invariant: a district object must describe exactly one
        # district. The message makes a violation diagnosable.
        assert len(unique_names) == 1, (
            f"Expected exactly 1 district name but found "
            f"{len(unique_names)}")
        return unique_names[0]

    @property
    def schools(self):
        """list (FSDSchool): 1 or more schools associated with this district"""
        # iterrows() yields (index, row) pairs; only the row is needed
        return [FSDSchool(row) for _, row in self._df.iterrows()]

    @property
    def school_names(self):
        """list (str): list of school names associated with this district"""
        return [cur_school.name for cur_school in self.schools]

    def get_school(self, name):
        """Gets a specific school from the district

        The comparison ignores character casing.

        Args:
            name (str):
                name of the school to get data for

        Returns:
            FSDSchool:
                reference to the school with the given name, or None if no
                such school exists
        """
        # Lower-case the search key once instead of on every iteration
        target = name.lower()
        for cur_school in self.schools:
            if cur_school.name.lower() == target:
                return cur_school
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""Abstraction around a francophone school district school""" | ||
|
||
|
||
class FSDSchool:
    """Abstraction around school-specific data parsed from the school
    district's website"""
    SCHOOL_FIELD = "Nom de l'école"
    OPEN_FIELD = "École"
    BUS_FIELD = "Autobus"
    MESSAGE_FIELD = "Messages"

    # Status texts that mean "school is open" / "buses are on time"
    _OPEN_STATUS = "Ouvert"
    _ON_TIME_STATUS = "À l’heure"

    def __init__(self, df):
        """
        Args:
            df: School data parsed from the website. Only key-based lookup
                (``df[column_name]``) and ``to_markdown()`` are used, so a
                single table row (e.g. a Pandas Series as produced by
                ``DataFrame.iterrows()``) is what is expected here
        """
        self._df = df

    def __str__(self):
        return self._df.to_markdown()

    def __repr__(self):
        return str(self)

    @staticmethod
    def _normalize(value):
        """Reduces a status cell to a canonical form for comparison

        Typographic apostrophes are replaced by straight ones, runs of
        whitespace (including non-breaking spaces) are collapsed and
        trimmed, and character casing is folded, so cosmetic differences
        in the published text do not change the reported status.

        Args:
            value: raw cell value; converted with ``str()`` first

        Returns:
            str: canonical form of the value
        """
        text = str(value).replace("\u2019", "'").replace("\u2018", "'")
        return " ".join(text.split()).casefold()

    @property
    def name(self):
        """str: name of the school"""
        return self._df[self.SCHOOL_FIELD]

    @property
    def messages(self):
        """str: status messages associated with the school"""
        return self._df[self.MESSAGE_FIELD]

    @property
    def is_open(self):
        """bool: True if school is open, False if not"""
        status = self._normalize(self._df[FSDSchool.OPEN_FIELD])
        return status == self._normalize(FSDSchool._OPEN_STATUS)

    @property
    def has_late_busses(self):
        """bool: True if 1 or more buses are late, False if all are running
        on time

        Any status other than the "on time" text, including an empty one,
        is reported as late.
        """
        status = self._normalize(self._df[FSDSchool.BUS_FIELD])
        return status != self._normalize(FSDSchool._ON_TIME_STATUS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
"""Scraper for New Brunswick francophone school district website""" | ||
import logging | ||
import requests | ||
import pandas as pd | ||
from .fsdschool import FSDSchool | ||
from .fsddistrict import FSDDistrict | ||
|
||
|
||
class FSDScraper:
    """Interface for parsing HTML data loaded from the New Brunswick
    francophone school district website"""

    SCHEDULE_URL = \
        "https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&vtbl=1"

    def __init__(self, html):
        """
        Args:
            html (str):
                HTML data loaded from the website. Is expected to contain a
                single HTML table containing rows describing each school in
                each school district

        Raises:
            AssertionError:
                if the HTML content does not pass :meth:`validate`
        """
        # Raised explicitly (instead of using an ``assert`` statement) so the
        # check still runs under ``python -O``. The exception type is kept
        # for backwards compatibility with existing callers.
        if not FSDScraper.validate(html):
            raise AssertionError(
                "HTML content failed validation; see log output for details")

        self._data = FSDScraper._parse_tables(html)[0].fillna("")

    def __str__(self):
        return self._data.to_markdown()

    @staticmethod
    def _parse_tables(html):
        """Parses all HTML tables found in the given content

        Shared by the constructor and :meth:`validate` so both always parse
        with identical settings.

        Args:
            html (str):
                HTML data loaded from the district website

        Returns:
            list (DataFrame): one data frame per table found

        Raises:
            ValueError: propagated from Pandas when parsing fails
        """
        # Local import so this class does not depend on a module-level import
        import io

        # Wrap the literal content in a file-like object: recent Pandas
        # releases deprecate passing raw HTML strings, and a plain string
        # may otherwise be interpreted as a path or URL.
        if isinstance(html, bytes):
            buffer = io.BytesIO(html)
        else:
            buffer = io.StringIO(html)
        return pd.read_html(
            buffer, header=0, converters={FSDSchool.MESSAGE_FIELD: str},
            flavor="lxml")

    @staticmethod
    def validate(html):
        """Checks to see if HTML loaded from the website is parseable

        Args:
            html (str):
                HTML data loaded from the district website

        Returns:
            bool:
                True if the HTML content was parseable, False if not. Details
                of any parsing errors are reported to the logger.
        """
        log = logging.getLogger(__name__)
        try:
            tables = FSDScraper._parse_tables(html)
        except ValueError as err:
            log.error("Error parsing HTML input:")
            log.error(err)
            log.debug(html)
            return False

        if len(tables) != 1:
            log.error(
                "Expected 1 HTML table in the source data but "
                "found %s instead", len(tables))
            return False
        data = tables[0].fillna("")

        # Rendering the table is comparatively expensive, so only do it when
        # the output is actually going to be logged
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Parsed HTML data table:")
            log.debug(data.to_markdown())

        # A table without the expected columns is not usable; report it
        # instead of failing with a KeyError below
        for cur_field in (FSDSchool.SCHOOL_FIELD, FSDDistrict.DISTRICT_FIELD):
            if cur_field not in data.columns:
                log.error(
                    "Required column missing from HTML table: %s", cur_field)
                return False

        # A set gives constant-time duplicate detection
        school_names = set()
        for cur_school in data[FSDSchool.SCHOOL_FIELD]:
            if cur_school == "":
                log.error("Detected row with no valid school name")
                return False

            if cur_school in school_names:
                log.error(
                    "Multiple schools with the same name "
                    "detected: %s", cur_school)
                return False
            school_names.add(cur_school)

        for cur_district in data[FSDDistrict.DISTRICT_FIELD]:
            if cur_district == "":
                log.error("Detected row with no valid district name")
                return False

        return True

    @property
    def districts(self):
        """list (FSDDistrict): 0 or more districts parsed from the HTML content
        """
        # District names are grouped case-insensitively; compute the
        # lower-cased column once and reuse it for every district
        lowered = self._data[FSDDistrict.DISTRICT_FIELD].str.lower()
        return [
            FSDDistrict(self._data[lowered == cur_name])
            for cur_name in lowered.unique()
        ]

    def get_district(self, name):
        """Gets a specific district from the HTML content

        The comparison ignores character casing.

        Args:
            name (str):
                the name of the district to locate

        Returns:
            FSDDistrict:
                Reference to the district details for the named district, or
                None if no district with the given name exists
        """
        target = name.lower()
        for cur_district in self.districts:
            if cur_district.name.lower() == target:
                return cur_district
        return None

    @property
    def district_names(self):
        """list (str): list of names of all districts parsed from the HTML"""
        return [cur_district.name for cur_district in self.districts]

    @property
    def school_names(self):
        """list (str): list of unique names of all schools in all districts"""
        return [
            cur_school.name
            for cur_district in self.districts
            for cur_school in cur_district.schools
        ]
|
||
|
||
if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test: download the live schedule page, then validate and
    # print it. The timeout stops the script hanging forever on an
    # unresponsive server, and raise_for_status() reports HTTP errors
    # directly instead of feeding an error page to the parser.
    response = requests.get(FSDScraper.SCHEDULE_URL, timeout=30)
    response.raise_for_status()
    text = response.text
    print(FSDScraper.validate(text))
    obj = FSDScraper(text)
    print(obj)
Oops, something went wrong.