From 237fd49a5c2edf7ec8c93b8bf2e48ad9e17a20f7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Tue, 5 Jun 2018 01:26:05 -0700 Subject: [PATCH 01/19] added format for advanced search. request hits fine but the table will be hard to parse --- scrape_advanced.py | 239 ++++++++++++++++++++++++++++++ data_scraper.py => scrape_term.py | 12 +- server.py | 7 +- settings.py | 13 +- 4 files changed, 256 insertions(+), 15 deletions(-) create mode 100644 scrape_advanced.py rename data_scraper.py => scrape_term.py (89%) diff --git a/scrape_advanced.py b/scrape_advanced.py new file mode 100644 index 0000000..edc0488 --- /dev/null +++ b/scrape_advanced.py @@ -0,0 +1,239 @@ +from collections import defaultdict +from os import makedirs, rename, remove +from os.path import join, exists + +from scrape_term import get_key +from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE + +# 3rd party +import requests +from bs4 import BeautifulSoup +from tinydb import TinyDB + +def main(): + if not exists(DB_DIR): + makedirs(DB_DIR, exist_ok=True) + + for term in PAST_TERM_CODES.values(): + temp_path = join(DB_DIR, 'temp.json') + temp = TinyDB(temp_path) + + content = mine(term, write=True) + advanced_parse(content, db=temp) + + if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + remove(temp_path) + + db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + print(term, db.tables()) + +def mine(term, write=False): + ''' + Mine will hit the database for foothill's class listings + :param term: (str) the term to mine + :param write: (bool) write to file? + :return res.content: (json) the html body + ''' + cookies = { + 'TESTID': 'SET', + 'badprotocol': '3', + 'shib_idp_session': 'ad53da4cf9b9a4c1f354e513c10c60802592d842df45e292e94acb39e4f22f3c', + 'BANSSO': 'F1538D4DA70C773174EDA34DF454DCA1E95FBB970A1B3C78D794E9F6850F167B', + 'fos.web.server': 'lumweb3', + 'fos.secure.web.server': 'lumweb3', + 'runId': '-5887403237976173719', + 'usid': 'nEekhNSMntHK2/GY/bTtOA__', + 'CPSESSID': 'AQARMjAxODA2MDUwMTE4MjUDABA1NlBKWTcxODI5MzAz', + } + + headers = { + 'Connection': 'keep-alive', + 'Pragma': 'no-cache', + 'Cache-Control': 'no-cache', + 'Origin': 'https://banssb.fhda.edu', + 'Upgrade-Insecure-Requests': '1', + 'Content-Type': 'application/x-www-form-urlencoded', + 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3202.29 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', + 'DNT': '1', + } + + data = [ + ('rsts', 'dummy'), + ('crn', 'dummy'), + ('term_in', f'{term}'), + ('sel_subj', 'dummy'), + ('sel_subj', 'ACTG'), + ('sel_subj', 'ALCB'), + ('sel_subj', 'ALTW'), + ('sel_subj', 'AHS'), + ('sel_subj', 'ANTH'), + ('sel_subj', 'APSM'), + ('sel_subj', 'ART'), + ('sel_subj', 'ASTR'), + ('sel_subj', 'ATHL'), + ('sel_subj', 'BIOL'), + ('sel_subj', 'BUSI'), + ('sel_subj', 'CRLP'), + ('sel_subj', 'CHEM'), + ('sel_subj', 'CHLD'), + ('sel_subj', 'COMM'), + ('sel_subj', 'C S'), + ('sel_subj', 'CNSL'), + ('sel_subj', 'CRWR'), + ('sel_subj', 'DANC'), + ('sel_subj', 'D A'), + ('sel_subj', 'D H'), + ('sel_subj', 'DMS'), + ('sel_subj', 'ECON'), + ('sel_subj', 'EMS'), + ('sel_subj', 'ENGR'), + ('sel_subj', 'ENGL'), + ('sel_subj', 'ESLL'), + ('sel_subj', 'HORT'), + ('sel_subj', 'JFS'), + ('sel_subj', 'GEOG'), + ('sel_subj', 'GIST'), + ('sel_subj', 'GID'), 
+ ('sel_subj', 'HLTH'), + ('sel_subj', 'HIST'), + ('sel_subj', 'HUMN'), + ('sel_subj', 'ITRN'), + ('sel_subj', 'JAPN'), + ('sel_subj', 'KINS'), + ('sel_subj', 'L A'), + ('sel_subj', 'LINC'), + ('sel_subj', 'LIBR'), + ('sel_subj', 'MATH'), + ('sel_subj', 'MDIA'), + ('sel_subj', 'MTEC'), + ('sel_subj', 'MUS'), + ('sel_subj', 'NCBS'), + ('sel_subj', 'NCBH'), + ('sel_subj', 'NCEL'), + ('sel_subj', 'NCLA'), + ('sel_subj', 'NCP'), + ('sel_subj', 'NCSV'), + ('sel_subj', 'P A'), + ('sel_subj', 'PHT'), + ('sel_subj', 'PHIL'), + ('sel_subj', 'PHOT'), + ('sel_subj', 'PHED'), + ('sel_subj', 'PHDA'), + ('sel_subj', 'PSE'), + ('sel_subj', 'PHYS'), + ('sel_subj', 'POLI'), + ('sel_subj', 'PCA'), + ('sel_subj', 'PSYC'), + ('sel_subj', 'R T'), + ('sel_subj', 'RSPT'), + ('sel_subj', 'SOSC'), + ('sel_subj', 'SOC'), + ('sel_subj', 'SPAN'), + ('sel_subj', 'THTR'), + ('sel_subj', 'V T'), + ('sel_subj', 'VITI'), + ('sel_subj', 'WMN'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_camp', '%'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_sess', '%'), + ('sel_instr', 'dummy'), + ('sel_instr', '%'), + ('sel_ptrm', 'dummy'), + ('sel_ptrm', '%'), + ('sel_attr', 'dummy'), + ('sel_crse', ''), + ('sel_title', ''), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1'), + ] + + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) + res.raise_for_status() + + if write: + with open(f'{join(DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if chunk: + file.write(chunk) + + return res.content + + +def advanced_parse(content, db): + ''' + Advanced parse takes the content from the request and then populates the database with the data + :param content: (html) The html containing the courses + :param db: (TinyDB) the current database + :return: None + ''' + soup = BeautifulSoup(content, 'html5lib') + + table = soup.find('table', {'class': 'datadisplaytable'}) + table_rows = table.find_all('tr') + for tr in table_rows[:5]: + td = tr.find_all('td', {'class': 'dddefault'}) + print(td) + + +def parse(content, db): + ''' + Parse takes the content from the request and then populates the database with the data + :param content: (html) The html containing the courses + :param db: (TinyDB) the current database + ''' + soup = BeautifulSoup(content, 'html5lib') + + tables = soup.find_all('table', {'class': 'TblCourses'}) + for t in tables: + dept = t['dept'].replace(' ', '') + dept_desc = t['dept-desc'] + + rows = t.find_all('tr', {'class': 'CourseRow'}) + s = defaultdict(lambda: defaultdict(list)) + for r in rows: + cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + + if cols: + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = a.get_text() if a else cols[i].get_text() + + try: + key = get_key(f'{cols[0] if cols[0] else cols[1]}')[0] + data = dict(zip(ADVANCED_HEADERS, cols)) + + crn = data['CRN'] + if s[key][crn]: + comb = set(s[key][crn][0].items()) ^ set(data.items()) + if not comb: + continue + + data['units'] = data['units'].lstrip() + + s[key][crn].append(data) + except KeyError: + continue + + j = dict(s) + db.table(f'{dept}').insert(j) + + +if __name__ == '__main__': + main() diff --git a/data_scraper.py b/scrape_term.py similarity index 89% rename from 
data_scraper.py rename to scrape_term.py index 2420f18..4bcc7ab 100644 --- a/data_scraper.py +++ b/scrape_term.py @@ -8,20 +8,14 @@ from bs4 import BeautifulSoup from tinydb import TinyDB -from settings import DB_DIR +from settings import DB_DIR, CURRENT_TERM_CODES, COURSE_PATTERN, HEADERS, SCHEDULE -SCHEDULE = 'schedule.html' -TERM_CODES = {'fh': '201911', 'da': '201912'} -HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') - -COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' def main(): if not exists(DB_DIR): makedirs(DB_DIR, exist_ok=True) - for term in TERM_CODES.values(): + for term in CURRENT_TERM_CODES.values(): temp_path = join(DB_DIR, 'temp.json') temp = TinyDB(temp_path) @@ -37,7 +31,7 @@ def main(): def mine(term, write=False): ''' - Mine will hit the database for foothill's class listings and write it to a file. + Mine will hit the database for foothill's class listings :param term: (str) the term to mine :param write: (bool) write to file? :return res.content: (json) the html body diff --git a/server.py b/server.py index 517ffff..ffaac31 100644 --- a/server.py +++ b/server.py @@ -5,6 +5,8 @@ import itertools as itr import typing as ty +from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST + # 3rd party from flask import Flask, jsonify, request, render_template from tinydb import TinyDB @@ -21,11 +23,6 @@ def add_cors_headers(response): DB_ROOT = 'db/' -CAMPUS_LIST = {'fh':'201911', 'da':'201912', 'test':'test'} - -COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' -DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" - FH_TYPE_ALIAS = {'standard': None, 'online': 'W', 'hybrid': 'Y'} DA_TYPE_ALIAS = {'standard': None, 'online': 'Z', 'hybrid': 'Y'} diff --git a/settings.py b/settings.py index df60686..d8f3a74 100644 --- a/settings.py +++ b/settings.py @@ -1,7 +1,18 @@ import os ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) -API_DIR = os.path.join(ROOT_DIR, 'owlapi') DB_DIR = os.path.join(ROOT_DIR, 'db') TEST_DIR = os.path.join(ROOT_DIR, 'tests') TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db') + +COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' +DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" + +SCHEDULE = 'schedule.html' +HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', + 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} + +ADVANCED_HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', + 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +PAST_TERM_CODES = {'fh': '201841'} From 2cb8cd28ac0317c15d77e1655eca0a326e3f43f2 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 03:37:44 -0700 Subject: [PATCH 02/19] finished more of the advanced scraper. 
some potential problems may arise with data that is inconsistent from year to year --- Pipfile | 1 + Pipfile.lock | 49 +++++---- scrape_advanced.py | 264 +++++++++++++++++++++++---------------------- scrape_term.py | 2 +- selenium_login.py | 52 +++++++++ settings.py | 5 +- 6 files changed, 222 insertions(+), 151 deletions(-) create mode 100644 selenium_login.py diff --git a/Pipfile b/Pipfile index 4977ced..5122131 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,7 @@ maya = "*" pytest = "*" flask = "*" pylint = "*" +selenium = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index ae649ba..532c0c6 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "aebd83c7173a3d0dcc9013e1c610a33d4ffc785fe275cabb5825e246f33fbb93" + "sha256": "6098e2ee7f95b6c25b6293ccd2279203e4a83cbe3fe4660b3d9e3a39ea93de71" }, "host-environment-markers": { "implementation_name": "cpython", @@ -242,10 +242,10 @@ }, "pytest": { "hashes": [ - "sha256:c76e93f3145a44812955e8d46cdd302d8a45fbfc7bf22be24fe231f9d8d8853a", - "sha256:39555d023af3200d004d09e51b4dd9fdd828baa863cded3fd6ba2f29f757ae2d" + "sha256:26838b2bc58620e01675485491504c3aa7ee0faf335c37fcd5f8731ca4319591", + "sha256:32c49a69566aa7c333188149ad48b58ac11a426d5352ea3d8f6ce843f88199cb" ], - "version": "==3.6.0" + "version": "==3.6.1" }, "python-dateutil": { "hashes": [ @@ -270,23 +270,23 @@ }, "regex": { "hashes": [ - "sha256:333687d9a44738c486735955993f83bd22061a416c48f5a5f9e765e90cf1b0c9", - "sha256:361a1fd703a35580a4714ec28d85e29780081a4c399a99bbfb2aee695d72aedb", - "sha256:f69d1201a4750f763971ea8364ed95ee888fc128968b39d38883a72a4d005895", - "sha256:a50532f61b23d4ab9d216a6214f359dd05c911c1a1ad20986b6738a782926c1a", - "sha256:1b428a296531ea1642a7da48562746309c5c06471a97bd0c02dd6a82e9cecee8", - "sha256:5b9c0ddd5b4afa08c9074170a2ea9b34ea296e32aeea522faaaaeeeb2fe0af2e", - "sha256:27d72bb42dffb32516c28d218bb054ce128afd3e18464f30837166346758af67", - "sha256:32cf4743debee9ea12d3626ee21eae83052763740e04086304e7a74778bf58c9", - "sha256:35eeccf17af3b017a54d754e160af597036435c58eceae60f1dd1364ae1250c7", - "sha256:be42a601aaaeb7a317f818490a39d153952a97c40c6e9beeb2a1103616405348", - "sha256:eee4d94b1a626490fc8170ffd788883f8c641b576e11ba9b4a29c9f6623371e0", - "sha256:32f6408dbca35040bc65f9f4ae1444d5546411fde989cb71443a182dd643305e", - "sha256:a9243d7b359b72c681a2c32eaa7ace8d346b7e8ce09d172a683acf6853161d9c", - "sha256:494bed6396a20d3aa6376bdf2d3fbb1005b8f4339558d8ac7b53256755f80303", - "sha256:b44624a38d07d3c954c84ad302c29f7930f4bf01443beef5589e9157b14e2a29" - ], - "version": "==2018.2.21" + "sha256:60ff6be94b168ee7f6f5a8b334503f3d3eda21b2aa9cf3909736bc600ed9455d", + "sha256:92ef64d4afe7e709b57b1ca38a41ef0df54f03b4418f1c0e1b2edb52f671eec8", + "sha256:9ee46c7cb5306c96ae9dad07f608f5a474f47c9505fe046d32df6bcb5e6c18ba", + "sha256:3cc8106f31467d9b7a7ea6d0db95cb7a4097e3683e686c89cc14d3a81f66e637", + "sha256:8ad161a52107e6e4bd56f1ee299b1dc492873b8abbfcf639fea4765d96853e32", + "sha256:3326619c3716dbbfe5b2a3e4a109b0bbb6476a35398612539788b15663e0f0d3", + "sha256:f87f51647eeff0f7a1e787b2a8b56d059cfa3ea28f2d825b50a66a172574c6f0", + "sha256:712922a779b153290e3007f4bbdb0af459c36c70f00c6690acd0a86f2f3f52b0", + "sha256:7638a3babd94f947e639c45c0b13cee62caea31ad6fedce392bd3edacf412c5f", + "sha256:3b95120ffcbeb44eb3362456ec887c72190726a2a3e270f1c7343266941826d4", + "sha256:45fa86b2e6bf8b1f5b60820d2d9520d42f32497d4bf8903ed0b86285b29d3aa9", + "sha256:bfb99e3bdf1ff372c8876f217b00fe44dd08f3f53ab590df6fa93b3b72d9dfb6", + 
"sha256:99665c4ca23f9b09618b38afd3c11d0dd6424d0e2d4374afd4c3fc319236552b", + "sha256:ab174253361da55a8425f60bbe319fb32083b295507bace5513834bc3723fcd1", + "sha256:afeb71482e4f7c18ad94802c6c8fbabf2585d3804ca45a8c9db046c120a44a51" + ], + "version": "==2018.6.6" }, "requests": { "hashes": [ @@ -295,6 +295,13 @@ ], "version": "==2.18.4" }, + "selenium": { + "hashes": [ + "sha256:1372101ad23798462038481f92ba1c7fab8385c788b05da6b44318f10ea52422", + "sha256:b8a2630fd858636c894960726ca3c94d8277e516ea3a9d81614fb819a5844764" + ], + "version": "==3.12.0" + }, "six": { "hashes": [ "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", diff --git a/scrape_advanced.py b/scrape_advanced.py index edc0488..4b4920b 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,7 +1,8 @@ -from collections import defaultdict from os import makedirs, rename, remove from os.path import join, exists +from collections import defaultdict +from selenium_login import scrape_cookies, kill_driver from scrape_term import get_key from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE @@ -18,8 +19,15 @@ def main(): temp_path = join(DB_DIR, 'temp.json') temp = TinyDB(temp_path) - content = mine(term, write=True) - advanced_parse(content, db=temp) + try: + cookies = scrape_cookies() + content = mine(term, cookies) + except KeyboardInterrupt: + kill_driver() + return + finally: + kill_driver() + advanced_parse(content, db=temp) if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): remove(temp_path) @@ -27,24 +35,14 @@ def main(): db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) -def mine(term, write=False): +def mine(term, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine + :param cookies: (dict) cookies to send with POST :param write: (bool) write to file? 
:return res.content: (json) the html body ''' - cookies = { - 'TESTID': 'SET', - 'badprotocol': '3', - 'shib_idp_session': 'ad53da4cf9b9a4c1f354e513c10c60802592d842df45e292e94acb39e4f22f3c', - 'BANSSO': 'F1538D4DA70C773174EDA34DF454DCA1E95FBB970A1B3C78D794E9F6850F167B', - 'fos.web.server': 'lumweb3', - 'fos.secure.web.server': 'lumweb3', - 'runId': '-5887403237976173719', - 'usid': 'nEekhNSMntHK2/GY/bTtOA__', - 'CPSESSID': 'AQARMjAxODA2MDUwMTE4MjUDABA1NlBKWTcxODI5MzAz', - } headers = { 'Connection': 'keep-alive', @@ -53,90 +51,123 @@ def mine(term, write=False): 'Origin': 'https://banssb.fhda.edu', 'Upgrade-Insecure-Requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3202.29 Safari/537.36', + 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse', + 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', 'DNT': '1', } + # data = [ + # ('rsts', 'dummy'), + # ('crn', 'dummy'), + # ('term_in', f'{term}'), + # ('sel_subj', 'dummy'), + # ('sel_subj', 'ACTG'), + # ('sel_subj', 'ALCB'), + # ('sel_subj', 'ALTW'), + # ('sel_subj', 'AHS'), + # ('sel_subj', 'ANTH'), + # ('sel_subj', 'APSM'), + # ('sel_subj', 'ART'), + # ('sel_subj', 'ASTR'), + # ('sel_subj', 'ATHL'), + # ('sel_subj', 'BIOL'), + # ('sel_subj', 'BUSI'), + # ('sel_subj', 'CRLP'), + # ('sel_subj', 'CHEM'), + # ('sel_subj', 'CHLD'), + # ('sel_subj', 'COMM'), + # ('sel_subj', 'C S'), + # ('sel_subj', 'CNSL'), + # ('sel_subj', 'CRWR'), + # ('sel_subj', 'DANC'), + # ('sel_subj', 'D A'), + # ('sel_subj', 'D H'), + # ('sel_subj', 'DMS'), + # ('sel_subj', 'ECON'), + # ('sel_subj', 'EMS'), + # ('sel_subj', 'ENGR'), + # ('sel_subj', 'ENGL'), + # ('sel_subj', 'ESLL'), + # ('sel_subj', 'HORT'), + # ('sel_subj', 'JFS'), + # ('sel_subj', 'GEOG'), + # ('sel_subj', 'GIST'), + # ('sel_subj', 'GID'), + # ('sel_subj', 'HLTH'), + # ('sel_subj', 'HIST'), + # ('sel_subj', 'HUMN'), + # ('sel_subj', 'ITRN'), + # ('sel_subj', 'JAPN'), + # ('sel_subj', 'KINS'), + # ('sel_subj', 'L A'), + # ('sel_subj', 'LINC'), + # ('sel_subj', 'LIBR'), + # ('sel_subj', 'MATH'), + # ('sel_subj', 'MDIA'), + # ('sel_subj', 'MTEC'), + # ('sel_subj', 'MUS'), + # ('sel_subj', 'NCBS'), + # ('sel_subj', 'NCBH'), + # ('sel_subj', 'NCEL'), + # ('sel_subj', 'NCLA'), + # ('sel_subj', 'NCP'), + # ('sel_subj', 'NCSV'), + # ('sel_subj', 'P A'), + # ('sel_subj', 'PHT'), + # ('sel_subj', 'PHIL'), + # ('sel_subj', 'PHOT'), + # ('sel_subj', 'PHED'), + # ('sel_subj', 'PHDA'), + # ('sel_subj', 'PSE'), + # ('sel_subj', 'PHYS'), + # ('sel_subj', 'POLI'), + # ('sel_subj', 'PCA'), + # ('sel_subj', 'PSYC'), + # ('sel_subj', 'R T'), + # ('sel_subj', 'RSPT'), + # ('sel_subj', 'SOSC'), + # ('sel_subj', 'SOC'), + # ('sel_subj', 'SPAN'), + # ('sel_subj', 'THTR'), + # ('sel_subj', 'V T'), + # ('sel_subj', 'VITI'), + # ('sel_subj', 'WMN'), + # ('sel_day', 'dummy'), + # ('sel_schd', 'dummy'), + # ('sel_insm', 'dummy'), + # ('sel_camp', 'dummy'), + # ('sel_camp', '%'), + # ('sel_levl', 'dummy'), + # ('sel_sess', 'dummy'), + # ('sel_sess', '%'), + # ('sel_instr', 'dummy'), + # ('sel_instr', '%'), + # ('sel_ptrm', 'dummy'), + # ('sel_ptrm', '%'), + # ('sel_attr', 'dummy'), + # ('sel_crse', ''), + # ('sel_title', ''), + # ('sel_from_cred', ''), + 
# ('sel_to_cred', ''), + # ('begin_hh', '0'), + # ('begin_mi', '0'), + # ('begin_ap', 'a'), + # ('end_hh', '0'), + # ('end_mi', '0'), + # ('end_ap', 'a'), + # ('SUB_BTN', 'Section Search'), + # ('path', '1'), + # ] + data = [ ('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy'), ('sel_subj', 'ACTG'), - ('sel_subj', 'ALCB'), - ('sel_subj', 'ALTW'), - ('sel_subj', 'AHS'), - ('sel_subj', 'ANTH'), - ('sel_subj', 'APSM'), - ('sel_subj', 'ART'), - ('sel_subj', 'ASTR'), - ('sel_subj', 'ATHL'), - ('sel_subj', 'BIOL'), - ('sel_subj', 'BUSI'), - ('sel_subj', 'CRLP'), - ('sel_subj', 'CHEM'), - ('sel_subj', 'CHLD'), - ('sel_subj', 'COMM'), - ('sel_subj', 'C S'), - ('sel_subj', 'CNSL'), - ('sel_subj', 'CRWR'), - ('sel_subj', 'DANC'), - ('sel_subj', 'D A'), - ('sel_subj', 'D H'), - ('sel_subj', 'DMS'), - ('sel_subj', 'ECON'), - ('sel_subj', 'EMS'), - ('sel_subj', 'ENGR'), - ('sel_subj', 'ENGL'), - ('sel_subj', 'ESLL'), - ('sel_subj', 'HORT'), - ('sel_subj', 'JFS'), - ('sel_subj', 'GEOG'), - ('sel_subj', 'GIST'), - ('sel_subj', 'GID'), - ('sel_subj', 'HLTH'), - ('sel_subj', 'HIST'), - ('sel_subj', 'HUMN'), - ('sel_subj', 'ITRN'), - ('sel_subj', 'JAPN'), - ('sel_subj', 'KINS'), - ('sel_subj', 'L A'), - ('sel_subj', 'LINC'), - ('sel_subj', 'LIBR'), - ('sel_subj', 'MATH'), - ('sel_subj', 'MDIA'), - ('sel_subj', 'MTEC'), - ('sel_subj', 'MUS'), - ('sel_subj', 'NCBS'), - ('sel_subj', 'NCBH'), - ('sel_subj', 'NCEL'), - ('sel_subj', 'NCLA'), - ('sel_subj', 'NCP'), - ('sel_subj', 'NCSV'), - ('sel_subj', 'P A'), - ('sel_subj', 'PHT'), - ('sel_subj', 'PHIL'), - ('sel_subj', 'PHOT'), - ('sel_subj', 'PHED'), - ('sel_subj', 'PHDA'), - ('sel_subj', 'PSE'), - ('sel_subj', 'PHYS'), - ('sel_subj', 'POLI'), - ('sel_subj', 'PCA'), - ('sel_subj', 'PSYC'), - ('sel_subj', 'R T'), - ('sel_subj', 'RSPT'), - ('sel_subj', 'SOSC'), - ('sel_subj', 'SOC'), - ('sel_subj', 'SPAN'), - ('sel_subj', 'THTR'), - ('sel_subj', 'V T'), - ('sel_subj', 'VITI'), - ('sel_subj', 'WMN'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), @@ -164,7 +195,8 @@ def mine(term, write=False): ('path', '1'), ] - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', + headers=headers, cookies=cookies, data=data) res.raise_for_status() if write: @@ -187,52 +219,30 @@ def advanced_parse(content, db): table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') - for tr in table_rows[:5]: - td = tr.find_all('td', {'class': 'dddefault'}) - print(td) - - -def parse(content, db): - ''' - Parse takes the content from the request and then populates the database with the data - :param content: (html) The html containing the courses - :param db: (TinyDB) the current database - ''' - soup = BeautifulSoup(content, 'html5lib') - - tables = soup.find_all('table', {'class': 'TblCourses'}) - for t in tables: - dept = t['dept'].replace(' ', '') - dept_desc = t['dept-desc'] - rows = t.find_all('tr', {'class': 'CourseRow'}) - s = defaultdict(lambda: defaultdict(list)) - for r in rows: - cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + for tr in table_rows: + cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) - if cols: - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = a.get_text() if a else cols[i].get_text() + if cols and len(cols) >= len(ADVANCED_HEADERS): + s = 
defaultdict(lambda: defaultdict(list)) - try: - key = get_key(f'{cols[0] if cols[0] else cols[1]}')[0] - data = dict(zip(ADVANCED_HEADERS, cols)) + for i, c in enumerate(cols[1:]): + a = c.find('a') + cols[i] = a.get_text() if a else cols[i].get_text() - crn = data['CRN'] - if s[key][crn]: - comb = set(s[key][crn][0].items()) ^ set(data.items()) - if not comb: - continue + try: + data = dict(zip(ADVANCED_HEADERS, cols)) - data['units'] = data['units'].lstrip() + key = data['course'] + crn = data['CRN'] + data['units'] = data['units'].lstrip() - s[key][crn].append(data) - except KeyError: - continue + s[key][crn].append(data) + except KeyError: + continue - j = dict(s) - db.table(f'{dept}').insert(j) + j = dict(s) + db.table(f'{key}').insert(j) if __name__ == '__main__': diff --git a/scrape_term.py b/scrape_term.py index 4bcc7ab..bb2bef7 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -1,7 +1,7 @@ -from collections import defaultdict from os import makedirs, rename, remove from os.path import join, exists from re import match +from collections import defaultdict # 3rd party import requests diff --git a/selenium_login.py b/selenium_login.py new file mode 100644 index 0000000..8ab581d --- /dev/null +++ b/selenium_login.py @@ -0,0 +1,52 @@ +import os + +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from selenium.webdriver.chrome.options import Options + +chrome_options = Options() +chrome_options.add_argument("--window-size=300,400") + +driver = webdriver.Chrome(chrome_options=chrome_options) +driver.get("https://myportal.fhda.edu/cp/home/displaylogin") + + +def scrape_cookies(): + driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'") + driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'") + + try: + driver.execute_script("doLogin()") + WebDriverWait(driver, 10).until( + EC.title_is("MyPortal / Foothill-De Anza College District") + ) + + driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys% \ + 3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") + driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") + + WebDriverWait(driver, 10).until( + EC.title_is("MyPortal / Foothill-De Anza College District") + ) + finally: + cookies_list = driver.get_cookies() + return get_cookies(cookies_list) + + +def get_cookies(cookies_list): + cookies_dict = {} + for cookie in cookies_list: + cookies_dict[cookie['name']] = cookie['value'] + print(cookies_dict) + return cookies_dict + + +def kill_driver(): + driver.quit() + + +if __name__ == '__main__': + scrape_cookies() diff --git a/settings.py b/settings.py index d8f3a74..d45a26f 100644 --- a/settings.py +++ b/settings.py @@ -13,6 +13,7 @@ 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', + 'seats_cap', 'seats_act', 'seats', 
'wait_cap', 'wait_act', 'wait_seats', 'instructor', + 'date_range', 'location') PAST_TERM_CODES = {'fh': '201841'} From 564a09faf8cdf4bdd9fb8ec1dc70a4ed83d9f96a Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 11:59:56 -0700 Subject: [PATCH 03/19] scraper working for each quarter's data --- scrape_advanced.py | 259 +++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 128 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 4b4920b..2a3608c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,5 +1,6 @@ from os import makedirs, rename, remove from os.path import join, exists +from re import compile from collections import defaultdict from selenium_login import scrape_cookies, kill_driver @@ -15,25 +16,24 @@ def main(): if not exists(DB_DIR): makedirs(DB_DIR, exist_ok=True) - for term in PAST_TERM_CODES.values(): - temp_path = join(DB_DIR, 'temp.json') - temp = TinyDB(temp_path) + cookies = scrape_cookies() + try: + for term in PAST_TERM_CODES.values(): + temp_path = join(DB_DIR, 'temp.json') + temp = TinyDB(temp_path) - try: - cookies = scrape_cookies() content = mine(term, cookies) - except KeyboardInterrupt: - kill_driver() - return - finally: - kill_driver() advanced_parse(content, db=temp) - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): - remove(temp_path) + if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) - print(term, db.tables()) + db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + print(term, db.tables()) + except KeyboardInterrupt: + kill_driver() + finally: + kill_driver() def mine(term, cookies, write=False): ''' @@ -59,115 +59,82 @@ def mine(term, cookies, write=False): 'DNT': '1', } - # data = [ - # ('rsts', 'dummy'), - # ('crn', 'dummy'), - # ('term_in', f'{term}'), - # ('sel_subj', 'dummy'), - # ('sel_subj', 'ACTG'), - # ('sel_subj', 'ALCB'), - # ('sel_subj', 'ALTW'), - # ('sel_subj', 'AHS'), - # ('sel_subj', 'ANTH'), - # ('sel_subj', 'APSM'), - # ('sel_subj', 'ART'), - # ('sel_subj', 'ASTR'), - # ('sel_subj', 'ATHL'), - # ('sel_subj', 'BIOL'), - # ('sel_subj', 'BUSI'), - # ('sel_subj', 'CRLP'), - # ('sel_subj', 'CHEM'), - # ('sel_subj', 'CHLD'), - # ('sel_subj', 'COMM'), - # ('sel_subj', 'C S'), - # ('sel_subj', 'CNSL'), - # ('sel_subj', 'CRWR'), - # ('sel_subj', 'DANC'), - # ('sel_subj', 'D A'), - # ('sel_subj', 'D H'), - # ('sel_subj', 'DMS'), - # ('sel_subj', 'ECON'), - # ('sel_subj', 'EMS'), - # ('sel_subj', 'ENGR'), - # ('sel_subj', 'ENGL'), - # ('sel_subj', 'ESLL'), - # ('sel_subj', 'HORT'), - # ('sel_subj', 'JFS'), - # ('sel_subj', 'GEOG'), - # ('sel_subj', 'GIST'), - # ('sel_subj', 'GID'), - # ('sel_subj', 'HLTH'), - # ('sel_subj', 'HIST'), - # ('sel_subj', 'HUMN'), - # ('sel_subj', 'ITRN'), - # ('sel_subj', 'JAPN'), - # ('sel_subj', 'KINS'), - # ('sel_subj', 'L A'), - # ('sel_subj', 'LINC'), - # ('sel_subj', 'LIBR'), - # ('sel_subj', 'MATH'), - # ('sel_subj', 'MDIA'), - # ('sel_subj', 'MTEC'), - # ('sel_subj', 'MUS'), - # ('sel_subj', 'NCBS'), - # ('sel_subj', 'NCBH'), - # ('sel_subj', 'NCEL'), - # ('sel_subj', 'NCLA'), - # ('sel_subj', 'NCP'), - # ('sel_subj', 'NCSV'), - # ('sel_subj', 'P A'), - # ('sel_subj', 'PHT'), - # ('sel_subj', 'PHIL'), - # ('sel_subj', 'PHOT'), - # ('sel_subj', 'PHED'), - # ('sel_subj', 'PHDA'), - # ('sel_subj', 'PSE'), - # ('sel_subj', 'PHYS'), - # ('sel_subj', 'POLI'), - # ('sel_subj', 'PCA'), - # ('sel_subj', 'PSYC'), - # ('sel_subj', 'R T'), - # 
('sel_subj', 'RSPT'), - # ('sel_subj', 'SOSC'), - # ('sel_subj', 'SOC'), - # ('sel_subj', 'SPAN'), - # ('sel_subj', 'THTR'), - # ('sel_subj', 'V T'), - # ('sel_subj', 'VITI'), - # ('sel_subj', 'WMN'), - # ('sel_day', 'dummy'), - # ('sel_schd', 'dummy'), - # ('sel_insm', 'dummy'), - # ('sel_camp', 'dummy'), - # ('sel_camp', '%'), - # ('sel_levl', 'dummy'), - # ('sel_sess', 'dummy'), - # ('sel_sess', '%'), - # ('sel_instr', 'dummy'), - # ('sel_instr', '%'), - # ('sel_ptrm', 'dummy'), - # ('sel_ptrm', '%'), - # ('sel_attr', 'dummy'), - # ('sel_crse', ''), - # ('sel_title', ''), - # ('sel_from_cred', ''), - # ('sel_to_cred', ''), - # ('begin_hh', '0'), - # ('begin_mi', '0'), - # ('begin_ap', 'a'), - # ('end_hh', '0'), - # ('end_mi', '0'), - # ('end_ap', 'a'), - # ('SUB_BTN', 'Section Search'), - # ('path', '1'), - # ] - data = [ ('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy'), ('sel_subj', 'ACTG'), + ('sel_subj', 'ALCB'), + ('sel_subj', 'ALTW'), + ('sel_subj', 'AHS'), + ('sel_subj', 'ANTH'), + ('sel_subj', 'APSM'), + ('sel_subj', 'ART'), + ('sel_subj', 'ASTR'), + ('sel_subj', 'ATHL'), + ('sel_subj', 'BIOL'), + ('sel_subj', 'BUSI'), + ('sel_subj', 'CRLP'), + ('sel_subj', 'CHEM'), + ('sel_subj', 'CHLD'), + ('sel_subj', 'COMM'), + ('sel_subj', 'C S'), + ('sel_subj', 'CNSL'), + ('sel_subj', 'CRWR'), + ('sel_subj', 'DANC'), + ('sel_subj', 'D A'), + ('sel_subj', 'D H'), + ('sel_subj', 'DMS'), + ('sel_subj', 'ECON'), + ('sel_subj', 'EMS'), + ('sel_subj', 'ENGR'), + ('sel_subj', 'ENGL'), + ('sel_subj', 'ESLL'), + ('sel_subj', 'HORT'), + ('sel_subj', 'JFS'), + ('sel_subj', 'GEOG'), + ('sel_subj', 'GIST'), + ('sel_subj', 'GID'), + ('sel_subj', 'HLTH'), + ('sel_subj', 'HIST'), + ('sel_subj', 'HUMN'), + ('sel_subj', 'ITRN'), + ('sel_subj', 'JAPN'), + ('sel_subj', 'KINS'), + ('sel_subj', 'L A'), + ('sel_subj', 'LINC'), + ('sel_subj', 'LIBR'), + ('sel_subj', 'MATH'), + ('sel_subj', 'MDIA'), + ('sel_subj', 'MTEC'), + ('sel_subj', 'MUS'), + ('sel_subj', 'NCBS'), + ('sel_subj', 'NCBH'), + ('sel_subj', 'NCEL'), + ('sel_subj', 'NCLA'), + ('sel_subj', 'NCP'), + ('sel_subj', 'NCSV'), + ('sel_subj', 'P A'), + ('sel_subj', 'PHT'), + ('sel_subj', 'PHIL'), + ('sel_subj', 'PHOT'), + ('sel_subj', 'PHED'), + ('sel_subj', 'PHDA'), + ('sel_subj', 'PSE'), + ('sel_subj', 'PHYS'), + ('sel_subj', 'POLI'), + ('sel_subj', 'PCA'), + ('sel_subj', 'PSYC'), + ('sel_subj', 'R T'), + ('sel_subj', 'RSPT'), + ('sel_subj', 'SOSC'), + ('sel_subj', 'SOC'), + ('sel_subj', 'SPAN'), + ('sel_subj', 'THTR'), + ('sel_subj', 'V T'), + ('sel_subj', 'VITI'), + ('sel_subj', 'WMN'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), @@ -195,6 +162,39 @@ def mine(term, cookies, write=False): ('path', '1'), ] + # data = [ + # ('rsts', 'dummy'), + # ('crn', 'dummy'), + # ('term_in', f'{term}'), + # ('sel_subj', 'dummy'), + # ('sel_subj', 'ACTG'), + # ('sel_day', 'dummy'), + # ('sel_schd', 'dummy'), + # ('sel_insm', 'dummy'), + # ('sel_camp', 'dummy'), + # ('sel_camp', '%'), + # ('sel_levl', 'dummy'), + # ('sel_sess', 'dummy'), + # ('sel_sess', '%'), + # ('sel_instr', 'dummy'), + # ('sel_instr', '%'), + # ('sel_ptrm', 'dummy'), + # ('sel_ptrm', '%'), + # ('sel_attr', 'dummy'), + # ('sel_crse', ''), + # ('sel_title', ''), + # ('sel_from_cred', ''), + # ('sel_to_cred', ''), + # ('begin_hh', '0'), + # ('begin_mi', '0'), + # ('begin_ap', 'a'), + # ('end_hh', '0'), + # ('end_mi', '0'), + # ('end_ap', 'a'), + # ('SUB_BTN', 'Section Search'), + # ('path', '1'), + # ] + res = 
requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) res.raise_for_status() @@ -226,23 +226,26 @@ def advanced_parse(content, db): if cols and len(cols) >= len(ADVANCED_HEADERS): s = defaultdict(lambda: defaultdict(list)) - for i, c in enumerate(cols[1:]): + for i, c in enumerate(cols): a = c.find('a') - cols[i] = a.get_text() if a else cols[i].get_text() + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - try: - data = dict(zip(ADVANCED_HEADERS, cols)) + data = dict(zip(ADVANCED_HEADERS, cols)) - key = data['course'] - crn = data['CRN'] - data['units'] = data['units'].lstrip() + subject = data['subject'] + key = data['course'] + crn = data['CRN'] - s[key][crn].append(data) - except KeyError: - continue + s[key][crn].append(data) j = dict(s) - db.table(f'{key}').insert(j) + db.table(f'{subject}').insert(j) + + +def get_parsed_text(tag): + text = tag.get_text() + p = compile(r'<.*?>') + return p.sub('', text) if __name__ == '__main__': From b8e6ddf084f63901f1001762aea1b92c44cebc4d Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 13:39:07 -0700 Subject: [PATCH 04/19] scraping working for one quarter. hybrid classes handles --- scrape_advanced.py | 112 +++++++++++++++++++++------------------------ scrape_term.py | 4 +- settings.py | 3 +- 3 files changed, 56 insertions(+), 63 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 2a3608c..749a80e 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -5,36 +5,38 @@ from selenium_login import scrape_cookies, kill_driver from scrape_term import get_key -from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE +from settings import OLD_DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB + def main(): - if not exists(DB_DIR): - makedirs(DB_DIR, exist_ok=True) + if not exists(OLD_DB_DIR): + makedirs(OLD_DB_DIR, exist_ok=True) cookies = scrape_cookies() try: for term in PAST_TERM_CODES.values(): - temp_path = join(DB_DIR, 'temp.json') + temp_path = join(OLD_DB_DIR, 'temp.json') temp = TinyDB(temp_path) - content = mine(term, cookies) + content = mine(term, cookies, write=False) advanced_parse(content, db=temp) - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) except KeyboardInterrupt: kill_driver() finally: kill_driver() + def mine(term, cookies, write=False): ''' Mine will hit the database for foothill's class listings @@ -162,39 +164,6 @@ def mine(term, cookies, write=False): ('path', '1'), ] - # data = [ - # ('rsts', 'dummy'), - # ('crn', 'dummy'), - # ('term_in', f'{term}'), - # ('sel_subj', 'dummy'), - # ('sel_subj', 'ACTG'), - # ('sel_day', 'dummy'), - # ('sel_schd', 'dummy'), - # ('sel_insm', 'dummy'), - # ('sel_camp', 'dummy'), - # ('sel_camp', '%'), - # ('sel_levl', 'dummy'), - # ('sel_sess', 'dummy'), - # ('sel_sess', '%'), - # ('sel_instr', 'dummy'), - # ('sel_instr', '%'), - # ('sel_ptrm', 'dummy'), - # ('sel_ptrm', '%'), - # ('sel_attr', 'dummy'), - # ('sel_crse', ''), - # ('sel_title', ''), - # ('sel_from_cred', ''), - # ('sel_to_cred', ''), - # ('begin_hh', '0'), - # ('begin_mi', '0'), - # ('begin_ap', 'a'), - # ('end_hh', '0'), - # ('end_mi', '0'), - # ('end_ap', 'a'), - # 
('SUB_BTN', 'Section Search'), - # ('path', '1'), - # ] - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) res.raise_for_status() @@ -208,6 +177,10 @@ def mine(term, cookies, write=False): return res.content +class BlankRow(Exception): + pass + + def advanced_parse(content, db): ''' Advanced parse takes the content from the request and then populates the database with the data @@ -220,26 +193,45 @@ def advanced_parse(content, db): table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') - for tr in table_rows: - cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) - - if cols and len(cols) >= len(ADVANCED_HEADERS): - s = defaultdict(lambda: defaultdict(list)) - - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - - data = dict(zip(ADVANCED_HEADERS, cols)) - - subject = data['subject'] - key = data['course'] - crn = data['CRN'] - - s[key][crn].append(data) - - j = dict(s) - db.table(f'{subject}').insert(j) + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + try: + cols = tr.find_all('td', {'class': 'dddefault'}) + + if len(cols) > 0: + s = defaultdict(lambda: defaultdict(list)) + + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 + + if num_blank > len(cols) - num_blank: + raise BlankRow + + data = dict(zip(table_headers, cols)) + + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] + + s[key][crn].append(data) + + j = dict(s) + db.table(f'{subject}').insert(j) + except BlankRow: + continue def get_parsed_text(tag): diff --git a/scrape_term.py b/scrape_term.py index bb2bef7..6fab54e 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -78,8 +78,8 @@ def parse(content, db): rows = t.find_all('tr', {'class': 'CourseRow'}) s = defaultdict(lambda: defaultdict(list)) - for r in rows: - cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + for tr in rows: + cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) if cols: for i, c in enumerate(cols): diff --git a/settings.py b/settings.py index d45a26f..fbc3ba2 100644 --- a/settings.py +++ b/settings.py @@ -2,6 +2,7 @@ ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) DB_DIR = os.path.join(ROOT_DIR, 'db') +OLD_DB_DIR = os.path.join(DB_DIR, 'old') TEST_DIR = os.path.join(ROOT_DIR, 'tests') TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db') @@ -16,4 +17,4 @@ ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', 'seats_cap', 'seats_act', 'seats', 'wait_cap', 'wait_act', 'wait_seats', 'instructor', 'date_range', 'location') -PAST_TERM_CODES = {'fh': '201841'} +PAST_TERM_CODES = {'fh': '201831'} From e442b8fbaaa76ed0b82e1a7ff7028b088e09af00 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 22:20:31 -0700 Subject: [PATCH 05/19] refined scraper --- scrape_advanced.py | 121 +++++++++++++++++++++++++++------------------ selenium_login.py | 1 - 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 749a80e..e0dc6eb 
100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -4,35 +4,44 @@ from collections import defaultdict from selenium_login import scrape_cookies, kill_driver -from scrape_term import get_key -from settings import OLD_DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE +from settings import OLD_DB_DIR, SCHEDULE # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB +CAMPUS_RANGE = (1, 2) +YEAR_RANGE = (8, 0) +QUARTER_RANGE = (4, 1) def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) + codes = generate_term_codes() + print(codes) + cookies = scrape_cookies() + print(cookies) + + temp_path = join(OLD_DB_DIR, 'temp.json') + try: - for term in PAST_TERM_CODES.values(): - temp_path = join(OLD_DB_DIR, 'temp.json') + for term in codes: temp = TinyDB(temp_path) content = mine(term, cookies, write=False) - advanced_parse(content, db=temp) + if not advanced_parse(content,db=temp): + continue - if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): - remove(temp_path) + rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) except KeyboardInterrupt: kill_driver() + remove(temp_path) finally: kill_driver() @@ -190,48 +199,62 @@ def advanced_parse(content, db): ''' soup = BeautifulSoup(content, 'html5lib') - table = soup.find('table', {'class': 'datadisplaytable'}) - table_rows = table.find_all('tr') - - table_headers = list() - start_idx = 0 - for i, tr in enumerate(table_rows): - header_cols = tr.find_all('th', {'class': 'ddheader'}) - for th in header_cols: - table_headers.append(get_parsed_text(th)) - if table_headers: - start_idx = i - break - - for tr in table_rows[start_idx:]: - try: - cols = tr.find_all('td', {'class': 'dddefault'}) - - if len(cols) > 0: - s = defaultdict(lambda: defaultdict(list)) - - num_blank = 0 - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - if cols[i].isspace(): - num_blank += 1 - - if num_blank > len(cols) - num_blank: - raise BlankRow - - data = dict(zip(table_headers, cols)) - - subject = data['Subj'] - key = data['Crse'] - crn = data['CRN'] - - s[key][crn].append(data) - - j = dict(s) - db.table(f'{subject}').insert(j) - except BlankRow: - continue + try: + table = soup.find('table', {'class': 'datadisplaytable'}) + table_rows = table.find_all('tr') + + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + try: + cols = tr.find_all('td', {'class': 'dddefault'}) + + if len(cols) > 0: + s = defaultdict(lambda: defaultdict(list)) + + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 + + if num_blank > len(cols) - num_blank: + raise BlankRow + + data = dict(zip(table_headers, cols)) + + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] + + s[key][crn].append(data) + + j = dict(s) + db.table(f'{subject}').insert(j) + except BlankRow: + continue + except AttributeError as e: + print(e) + return False + return True + + +def generate_term_codes(): + codes = [] + for i in range(YEAR_RANGE[0], YEAR_RANGE[1], -1): + for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1], -1): + for k in 
range(CAMPUS_RANGE[0], CAMPUS_RANGE[1], 1): + codes.append(f'201{i}{j}{k}') + return codes def get_parsed_text(tag): diff --git a/selenium_login.py b/selenium_login.py index 8ab581d..3c09c02 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -40,7 +40,6 @@ def get_cookies(cookies_list): cookies_dict = {} for cookie in cookies_list: cookies_dict[cookie['name']] = cookie['value'] - print(cookies_dict) return cookies_dict From da81de16a4680c73b2debe24ea1e80f0a05afb06 Mon Sep 17 00:00:00 2001 From: phi-line Date: Thu, 7 Jun 2018 22:21:44 -0700 Subject: [PATCH 06/19] added some feedback to the scraping process --- scrape_advanced.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index e0dc6eb..0369e16 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,3 +1,4 @@ +import sys from os import makedirs, rename, remove from os.path import join, exists from re import compile @@ -12,8 +13,8 @@ from tinydb import TinyDB CAMPUS_RANGE = (1, 2) -YEAR_RANGE = (8, 0) -QUARTER_RANGE = (4, 1) +YEAR_RANGE = (0, 8) +QUARTER_RANGE = (1, 4) def main(): if not exists(OLD_DB_DIR): @@ -29,16 +30,20 @@ def main(): try: for term in codes: + sys.stdout.write(f'[{term}] | Scraping…\r') + sys.stdout.flush() + temp = TinyDB(temp_path) content = mine(term, cookies, write=False) - if not advanced_parse(content,db=temp): + if not advanced_parse(content, db=temp, term=term): continue rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) - print(term, db.tables()) + print(f'[{term}] | ', db.tables()) + except KeyboardInterrupt: kill_driver() remove(temp_path) @@ -190,7 +195,7 @@ class BlankRow(Exception): pass -def advanced_parse(content, db): +def advanced_parse(content, db, term=''): ''' Advanced parse takes the content from the request and then populates the database with the data :param content: (html) The html containing the courses @@ -243,16 +248,16 @@ def advanced_parse(content, db): except BlankRow: continue except AttributeError as e: - print(e) + print(f'[{term}] | ERROR: {e}') return False return True def generate_term_codes(): codes = [] - for i in range(YEAR_RANGE[0], YEAR_RANGE[1], -1): - for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1], -1): - for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1], 1): + for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): + for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): + for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1): codes.append(f'201{i}{j}{k}') return codes From 920cab8b883a9cbea991b8ba9c3633785dfacbf7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 8 Jun 2018 17:14:52 -0700 Subject: [PATCH 07/19] refined scraper to use dynamic department codes --- scrape_advanced.py | 174 ++++++++++++--------------------------------- selenium_login.py | 8 +-- settings.py | 11 +-- 3 files changed, 57 insertions(+), 136 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 0369e16..c86852c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -5,7 +5,7 @@ from collections import defaultdict from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE +from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA # 3rd party import requests @@ -13,9 +13,10 @@ from tinydb import TinyDB CAMPUS_RANGE = (1, 2) -YEAR_RANGE = (0, 8) +YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) + def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) @@ -35,7 
+36,11 @@ def main(): temp = TinyDB(temp_path) - content = mine(term, cookies, write=False) + dept_data = mine_dept_data(term, cookies, write=False) + sys.stdout.write(f'[{term}] | Mining Depts… {[dept[1] for dept in dept_data]}\r') + sys.stdout.flush() + + content = mine_table_data(term, dept_data, cookies, write=False) if not advanced_parse(content, db=temp, term=term): continue @@ -51,7 +56,32 @@ def main(): kill_driver() -def mine(term, cookies, write=False): +def mine_dept_data(term, cookies, write=False): + import requests + + data = [('p_calling_proc', 'bwckschd.p_disp_dyn_sched'), ('p_term', f'{term}')] + + res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) + res.raise_for_status() + + if write: + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if not chunk: + break + + file.write(chunk) + file.flush() + + soup = BeautifulSoup(res.content, "html5lib") + select = soup.find('select', {'id': 'subj_id'}) + options = select.find_all('option') + + data = [('sel_subj', o['value']) for o in options] + return data + + +def mine_table_data(term, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine @@ -60,133 +90,23 @@ def mine(term, cookies, write=False): :return res.content: (json) the html body ''' - headers = { - 'Connection': 'keep-alive', - 'Pragma': 'no-cache', - 'Cache-Control': 'no-cache', - 'Origin': 'https://banssb.fhda.edu', - 'Upgrade-Insecure-Requests': '1', - 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', - 'DNT': '1', - } - - data = [ - ('rsts', 'dummy'), - ('crn', 'dummy'), - ('term_in', f'{term}'), - ('sel_subj', 'dummy'), - ('sel_subj', 'ACTG'), - ('sel_subj', 'ALCB'), - ('sel_subj', 'ALTW'), - ('sel_subj', 'AHS'), - ('sel_subj', 'ANTH'), - ('sel_subj', 'APSM'), - ('sel_subj', 'ART'), - ('sel_subj', 'ASTR'), - ('sel_subj', 'ATHL'), - ('sel_subj', 'BIOL'), - ('sel_subj', 'BUSI'), - ('sel_subj', 'CRLP'), - ('sel_subj', 'CHEM'), - ('sel_subj', 'CHLD'), - ('sel_subj', 'COMM'), - ('sel_subj', 'C S'), - ('sel_subj', 'CNSL'), - ('sel_subj', 'CRWR'), - ('sel_subj', 'DANC'), - ('sel_subj', 'D A'), - ('sel_subj', 'D H'), - ('sel_subj', 'DMS'), - ('sel_subj', 'ECON'), - ('sel_subj', 'EMS'), - ('sel_subj', 'ENGR'), - ('sel_subj', 'ENGL'), - ('sel_subj', 'ESLL'), - ('sel_subj', 'HORT'), - ('sel_subj', 'JFS'), - ('sel_subj', 'GEOG'), - ('sel_subj', 'GIST'), - ('sel_subj', 'GID'), - ('sel_subj', 'HLTH'), - ('sel_subj', 'HIST'), - ('sel_subj', 'HUMN'), - ('sel_subj', 'ITRN'), - ('sel_subj', 'JAPN'), - ('sel_subj', 'KINS'), - ('sel_subj', 'L A'), - ('sel_subj', 'LINC'), - ('sel_subj', 'LIBR'), - ('sel_subj', 'MATH'), - ('sel_subj', 'MDIA'), - ('sel_subj', 'MTEC'), - ('sel_subj', 'MUS'), - ('sel_subj', 'NCBS'), - ('sel_subj', 'NCBH'), - ('sel_subj', 'NCEL'), - ('sel_subj', 'NCLA'), - ('sel_subj', 'NCP'), - ('sel_subj', 'NCSV'), - ('sel_subj', 'P A'), - ('sel_subj', 'PHT'), - ('sel_subj', 'PHIL'), - ('sel_subj', 'PHOT'), - ('sel_subj', 'PHED'), - ('sel_subj', 'PHDA'), - ('sel_subj', 'PSE'), - ('sel_subj', 'PHYS'), - ('sel_subj', 'POLI'), - ('sel_subj', 'PCA'), - ('sel_subj', 'PSYC'), - ('sel_subj', 'R 
T'), - ('sel_subj', 'RSPT'), - ('sel_subj', 'SOSC'), - ('sel_subj', 'SOC'), - ('sel_subj', 'SPAN'), - ('sel_subj', 'THTR'), - ('sel_subj', 'V T'), - ('sel_subj', 'VITI'), - ('sel_subj', 'WMN'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_camp', '%'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_sess', '%'), - ('sel_instr', 'dummy'), - ('sel_instr', '%'), - ('sel_ptrm', 'dummy'), - ('sel_ptrm', '%'), - ('sel_attr', 'dummy'), - ('sel_crse', ''), - ('sel_title', ''), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1'), - ] - - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', - headers=headers, cookies=cookies, data=data) + data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] + + data.extend(dept_data) + + data.extend(ADVANCED_FORM_DATA) + + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() if write: - with open(f'{join(DB_DIR, SCHEDULE)}', "wb") as file: + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: for chunk in res.iter_content(chunk_size=512): - if chunk: - file.write(chunk) + if not chunk: + break + + file.write(chunk) + file.flush() return res.content diff --git a/selenium_login.py b/selenium_login.py index 3c09c02..b100a24 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -20,16 +20,14 @@ def scrape_cookies(): try: driver.execute_script("doLogin()") - WebDriverWait(driver, 10).until( + WebDriverWait(driver, 5).until( EC.title_is("MyPortal / Foothill-De Anza College District") ) - driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys% \ - 3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") - WebDriverWait(driver, 10).until( - EC.title_is("MyPortal / Foothill-De Anza College District") + WebDriverWait(driver, 5).until( + EC.presence_of_element_located((By.ID, "ssbbackurl")) ) finally: cookies_list = driver.get_cookies() diff --git a/settings.py b/settings.py index fbc3ba2..1201dcb 100644 --- a/settings.py +++ b/settings.py @@ -10,11 +10,14 @@ DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" SCHEDULE = 'schedule.html' +SEARCH = 'search.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', - 'seats_cap', 'seats_act', 'seats', 'wait_cap', 'wait_act', 'wait_seats', 'instructor', - 'date_range', 'location') -PAST_TERM_CODES = {'fh': '201831'} +ADVANCED_FORM_DATA = [('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), + ('sel_camp', '%'), ('sel_levl', 'dummy'), ('sel_sess', 'dummy'), ('sel_sess', '%'), + ('sel_instr', 'dummy'), ('sel_instr', '%'), ('sel_ptrm', 'dummy'), ('sel_ptrm', '%'), + ('sel_attr', 'dummy'), ('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), + 
('sel_to_cred', ''), ('begin_hh', '0'), ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), + ('end_mi', '0'), ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] From 5119fc72916bfb6b8a7b9a39da49a408b44f5a77 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 8 Jun 2018 18:26:36 -0700 Subject: [PATCH 08/19] made things look pretty (important) --- Pipfile | 1 + Pipfile.lock | 21 +++++++---- scrape_advanced.py | 86 +++++++++++++++++++++++++++------------------- selenium_login.py | 4 +-- 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/Pipfile b/Pipfile index 5122131..1e5b2c2 100644 --- a/Pipfile +++ b/Pipfile @@ -17,6 +17,7 @@ pytest = "*" flask = "*" pylint = "*" selenium = "*" +colorama = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 532c0c6..97f0feb 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "6098e2ee7f95b6c25b6293ccd2279203e4a83cbe3fe4660b3d9e3a39ea93de71" + "sha256": "4811f7bec1a856e27bc2c2fff11d68d045e2c5eaa2d2a0c04e1e3bfc7125ea58" }, "host-environment-markers": { "implementation_name": "cpython", @@ -31,10 +31,10 @@ "default": { "astroid": { "hashes": [ - "sha256:032f6e09161e96f417ea7fad46d3fac7a9019c775f202182c22df0e4f714cb1c", - "sha256:dea42ae6e0b789b543f728ddae7ddb6740ba33a49fb52c4a4d9cb7bb4aa6ec09" + "sha256:0ef2bf9f07c3150929b25e8e61b5198c27b0dca195e156f0e4d5bdd89185ca1a", + "sha256:fc9b582dba0366e63540982c3944a9230cbc6f303641c51483fa547dcc22393a" ], - "version": "==1.6.4" + "version": "==1.6.5" }, "atomicwrites": { "hashes": [ @@ -79,6 +79,13 @@ ], "version": "==6.7" }, + "colorama": { + "hashes": [ + "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", + "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" + ], + "version": "==0.3.9" + }, "dateparser": { "hashes": [ "sha256:b452ef8b36cd78ae86a50721794bc674aa3994e19b570f7ba92810f4e0a2ae03", @@ -235,10 +242,10 @@ }, "pylint": { "hashes": [ - "sha256:aa519865f8890a5905fa34924fed0f3bfc7d84fc9f9142c16dac52ffecd25a39", - "sha256:c353d8225195b37cc3aef18248b8f3fe94c5a6a95affaf885ae21a24ca31d8eb" + "sha256:a48070545c12430cfc4e865bf62f5ad367784765681b3db442d8230f0960aa3c", + "sha256:fff220bcb996b4f7e2b0f6812fd81507b72ca4d8c4d05daf2655c333800cb9b3" ], - "version": "==1.9.1" + "version": "==1.9.2" }, "pytest": { "hashes": [ diff --git a/scrape_advanced.py b/scrape_advanced.py index c86852c..6e0a6ef 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -4,14 +4,16 @@ from re import compile from collections import defaultdict -from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA - # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB +from selenium_login import scrape_cookies, kill_driver +from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA + +from colorama import init, Fore, Style + CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) @@ -22,23 +24,23 @@ def main(): makedirs(OLD_DB_DIR, exist_ok=True) codes = generate_term_codes() - print(codes) + print(f'Loaded {len(codes)} term codes') + + print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() - print(cookies) + print(f"Scraped session cookie {cookies['CPSESSID']}", end=f"\n{'-'*79}\n") temp_path = join(OLD_DB_DIR, 'temp.json') try: for term in codes: - sys.stdout.write(f'[{term}] | Scraping…\r') - sys.stdout.flush() + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Scraping…\r") temp = TinyDB(temp_path) - 
dept_data = mine_dept_data(term, cookies, write=False) - sys.stdout.write(f'[{term}] | Mining Depts… {[dept[1] for dept in dept_data]}\r') - sys.stdout.flush() + dept_data = mine_dept_data(term, write=False) + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") content = mine_table_data(term, dept_data, cookies, write=False) if not advanced_parse(content, db=temp, term=term): @@ -47,7 +49,9 @@ def main(): rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) - print(f'[{term}] | ', db.tables()) + + num_courses = sum([len(db.table(t).all()) for t in db.tables()]) + print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") except KeyboardInterrupt: kill_driver() @@ -56,22 +60,19 @@ def main(): kill_driver() -def mine_dept_data(term, cookies, write=False): - import requests - +def mine_dept_data(term, write=False): + ''' + Mine dept data will grab the department IDs for a given quarter. + :param term: (str) the term to mine + :param write: (bool) write to file? + :return data (list(tuple)) the html body + ''' data = [('p_calling_proc', 'bwckschd.p_disp_dyn_sched'), ('p_term', f'{term}')] res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - if write: - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: - for chunk in res.iter_content(chunk_size=512): - if not chunk: - break - - file.write(chunk) - file.flush() + write and write_to_file(res) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -89,7 +90,6 @@ def mine_table_data(term, dept_data, cookies, write=False): :param write: (bool) write to file? :return res.content: (json) the html body ''' - data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] data.extend(dept_data) @@ -99,22 +99,11 @@ def mine_table_data(term, dept_data, cookies, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() - if write: - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: - for chunk in res.iter_content(chunk_size=512): - if not chunk: - break - - file.write(chunk) - file.flush() + write and write_to_file(res) return res.content -class BlankRow(Exception): - pass - - def advanced_parse(content, db, term=''): ''' Advanced parse takes the content from the request and then populates the database with the data @@ -168,7 +157,7 @@ def advanced_parse(content, db, term=''): except BlankRow: continue except AttributeError as e: - print(f'[{term}] | ERROR: {e}') + print(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] {e}") return False return True @@ -182,11 +171,36 @@ def generate_term_codes(): return codes +class BlankRow(Exception): + pass + + def get_parsed_text(tag): text = tag.get_text() p = compile(r'<.*?>') return p.sub('', text) +def print_c(message): + sys.stdout.write('\x1b[2K') + sys.stdout.write(message) + sys.stdout.flush() + + +def color(c, word): + return f'{c}{word}{Style.RESET_ALL}' + + +def write_to_file(res): + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if not chunk: + break + + file.write(chunk) + file.flush() + + if __name__ == '__main__': + init() main() diff --git a/selenium_login.py b/selenium_login.py index b100a24..c6d654d 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -20,13 +20,13 @@ def 
scrape_cookies(): try: driver.execute_script("doLogin()") - WebDriverWait(driver, 5).until( + WebDriverWait(driver, 3).until( EC.title_is("MyPortal / Foothill-De Anza College District") ) driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") - WebDriverWait(driver, 5).until( + WebDriverWait(driver, 3).until( EC.presence_of_element_located((By.ID, "ssbbackurl")) ) finally: From d5e1d9f36d46d1f07d3199fc2696baacdbedc79b Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 16:15:47 -0700 Subject: [PATCH 09/19] added alternate form field format. fixes 98% of cases --- scrape_advanced.py | 42 +++++++++++++++++++---------- settings.py | 67 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 20 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 6e0a6ef..10c3f77 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -10,7 +10,7 @@ from tinydb import TinyDB from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA +from settings import OLD_DB_DIR, ADVANCED_FORM_DATA from colorama import init, Fore, Style @@ -23,6 +23,9 @@ def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) + if not exists(join(OLD_DB_DIR, 'html')): + makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) + codes = generate_term_codes() print(f'Loaded {len(codes)} term codes') @@ -42,16 +45,25 @@ def main(): dept_data = mine_dept_data(term, write=False) print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") - content = mine_table_data(term, dept_data, cookies, write=False) - if not advanced_parse(content, db=temp, term=term): - continue + failed = False + for idx, variant in enumerate(ADVANCED_FORM_DATA): + content = mine_table_data(term, variant, dept_data, cookies, write=True) + if advanced_parse(content, db=temp, term=term): + break + elif idx == len(ADVANCED_FORM_DATA) - 1: + failed = True - rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) + if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): + remove(temp_path) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) - print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") + + if failed: + print_c(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] Payload failed…\n") + else: + print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") except KeyboardInterrupt: kill_driver() @@ -72,7 +84,7 @@ def mine_dept_data(term, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - write and write_to_file(res) + write and write_to_file(res, term) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -82,7 +94,7 @@ def mine_dept_data(term, write=False): return data -def mine_table_data(term, dept_data, cookies, write=False): +def mine_table_data(term, payload, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine @@ -90,16 +102,18 @@ def mine_table_data(term, dept_data, cookies, write=False): :param write: (bool) write to file? 
:return res.content: (json) the html body ''' - data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] + data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}')] + + data.extend(payload[0]) data.extend(dept_data) - data.extend(ADVANCED_FORM_DATA) + data.extend(payload[1]) res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() - write and write_to_file(res) + write and write_to_file(res, term) return res.content @@ -157,7 +171,6 @@ def advanced_parse(content, db, term=''): except BlankRow: continue except AttributeError as e: - print(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] {e}") return False return True @@ -191,8 +204,9 @@ def color(c, word): return f'{c}{word}{Style.RESET_ALL}' -def write_to_file(res): - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: +def write_to_file(res, term): + + with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file: for chunk in res.iter_content(chunk_size=512): if not chunk: break diff --git a/settings.py b/settings.py index 1201dcb..155682d 100644 --- a/settings.py +++ b/settings.py @@ -15,9 +15,64 @@ 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_FORM_DATA = [('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), - ('sel_camp', '%'), ('sel_levl', 'dummy'), ('sel_sess', 'dummy'), ('sel_sess', '%'), - ('sel_instr', 'dummy'), ('sel_instr', '%'), ('sel_ptrm', 'dummy'), ('sel_ptrm', '%'), - ('sel_attr', 'dummy'), ('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), - ('sel_to_cred', ''), ('begin_hh', '0'), ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), - ('end_mi', '0'), ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] +ADVANCED_FORM_DATA = [ + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_sess', '%'), + ('sel_instr', '%'), + ('sel_ptrm', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] + ], + + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_schd', '%'), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_instr', '%'), + ('sel_sess', '%'), + ('sel_ptrm', '%'), + ('sel_attr', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] + ] +] From c9edb41b6ba46755a02914a3b9d46277c90774f3 Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 16:20:10 -0700 Subject: [PATCH 10/19] removed html files from .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 56ac861..316e78c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 
@@ __pycache__/ .idea/ *.DS_Store *.json -schedule.html +*.html frontend/static/README.md # pytest From 21a397bc0716dac6fdc5e3829ce6ad3cf894987a Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 18:53:52 -0700 Subject: [PATCH 11/19] fixed edgecase for past quarters --- settings.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/settings.py b/settings.py index 155682d..821401e 100644 --- a/settings.py +++ b/settings.py @@ -74,5 +74,37 @@ ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_schd', '%'), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_levl', '%'), + ('sel_ptrm', '%'), + ('sel_instr', '%'), + ('sel_sess', '%'), + ('sel_attr', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] ] ] From 2d498971d42a99a78f15b6081fbf99391dd1d3b5 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 00:44:56 -0700 Subject: [PATCH 12/19] increased pylint score --- scrape_advanced.py | 27 +++++++++++++++------------ selenium_login.py | 4 +++- server.py | 6 +++--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 10c3f77..fc98fd6 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,18 +1,17 @@ import sys from os import makedirs, rename, remove from os.path import join, exists -from re import compile from collections import defaultdict +import re # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB - +from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, ADVANCED_FORM_DATA -from colorama import init, Fore, Style +from settings import OLD_DB_DIR, ADVANCED_FORM_DATA CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) @@ -27,12 +26,12 @@ def main(): makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) codes = generate_term_codes() - print(f'Loaded {len(codes)} term codes') + print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() - print(f"Scraped session cookie {cookies['CPSESSID']}", end=f"\n{'-'*79}\n") + print_c(f"Scraped session cookie {color(Fore.YELLOW, cookies['CPSESSID'])}\n{'-'*79}\n") temp_path = join(OLD_DB_DIR, 'temp.json') @@ -43,7 +42,8 @@ def main(): temp = TinyDB(temp_path) dept_data = mine_dept_data(term, write=False) - print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] \ + Parsing {len(dept_data)} departments…\r") failed = False for idx, variant in enumerate(ADVANCED_FORM_DATA): @@ -84,7 +84,8 @@ def mine_dept_data(term, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - write and write_to_file(res, term) + if write: + write_to_file(res, term) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -110,10 +111,12 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): data.extend(payload[1]) - res = 
requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', + cookies=cookies, data=data) res.raise_for_status() - write and write_to_file(res, term) + if write: + write_to_file(res, term) return res.content @@ -145,7 +148,7 @@ def advanced_parse(content, db, term=''): try: cols = tr.find_all('td', {'class': 'dddefault'}) - if len(cols) > 0: + if cols: s = defaultdict(lambda: defaultdict(list)) num_blank = 0 @@ -190,7 +193,7 @@ class BlankRow(Exception): def get_parsed_text(tag): text = tag.get_text() - p = compile(r'<.*?>') + p = re.compile(r'<.*?>') return p.sub('', text) diff --git a/selenium_login.py b/selenium_login.py index c6d654d..617e435 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -18,6 +18,7 @@ def scrape_cookies(): driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'") driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'") + cookies_list = list() try: driver.execute_script("doLogin()") WebDriverWait(driver, 3).until( @@ -31,7 +32,8 @@ def scrape_cookies(): ) finally: cookies_list = driver.get_cookies() - return get_cookies(cookies_list) + + return get_cookies(cookies_list) def get_cookies(cookies_list): diff --git a/server.py b/server.py index ffaac31..23504c1 100644 --- a/server.py +++ b/server.py @@ -5,14 +5,14 @@ import itertools as itr import typing as ty -from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST - # 3rd party from flask import Flask, jsonify, request, render_template from tinydb import TinyDB from maya import when, MayaInterval -# Quart config +from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST + +# Flask config def add_cors_headers(response): response.headers['Access-Control-Allow-Origin'] = '*' return response From 93ce5c6102a4bf7186d19bfd0cc08fddb0c533b7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 01:37:08 -0700 Subject: [PATCH 13/19] cleaned up code and added docstrings --- scrape_advanced.py | 32 ++++++++++++++++++++++++++++++-- scrape_term.py | 19 +++---------------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index fc98fd6..4cbf5ef 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -47,7 +47,7 @@ def main(): failed = False for idx, variant in enumerate(ADVANCED_FORM_DATA): - content = mine_table_data(term, variant, dept_data, cookies, write=True) + content = mine_table_data(term, variant, dept_data, cookies, write=False) if advanced_parse(content, db=temp, term=term): break elif idx == len(ADVANCED_FORM_DATA) - 1: @@ -99,6 +99,8 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine + :param payload: (str) data payload for request + :param dept_data: (str) department data payload :param cookies: (dict) cookies to send with POST :param write: (bool) write to file? 
:return res.content: (json) the html body @@ -179,6 +181,11 @@ def advanced_parse(content, db, term=''): def generate_term_codes(): + """ + This helper generates a list of term codes based on the ranges set by: + YEAR_RANGE, QUARTER_RANGE, CAMPUS_RANGE + :return: (list(str)) list of term codes + """ codes = [] for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): @@ -192,23 +199,44 @@ class BlankRow(Exception): def get_parsed_text(tag): + """ + Regex that strips all html tags and their contents + :param tag: (str) inner contents of parent tag + :return: (str) isolated text + """ text = tag.get_text() p = re.compile(r'<.*?>') return p.sub('', text) def print_c(message): + """ + Clears last carriage returned line and writes a new one + :param message: (str) + :return: None + """ sys.stdout.write('\x1b[2K') sys.stdout.write(message) sys.stdout.flush() def color(c, word): + """ + Format template that inserts a color for a given word + :param c: (Color) Color to format to + :param word: (str) Word to format + :return: (str) Formatted String + """ return f'{c}{word}{Style.RESET_ALL}' def write_to_file(res, term): - + """ + Writes a bytestream to a nested file directory + :param res: response object + :param term: term code + :return: None + """ with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file: for chunk in res.iter_content(chunk_size=512): if not chunk: diff --git a/scrape_term.py b/scrape_term.py index 6fab54e..ef12a45 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -36,22 +36,9 @@ def mine(term, write=False): :param write: (bool) write to file? :return res.content: (json) the html body ''' - headers = { - 'Origin': 'https://banssb.fhda.edu', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9', - 'User-Agent': 'FoothillAPI', - 'Content-Type': 'application/x-www-form-urlencoded', - 'Accept': 'text/html, */*; q=0.01', - 'Referer': 'https://banssb.fhda.edu/PROD/fhda_opencourses.P_Application', - 'X-Requested-With': 'XMLHttpRequest', - 'Connection': 'keep-alive', - } - - data = [('termcode', f'{term}'), ] - - res = requests.post('https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList', - headers=headers, data=data) + data = [('termcode', f'{term}')] + + res = requests.post('https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList', data=data) res.raise_for_status() if write: From 5227f68787eab7b7aa0e9358ee3cd6f21afbafc4 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 02:04:07 -0700 Subject: [PATCH 14/19] removed need for 'old' subdirectory --- scrape_advanced.py | 12 ++++++------ settings.py | 3 --- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 4cbf5ef..64a15c9 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -11,7 +11,7 @@ from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, ADVANCED_FORM_DATA +from settings import DB_DIR, ADVANCED_FORM_DATA CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) @@ -19,11 +19,11 @@ def main(): - if not exists(OLD_DB_DIR): - makedirs(OLD_DB_DIR, exist_ok=True) + if not exists(DB_DIR): + makedirs(DB_DIR, exist_ok=True) - if not exists(join(OLD_DB_DIR, 'html')): - makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) + if not exists(join(DB_DIR, 'html')): + makedirs(join(DB_DIR, 'html'), exist_ok=True) codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') @@ -237,7 +237,7 
@@ def write_to_file(res, term):
     :param term: term code
     :return: None
     """
-    with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file:
+    with open(f"{join(DB_DIR, 'html', term+'.html')}", "wb") as file:
         for chunk in res.iter_content(chunk_size=512):
             if not chunk:
                 break
diff --git a/settings.py b/settings.py
index 821401e..3093a0b 100644
--- a/settings.py
+++ b/settings.py
@@ -2,7 +2,6 @@
 ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
 DB_DIR = os.path.join(ROOT_DIR, 'db')
-OLD_DB_DIR = os.path.join(DB_DIR, 'old')
 TEST_DIR = os.path.join(ROOT_DIR, 'tests')
 TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db')
@@ -10,7 +9,6 @@
 DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$"
 SCHEDULE = 'schedule.html'
-SEARCH = 'search.html'
 HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end',
            'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap')
 CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'}
@@ -44,7 +42,6 @@
          ('SUB_BTN', 'Section Search'),
          ('path', '1')]
     ],
-
     [
         [('sel_subj', 'dummy'),
          ('sel_day', 'dummy'),

From 363b9e3f624de8014c2f75c679e0cffe1384b3cf Mon Sep 17 00:00:00 2001
From: phi-line
Date: Mon, 11 Jun 2018 03:22:46 -0700
Subject: [PATCH 15/19] fixed bug created when reformatting

---
 scrape_advanced.py | 10 +++++-----
 selenium_login.py  |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scrape_advanced.py b/scrape_advanced.py
index 64a15c9..b2df6d4 100644
--- a/scrape_advanced.py
+++ b/scrape_advanced.py
@@ -33,7 +33,7 @@ def main():
     cookies = scrape_cookies()
     print_c(f"Scraped session cookie {color(Fore.YELLOW, cookies['CPSESSID'])}\n{'-'*79}\n")

-    temp_path = join(OLD_DB_DIR, 'temp.json')
+    temp_path = join(DB_DIR, 'temp.json')

     try:
         for term in codes:
@@ -42,8 +42,8 @@ def main():
             temp = TinyDB(temp_path)

             dept_data = mine_dept_data(term, write=False)
-            print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] \
-                    Parsing {len(dept_data)} departments…\r")
+            print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] " +
+                    f"Parsing {len(dept_data)} departments…\r")

             failed = False
             for idx, variant in enumerate(ADVANCED_FORM_DATA):
@@ -53,10 +53,10 @@ def main():
                 elif idx == len(ADVANCED_FORM_DATA) - 1:
                     failed = True

-            if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')):
+            if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')):
                 remove(temp_path)

-            db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json'))
+            db = TinyDB(join(DB_DIR, f'old_{term}_database.json'))

             num_courses = sum([len(db.table(t).all()) for t in db.tables()])

diff --git a/selenium_login.py b/selenium_login.py
index 617e435..1487172 100644
--- a/selenium_login.py
+++ b/selenium_login.py
@@ -18,17 +18,17 @@ def scrape_cookies():
     driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'")
     driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'")

-    cookies_list = list()
     try:
         driver.execute_script("doLogin()")
         WebDriverWait(driver, 3).until(
             EC.title_is("MyPortal / Foothill-De Anza College District")
         )
-        driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search")
+        driver.get(
+            "https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search")

         WebDriverWait(driver, 3).until(
-
EC.presence_of_element_located((By.ID, "ssbbackurl")) + EC.title_is("MyPortal / Foothill-De Anza College District") ) finally: cookies_list = driver.get_cookies() From 1872b6c510419564fd86a0dbdf8ad7b2bdc5f16d Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 19:58:23 -0700 Subject: [PATCH 16/19] added debug mode --- scrape_advanced.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index b2df6d4..50ceb37 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -17,6 +17,9 @@ YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) +PREFIX = 'old' +DEBUG = False + def main(): if not exists(DB_DIR): @@ -28,6 +31,10 @@ def main(): codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') + if DEBUG: + PREFIX = 'debug' + codes = codes[:5] + print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() @@ -53,10 +60,10 @@ def main(): elif idx == len(ADVANCED_FORM_DATA) - 1: failed = True - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + if rename(temp_path, join(DB_DIR, f'{PREFIX}_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + db = TinyDB(join(DB_DIR, f'{PREFIX}_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) @@ -109,6 +116,9 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): data.extend(payload[0]) + if DEBUG: + dept_data = dept_data[:1] + data.extend(dept_data) data.extend(payload[1]) From 787cd3b385edd4d5e6947ff899bab607ef3bee18 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 13 Jun 2018 11:50:50 -0700 Subject: [PATCH 17/19] increased pylint score and made debug mode better --- scrape_advanced.py | 9 ++++----- selenium_login.py | 1 - settings.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 50ceb37..cb7041c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -11,13 +11,12 @@ from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import DB_DIR, ADVANCED_FORM_DATA +from settings import DB_DIR, ADVANCED_FORM_DATA, PREFIXES CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) -PREFIX = 'old' DEBUG = False @@ -31,8 +30,8 @@ def main(): codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') + prefix = PREFIXES[0] if not DEBUG else PREFIXES[1] if DEBUG: - PREFIX = 'debug' codes = codes[:5] print_c(f'Scraping session cookie…\r') @@ -60,10 +59,10 @@ def main(): elif idx == len(ADVANCED_FORM_DATA) - 1: failed = True - if rename(temp_path, join(DB_DIR, f'{PREFIX}_{term}_database.json')): + if rename(temp_path, join(DB_DIR, f'{prefix}_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'{PREFIX}_{term}_database.json')) + db = TinyDB(join(DB_DIR, f'{prefix}_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) diff --git a/selenium_login.py b/selenium_login.py index 1487172..ea35661 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -1,7 +1,6 @@ import os from selenium import webdriver -from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC diff --git a/settings.py b/settings.py index 3093a0b..2134631 100644 --- a/settings.py +++ b/settings.py @@ -8,6 +8,7 @@ COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' 
DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" +PREFIXES = ('old', 'debug') SCHEDULE = 'schedule.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') From 363b07b1b68ae9c048750b656d4b492f7e6ce2b2 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 13 Jun 2018 23:03:40 -0700 Subject: [PATCH 18/19] addressed TryExceptElse's changes --- .gitignore | 2 +- scrape_advanced.py | 133 ++++++++++++++++++++++++++++++--------------- settings.py | 95 +------------------------------- 3 files changed, 92 insertions(+), 138 deletions(-) diff --git a/.gitignore b/.gitignore index 316e78c..63e00fa 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ __pycache__/ .idea/ *.DS_Store *.json -*.html +db/html/ frontend/static/README.md # pytest diff --git a/scrape_advanced.py b/scrape_advanced.py index cb7041c..5b93eea 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -2,6 +2,7 @@ from os import makedirs, rename, remove from os.path import join, exists from collections import defaultdict +from itertools import product import re # 3rd party @@ -10,14 +11,55 @@ from tinydb import TinyDB from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver +from selenium.common.exceptions import TimeoutException -from settings import DB_DIR, ADVANCED_FORM_DATA, PREFIXES +from settings import DB_DIR CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) -DEBUG = False +DEBUG = True + +PREFIXES = ('old', 'debug') + +ADVANCED_FORM_DATA = [ + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), + ('sel_to_cred', ''), ('sel_camp', '%'), ('sel_sess', '%'), + ('sel_instr', '%'), ('sel_ptrm', '%'), ('begin_hh', '0'), + ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'), + ('sel_from_cred', ''), ('sel_to_cred', ''), ('sel_camp', '%'), + ('sel_instr', '%'), ('sel_sess', '%'), ('sel_ptrm', '%'), + ('sel_attr', '%'), ('begin_hh', '0'), ('begin_mi', '0'), + ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'), + ('sel_from_cred', ''), ('sel_to_cred', ''), ('sel_camp', '%'), + ('sel_levl', '%'), ('sel_ptrm', '%'), ('sel_instr', '%'), + ('sel_sess', '%'), ('sel_attr', '%'), ('begin_hh', '0'), + ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ] +] def main(): @@ -71,7 +113,8 @@ def main(): else: print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") - except 
KeyboardInterrupt: + except (KeyboardInterrupt, TimeoutException) as e: + print_c(f"{color(Fore.GREEN, e)}\n") kill_driver() remove(temp_path) finally: @@ -140,53 +183,58 @@ def advanced_parse(content, db, term=''): :return: None ''' soup = BeautifulSoup(content, 'html5lib') + table_rows = None try: table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') + except AttributeError as e: + return False - table_headers = list() - start_idx = 0 - for i, tr in enumerate(table_rows): - header_cols = tr.find_all('th', {'class': 'ddheader'}) - for th in header_cols: - table_headers.append(get_parsed_text(th)) - if table_headers: - start_idx = i - break + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + parse_row(tr, table_headers, db) + return True - for tr in table_rows[start_idx:]: - try: - cols = tr.find_all('td', {'class': 'dddefault'}) - if cols: - s = defaultdict(lambda: defaultdict(list)) +def parse_row(tr, th, db): + try: + cols = tr.find_all('td', {'class': 'dddefault'}) - num_blank = 0 - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - if cols[i].isspace(): - num_blank += 1 + if cols: + s = defaultdict(lambda: defaultdict(list)) - if num_blank > len(cols) - num_blank: - raise BlankRow + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 - data = dict(zip(table_headers, cols)) + if num_blank > len(cols) - num_blank: + raise BlankRow - subject = data['Subj'] - key = data['Crse'] - crn = data['CRN'] + data = dict(zip(th, cols)) - s[key][crn].append(data) + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] - j = dict(s) - db.table(f'{subject}').insert(j) - except BlankRow: - continue - except AttributeError as e: - return False - return True + s[key][crn].append(data) + j = dict(s) + + db.table(f'{subject}').insert(j) + except BlankRow: + return def generate_term_codes(): @@ -195,11 +243,10 @@ def generate_term_codes(): YEAR_RANGE, QUARTER_RANGE, CAMPUS_RANGE :return: (list(str)) list of term codes """ - codes = [] - for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): - for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): - for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1): - codes.append(f'201{i}{j}{k}') + i = range(YEAR_RANGE[0], YEAR_RANGE[1] + 1) + j = range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1) + k = range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1) + codes = [f'201{x[0]}{x[1]}{x[2]}' for x in product(i, j, k)] return codes @@ -256,5 +303,5 @@ def write_to_file(res, term): if __name__ == '__main__': - init() + init() #colorama main() diff --git a/settings.py b/settings.py index 2134631..3a8bdf3 100644 --- a/settings.py +++ b/settings.py @@ -8,101 +8,8 @@ COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' 
DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" -PREFIXES = ('old', 'debug') + SCHEDULE = 'schedule.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} - -ADVANCED_FORM_DATA = [ - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_sess', '%'), - ('sel_instr', '%'), - ('sel_ptrm', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ], - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_schd', '%'), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_instr', '%'), - ('sel_sess', '%'), - ('sel_ptrm', '%'), - ('sel_attr', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ], - - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_schd', '%'), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_levl', '%'), - ('sel_ptrm', '%'), - ('sel_instr', '%'), - ('sel_sess', '%'), - ('sel_attr', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ] -] From 4a5f9f24db46a4b7746d132c24a4f3ade627b8c1 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 15 Jun 2018 18:34:23 -0700 Subject: [PATCH 19/19] fixed bug with wrong value in settings.py --- scrape_term.py | 4 +++- settings.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scrape_term.py b/scrape_term.py index ef12a45..27aab7d 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -8,7 +8,9 @@ from bs4 import BeautifulSoup from tinydb import TinyDB -from settings import DB_DIR, CURRENT_TERM_CODES, COURSE_PATTERN, HEADERS, SCHEDULE +from settings import DB_DIR, COURSE_PATTERN, HEADERS, SCHEDULE + +CURRENT_TERM_CODES = {'fh': '201911', 'da': '201912'} def main(): diff --git a/settings.py b/settings.py index 3a8bdf3..7f370ff 100644 --- a/settings.py +++ b/settings.py @@ -10,6 +10,6 @@ SCHEDULE = 'schedule.html' -HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') -CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} +HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') + +CAMPUS_LIST = 
{'fh': '201911', 'da': '201912', 'test': 'test'}
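
Note: as a quick sanity check of the per-term databases these patches produce, the TinyDB files can be read back directly. The sketch below is illustrative only and rests on assumptions: it uses the non-debug 'old' prefix and the db/old_<term>_database.json naming from the patches above, assumes one table per subject with each record mapping course number to CRN to a list of row dicts (as advanced_parse() inserts them), and the term code '201811' is a made-up example.

from os.path import join

from tinydb import TinyDB

from settings import DB_DIR

term = '201811'  # example code from generate_term_codes(): 2018, quarter 1, campus 1 (illustrative)
db = TinyDB(join(DB_DIR, f'old_{term}_database.json'))  # assumes the non-debug 'old' prefix

# advanced_parse() writes one table per subject; each record maps
# course number -> CRN -> list of row dicts keyed by the table headers.
for subject in db.tables():
    for record in db.table(subject).all():
        for course, sections in record.items():
            for crn, rows in sections.items():
                print(subject, course, crn, len(rows))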