From 237fd49a5c2edf7ec8c93b8bf2e48ad9e17a20f7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Tue, 5 Jun 2018 01:26:05 -0700 Subject: [PATCH 01/19] added format for advanced search. request hits fine but the table will be hard to parse --- scrape_advanced.py | 239 ++++++++++++++++++++++++++++++ data_scraper.py => scrape_term.py | 12 +- server.py | 7 +- settings.py | 13 +- 4 files changed, 256 insertions(+), 15 deletions(-) create mode 100644 scrape_advanced.py rename data_scraper.py => scrape_term.py (89%) diff --git a/scrape_advanced.py b/scrape_advanced.py new file mode 100644 index 0000000..edc0488 --- /dev/null +++ b/scrape_advanced.py @@ -0,0 +1,239 @@ +from collections import defaultdict +from os import makedirs, rename, remove +from os.path import join, exists + +from scrape_term import get_key +from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE + +# 3rd party +import requests +from bs4 import BeautifulSoup +from tinydb import TinyDB + +def main(): + if not exists(DB_DIR): + makedirs(DB_DIR, exist_ok=True) + + for term in PAST_TERM_CODES.values(): + temp_path = join(DB_DIR, 'temp.json') + temp = TinyDB(temp_path) + + content = mine(term, write=True) + advanced_parse(content, db=temp) + + if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + remove(temp_path) + + db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + print(term, db.tables()) + +def mine(term, write=False): + ''' + Mine will hit the database for foothill's class listings + :param term: (str) the term to mine + :param write: (bool) write to file? + :return res.content: (json) the html body + ''' + cookies = { + 'TESTID': 'SET', + 'badprotocol': '3', + 'shib_idp_session': 'ad53da4cf9b9a4c1f354e513c10c60802592d842df45e292e94acb39e4f22f3c', + 'BANSSO': 'F1538D4DA70C773174EDA34DF454DCA1E95FBB970A1B3C78D794E9F6850F167B', + 'fos.web.server': 'lumweb3', + 'fos.secure.web.server': 'lumweb3', + 'runId': '-5887403237976173719', + 'usid': 'nEekhNSMntHK2/GY/bTtOA__', + 'CPSESSID': 'AQARMjAxODA2MDUwMTE4MjUDABA1NlBKWTcxODI5MzAz', + } + + headers = { + 'Connection': 'keep-alive', + 'Pragma': 'no-cache', + 'Cache-Control': 'no-cache', + 'Origin': 'https://banssb.fhda.edu', + 'Upgrade-Insecure-Requests': '1', + 'Content-Type': 'application/x-www-form-urlencoded', + 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3202.29 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', + 'DNT': '1', + } + + data = [ + ('rsts', 'dummy'), + ('crn', 'dummy'), + ('term_in', f'{term}'), + ('sel_subj', 'dummy'), + ('sel_subj', 'ACTG'), + ('sel_subj', 'ALCB'), + ('sel_subj', 'ALTW'), + ('sel_subj', 'AHS'), + ('sel_subj', 'ANTH'), + ('sel_subj', 'APSM'), + ('sel_subj', 'ART'), + ('sel_subj', 'ASTR'), + ('sel_subj', 'ATHL'), + ('sel_subj', 'BIOL'), + ('sel_subj', 'BUSI'), + ('sel_subj', 'CRLP'), + ('sel_subj', 'CHEM'), + ('sel_subj', 'CHLD'), + ('sel_subj', 'COMM'), + ('sel_subj', 'C S'), + ('sel_subj', 'CNSL'), + ('sel_subj', 'CRWR'), + ('sel_subj', 'DANC'), + ('sel_subj', 'D A'), + ('sel_subj', 'D H'), + ('sel_subj', 'DMS'), + ('sel_subj', 'ECON'), + ('sel_subj', 'EMS'), + ('sel_subj', 'ENGR'), + ('sel_subj', 'ENGL'), + ('sel_subj', 'ESLL'), + ('sel_subj', 'HORT'), + ('sel_subj', 'JFS'), + ('sel_subj', 'GEOG'), + ('sel_subj', 'GIST'), + ('sel_subj', 'GID'), 
+ ('sel_subj', 'HLTH'), + ('sel_subj', 'HIST'), + ('sel_subj', 'HUMN'), + ('sel_subj', 'ITRN'), + ('sel_subj', 'JAPN'), + ('sel_subj', 'KINS'), + ('sel_subj', 'L A'), + ('sel_subj', 'LINC'), + ('sel_subj', 'LIBR'), + ('sel_subj', 'MATH'), + ('sel_subj', 'MDIA'), + ('sel_subj', 'MTEC'), + ('sel_subj', 'MUS'), + ('sel_subj', 'NCBS'), + ('sel_subj', 'NCBH'), + ('sel_subj', 'NCEL'), + ('sel_subj', 'NCLA'), + ('sel_subj', 'NCP'), + ('sel_subj', 'NCSV'), + ('sel_subj', 'P A'), + ('sel_subj', 'PHT'), + ('sel_subj', 'PHIL'), + ('sel_subj', 'PHOT'), + ('sel_subj', 'PHED'), + ('sel_subj', 'PHDA'), + ('sel_subj', 'PSE'), + ('sel_subj', 'PHYS'), + ('sel_subj', 'POLI'), + ('sel_subj', 'PCA'), + ('sel_subj', 'PSYC'), + ('sel_subj', 'R T'), + ('sel_subj', 'RSPT'), + ('sel_subj', 'SOSC'), + ('sel_subj', 'SOC'), + ('sel_subj', 'SPAN'), + ('sel_subj', 'THTR'), + ('sel_subj', 'V T'), + ('sel_subj', 'VITI'), + ('sel_subj', 'WMN'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_camp', '%'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_sess', '%'), + ('sel_instr', 'dummy'), + ('sel_instr', '%'), + ('sel_ptrm', 'dummy'), + ('sel_ptrm', '%'), + ('sel_attr', 'dummy'), + ('sel_crse', ''), + ('sel_title', ''), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1'), + ] + + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) + res.raise_for_status() + + if write: + with open(f'{join(DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if chunk: + file.write(chunk) + + return res.content + + +def advanced_parse(content, db): + ''' + Advanced parse takes the content from the request and then populates the database with the data + :param content: (html) The html containing the courses + :param db: (TinyDB) the current database + :return: None + ''' + soup = BeautifulSoup(content, 'html5lib') + + table = soup.find('table', {'class': 'datadisplaytable'}) + table_rows = table.find_all('tr') + for tr in table_rows[:5]: + td = tr.find_all('td', {'class': 'dddefault'}) + print(td) + + +def parse(content, db): + ''' + Parse takes the content from the request and then populates the database with the data + :param content: (html) The html containing the courses + :param db: (TinyDB) the current database + ''' + soup = BeautifulSoup(content, 'html5lib') + + tables = soup.find_all('table', {'class': 'TblCourses'}) + for t in tables: + dept = t['dept'].replace(' ', '') + dept_desc = t['dept-desc'] + + rows = t.find_all('tr', {'class': 'CourseRow'}) + s = defaultdict(lambda: defaultdict(list)) + for r in rows: + cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + + if cols: + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = a.get_text() if a else cols[i].get_text() + + try: + key = get_key(f'{cols[0] if cols[0] else cols[1]}')[0] + data = dict(zip(ADVANCED_HEADERS, cols)) + + crn = data['CRN'] + if s[key][crn]: + comb = set(s[key][crn][0].items()) ^ set(data.items()) + if not comb: + continue + + data['units'] = data['units'].lstrip() + + s[key][crn].append(data) + except KeyError: + continue + + j = dict(s) + db.table(f'{dept}').insert(j) + + +if __name__ == '__main__': + main() diff --git a/data_scraper.py b/scrape_term.py similarity index 89% rename from 
data_scraper.py rename to scrape_term.py index 2420f18..4bcc7ab 100644 --- a/data_scraper.py +++ b/scrape_term.py @@ -8,20 +8,14 @@ from bs4 import BeautifulSoup from tinydb import TinyDB -from settings import DB_DIR +from settings import DB_DIR, CURRENT_TERM_CODES, COURSE_PATTERN, HEADERS, SCHEDULE -SCHEDULE = 'schedule.html' -TERM_CODES = {'fh': '201911', 'da': '201912'} -HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') - -COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' def main(): if not exists(DB_DIR): makedirs(DB_DIR, exist_ok=True) - for term in TERM_CODES.values(): + for term in CURRENT_TERM_CODES.values(): temp_path = join(DB_DIR, 'temp.json') temp = TinyDB(temp_path) @@ -37,7 +31,7 @@ def main(): def mine(term, write=False): ''' - Mine will hit the database for foothill's class listings and write it to a file. + Mine will hit the database for foothill's class listings :param term: (str) the term to mine :param write: (bool) write to file? :return res.content: (json) the html body diff --git a/server.py b/server.py index 517ffff..ffaac31 100644 --- a/server.py +++ b/server.py @@ -5,6 +5,8 @@ import itertools as itr import typing as ty +from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST + # 3rd party from flask import Flask, jsonify, request, render_template from tinydb import TinyDB @@ -21,11 +23,6 @@ def add_cors_headers(response): DB_ROOT = 'db/' -CAMPUS_LIST = {'fh':'201911', 'da':'201912', 'test':'test'} - -COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' -DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" - FH_TYPE_ALIAS = {'standard': None, 'online': 'W', 'hybrid': 'Y'} DA_TYPE_ALIAS = {'standard': None, 'online': 'Z', 'hybrid': 'Y'} diff --git a/settings.py b/settings.py index df60686..d8f3a74 100644 --- a/settings.py +++ b/settings.py @@ -1,7 +1,18 @@ import os ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) -API_DIR = os.path.join(ROOT_DIR, 'owlapi') DB_DIR = os.path.join(ROOT_DIR, 'db') TEST_DIR = os.path.join(ROOT_DIR, 'tests') TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db') + +COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' +DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" + +SCHEDULE = 'schedule.html' +HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', + 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} + +ADVANCED_HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', + 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +PAST_TERM_CODES = {'fh': '201841'} From 2cb8cd28ac0317c15d77e1655eca0a326e3f43f2 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 03:37:44 -0700 Subject: [PATCH 02/19] finished more of the advanced scraper. 
some potential problems may arise with data that is inconsistent from year to year --- Pipfile | 1 + Pipfile.lock | 49 +++++---- scrape_advanced.py | 264 +++++++++++++++++++++++---------------------- scrape_term.py | 2 +- selenium_login.py | 52 +++++++++ settings.py | 5 +- 6 files changed, 222 insertions(+), 151 deletions(-) create mode 100644 selenium_login.py diff --git a/Pipfile b/Pipfile index 4977ced..5122131 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,7 @@ maya = "*" pytest = "*" flask = "*" pylint = "*" +selenium = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index ae649ba..532c0c6 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "aebd83c7173a3d0dcc9013e1c610a33d4ffc785fe275cabb5825e246f33fbb93" + "sha256": "6098e2ee7f95b6c25b6293ccd2279203e4a83cbe3fe4660b3d9e3a39ea93de71" }, "host-environment-markers": { "implementation_name": "cpython", @@ -242,10 +242,10 @@ }, "pytest": { "hashes": [ - "sha256:c76e93f3145a44812955e8d46cdd302d8a45fbfc7bf22be24fe231f9d8d8853a", - "sha256:39555d023af3200d004d09e51b4dd9fdd828baa863cded3fd6ba2f29f757ae2d" + "sha256:26838b2bc58620e01675485491504c3aa7ee0faf335c37fcd5f8731ca4319591", + "sha256:32c49a69566aa7c333188149ad48b58ac11a426d5352ea3d8f6ce843f88199cb" ], - "version": "==3.6.0" + "version": "==3.6.1" }, "python-dateutil": { "hashes": [ @@ -270,23 +270,23 @@ }, "regex": { "hashes": [ - "sha256:333687d9a44738c486735955993f83bd22061a416c48f5a5f9e765e90cf1b0c9", - "sha256:361a1fd703a35580a4714ec28d85e29780081a4c399a99bbfb2aee695d72aedb", - "sha256:f69d1201a4750f763971ea8364ed95ee888fc128968b39d38883a72a4d005895", - "sha256:a50532f61b23d4ab9d216a6214f359dd05c911c1a1ad20986b6738a782926c1a", - "sha256:1b428a296531ea1642a7da48562746309c5c06471a97bd0c02dd6a82e9cecee8", - "sha256:5b9c0ddd5b4afa08c9074170a2ea9b34ea296e32aeea522faaaaeeeb2fe0af2e", - "sha256:27d72bb42dffb32516c28d218bb054ce128afd3e18464f30837166346758af67", - "sha256:32cf4743debee9ea12d3626ee21eae83052763740e04086304e7a74778bf58c9", - "sha256:35eeccf17af3b017a54d754e160af597036435c58eceae60f1dd1364ae1250c7", - "sha256:be42a601aaaeb7a317f818490a39d153952a97c40c6e9beeb2a1103616405348", - "sha256:eee4d94b1a626490fc8170ffd788883f8c641b576e11ba9b4a29c9f6623371e0", - "sha256:32f6408dbca35040bc65f9f4ae1444d5546411fde989cb71443a182dd643305e", - "sha256:a9243d7b359b72c681a2c32eaa7ace8d346b7e8ce09d172a683acf6853161d9c", - "sha256:494bed6396a20d3aa6376bdf2d3fbb1005b8f4339558d8ac7b53256755f80303", - "sha256:b44624a38d07d3c954c84ad302c29f7930f4bf01443beef5589e9157b14e2a29" - ], - "version": "==2018.2.21" + "sha256:60ff6be94b168ee7f6f5a8b334503f3d3eda21b2aa9cf3909736bc600ed9455d", + "sha256:92ef64d4afe7e709b57b1ca38a41ef0df54f03b4418f1c0e1b2edb52f671eec8", + "sha256:9ee46c7cb5306c96ae9dad07f608f5a474f47c9505fe046d32df6bcb5e6c18ba", + "sha256:3cc8106f31467d9b7a7ea6d0db95cb7a4097e3683e686c89cc14d3a81f66e637", + "sha256:8ad161a52107e6e4bd56f1ee299b1dc492873b8abbfcf639fea4765d96853e32", + "sha256:3326619c3716dbbfe5b2a3e4a109b0bbb6476a35398612539788b15663e0f0d3", + "sha256:f87f51647eeff0f7a1e787b2a8b56d059cfa3ea28f2d825b50a66a172574c6f0", + "sha256:712922a779b153290e3007f4bbdb0af459c36c70f00c6690acd0a86f2f3f52b0", + "sha256:7638a3babd94f947e639c45c0b13cee62caea31ad6fedce392bd3edacf412c5f", + "sha256:3b95120ffcbeb44eb3362456ec887c72190726a2a3e270f1c7343266941826d4", + "sha256:45fa86b2e6bf8b1f5b60820d2d9520d42f32497d4bf8903ed0b86285b29d3aa9", + "sha256:bfb99e3bdf1ff372c8876f217b00fe44dd08f3f53ab590df6fa93b3b72d9dfb6", + 
"sha256:99665c4ca23f9b09618b38afd3c11d0dd6424d0e2d4374afd4c3fc319236552b", + "sha256:ab174253361da55a8425f60bbe319fb32083b295507bace5513834bc3723fcd1", + "sha256:afeb71482e4f7c18ad94802c6c8fbabf2585d3804ca45a8c9db046c120a44a51" + ], + "version": "==2018.6.6" }, "requests": { "hashes": [ @@ -295,6 +295,13 @@ ], "version": "==2.18.4" }, + "selenium": { + "hashes": [ + "sha256:1372101ad23798462038481f92ba1c7fab8385c788b05da6b44318f10ea52422", + "sha256:b8a2630fd858636c894960726ca3c94d8277e516ea3a9d81614fb819a5844764" + ], + "version": "==3.12.0" + }, "six": { "hashes": [ "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", diff --git a/scrape_advanced.py b/scrape_advanced.py index edc0488..4b4920b 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,7 +1,8 @@ -from collections import defaultdict from os import makedirs, rename, remove from os.path import join, exists +from collections import defaultdict +from selenium_login import scrape_cookies, kill_driver from scrape_term import get_key from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE @@ -18,8 +19,15 @@ def main(): temp_path = join(DB_DIR, 'temp.json') temp = TinyDB(temp_path) - content = mine(term, write=True) - advanced_parse(content, db=temp) + try: + cookies = scrape_cookies() + content = mine(term, cookies) + except KeyboardInterrupt: + kill_driver() + return + finally: + kill_driver() + advanced_parse(content, db=temp) if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): remove(temp_path) @@ -27,24 +35,14 @@ def main(): db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) -def mine(term, write=False): +def mine(term, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine + :param cookies: (dict) cookies to send with POST :param write: (bool) write to file? 
:return res.content: (json) the html body ''' - cookies = { - 'TESTID': 'SET', - 'badprotocol': '3', - 'shib_idp_session': 'ad53da4cf9b9a4c1f354e513c10c60802592d842df45e292e94acb39e4f22f3c', - 'BANSSO': 'F1538D4DA70C773174EDA34DF454DCA1E95FBB970A1B3C78D794E9F6850F167B', - 'fos.web.server': 'lumweb3', - 'fos.secure.web.server': 'lumweb3', - 'runId': '-5887403237976173719', - 'usid': 'nEekhNSMntHK2/GY/bTtOA__', - 'CPSESSID': 'AQARMjAxODA2MDUwMTE4MjUDABA1NlBKWTcxODI5MzAz', - } headers = { 'Connection': 'keep-alive', @@ -53,90 +51,123 @@ def mine(term, write=False): 'Origin': 'https://banssb.fhda.edu', 'Upgrade-Insecure-Requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3202.29 Safari/537.36', + 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse', + 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', 'DNT': '1', } + # data = [ + # ('rsts', 'dummy'), + # ('crn', 'dummy'), + # ('term_in', f'{term}'), + # ('sel_subj', 'dummy'), + # ('sel_subj', 'ACTG'), + # ('sel_subj', 'ALCB'), + # ('sel_subj', 'ALTW'), + # ('sel_subj', 'AHS'), + # ('sel_subj', 'ANTH'), + # ('sel_subj', 'APSM'), + # ('sel_subj', 'ART'), + # ('sel_subj', 'ASTR'), + # ('sel_subj', 'ATHL'), + # ('sel_subj', 'BIOL'), + # ('sel_subj', 'BUSI'), + # ('sel_subj', 'CRLP'), + # ('sel_subj', 'CHEM'), + # ('sel_subj', 'CHLD'), + # ('sel_subj', 'COMM'), + # ('sel_subj', 'C S'), + # ('sel_subj', 'CNSL'), + # ('sel_subj', 'CRWR'), + # ('sel_subj', 'DANC'), + # ('sel_subj', 'D A'), + # ('sel_subj', 'D H'), + # ('sel_subj', 'DMS'), + # ('sel_subj', 'ECON'), + # ('sel_subj', 'EMS'), + # ('sel_subj', 'ENGR'), + # ('sel_subj', 'ENGL'), + # ('sel_subj', 'ESLL'), + # ('sel_subj', 'HORT'), + # ('sel_subj', 'JFS'), + # ('sel_subj', 'GEOG'), + # ('sel_subj', 'GIST'), + # ('sel_subj', 'GID'), + # ('sel_subj', 'HLTH'), + # ('sel_subj', 'HIST'), + # ('sel_subj', 'HUMN'), + # ('sel_subj', 'ITRN'), + # ('sel_subj', 'JAPN'), + # ('sel_subj', 'KINS'), + # ('sel_subj', 'L A'), + # ('sel_subj', 'LINC'), + # ('sel_subj', 'LIBR'), + # ('sel_subj', 'MATH'), + # ('sel_subj', 'MDIA'), + # ('sel_subj', 'MTEC'), + # ('sel_subj', 'MUS'), + # ('sel_subj', 'NCBS'), + # ('sel_subj', 'NCBH'), + # ('sel_subj', 'NCEL'), + # ('sel_subj', 'NCLA'), + # ('sel_subj', 'NCP'), + # ('sel_subj', 'NCSV'), + # ('sel_subj', 'P A'), + # ('sel_subj', 'PHT'), + # ('sel_subj', 'PHIL'), + # ('sel_subj', 'PHOT'), + # ('sel_subj', 'PHED'), + # ('sel_subj', 'PHDA'), + # ('sel_subj', 'PSE'), + # ('sel_subj', 'PHYS'), + # ('sel_subj', 'POLI'), + # ('sel_subj', 'PCA'), + # ('sel_subj', 'PSYC'), + # ('sel_subj', 'R T'), + # ('sel_subj', 'RSPT'), + # ('sel_subj', 'SOSC'), + # ('sel_subj', 'SOC'), + # ('sel_subj', 'SPAN'), + # ('sel_subj', 'THTR'), + # ('sel_subj', 'V T'), + # ('sel_subj', 'VITI'), + # ('sel_subj', 'WMN'), + # ('sel_day', 'dummy'), + # ('sel_schd', 'dummy'), + # ('sel_insm', 'dummy'), + # ('sel_camp', 'dummy'), + # ('sel_camp', '%'), + # ('sel_levl', 'dummy'), + # ('sel_sess', 'dummy'), + # ('sel_sess', '%'), + # ('sel_instr', 'dummy'), + # ('sel_instr', '%'), + # ('sel_ptrm', 'dummy'), + # ('sel_ptrm', '%'), + # ('sel_attr', 'dummy'), + # ('sel_crse', ''), + # ('sel_title', ''), + # ('sel_from_cred', ''), + 
# ('sel_to_cred', ''), + # ('begin_hh', '0'), + # ('begin_mi', '0'), + # ('begin_ap', 'a'), + # ('end_hh', '0'), + # ('end_mi', '0'), + # ('end_ap', 'a'), + # ('SUB_BTN', 'Section Search'), + # ('path', '1'), + # ] + data = [ ('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy'), ('sel_subj', 'ACTG'), - ('sel_subj', 'ALCB'), - ('sel_subj', 'ALTW'), - ('sel_subj', 'AHS'), - ('sel_subj', 'ANTH'), - ('sel_subj', 'APSM'), - ('sel_subj', 'ART'), - ('sel_subj', 'ASTR'), - ('sel_subj', 'ATHL'), - ('sel_subj', 'BIOL'), - ('sel_subj', 'BUSI'), - ('sel_subj', 'CRLP'), - ('sel_subj', 'CHEM'), - ('sel_subj', 'CHLD'), - ('sel_subj', 'COMM'), - ('sel_subj', 'C S'), - ('sel_subj', 'CNSL'), - ('sel_subj', 'CRWR'), - ('sel_subj', 'DANC'), - ('sel_subj', 'D A'), - ('sel_subj', 'D H'), - ('sel_subj', 'DMS'), - ('sel_subj', 'ECON'), - ('sel_subj', 'EMS'), - ('sel_subj', 'ENGR'), - ('sel_subj', 'ENGL'), - ('sel_subj', 'ESLL'), - ('sel_subj', 'HORT'), - ('sel_subj', 'JFS'), - ('sel_subj', 'GEOG'), - ('sel_subj', 'GIST'), - ('sel_subj', 'GID'), - ('sel_subj', 'HLTH'), - ('sel_subj', 'HIST'), - ('sel_subj', 'HUMN'), - ('sel_subj', 'ITRN'), - ('sel_subj', 'JAPN'), - ('sel_subj', 'KINS'), - ('sel_subj', 'L A'), - ('sel_subj', 'LINC'), - ('sel_subj', 'LIBR'), - ('sel_subj', 'MATH'), - ('sel_subj', 'MDIA'), - ('sel_subj', 'MTEC'), - ('sel_subj', 'MUS'), - ('sel_subj', 'NCBS'), - ('sel_subj', 'NCBH'), - ('sel_subj', 'NCEL'), - ('sel_subj', 'NCLA'), - ('sel_subj', 'NCP'), - ('sel_subj', 'NCSV'), - ('sel_subj', 'P A'), - ('sel_subj', 'PHT'), - ('sel_subj', 'PHIL'), - ('sel_subj', 'PHOT'), - ('sel_subj', 'PHED'), - ('sel_subj', 'PHDA'), - ('sel_subj', 'PSE'), - ('sel_subj', 'PHYS'), - ('sel_subj', 'POLI'), - ('sel_subj', 'PCA'), - ('sel_subj', 'PSYC'), - ('sel_subj', 'R T'), - ('sel_subj', 'RSPT'), - ('sel_subj', 'SOSC'), - ('sel_subj', 'SOC'), - ('sel_subj', 'SPAN'), - ('sel_subj', 'THTR'), - ('sel_subj', 'V T'), - ('sel_subj', 'VITI'), - ('sel_subj', 'WMN'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), @@ -164,7 +195,8 @@ def mine(term, write=False): ('path', '1'), ] - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', + headers=headers, cookies=cookies, data=data) res.raise_for_status() if write: @@ -187,52 +219,30 @@ def advanced_parse(content, db): table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') - for tr in table_rows[:5]: - td = tr.find_all('td', {'class': 'dddefault'}) - print(td) - - -def parse(content, db): - ''' - Parse takes the content from the request and then populates the database with the data - :param content: (html) The html containing the courses - :param db: (TinyDB) the current database - ''' - soup = BeautifulSoup(content, 'html5lib') - - tables = soup.find_all('table', {'class': 'TblCourses'}) - for t in tables: - dept = t['dept'].replace(' ', '') - dept_desc = t['dept-desc'] - rows = t.find_all('tr', {'class': 'CourseRow'}) - s = defaultdict(lambda: defaultdict(list)) - for r in rows: - cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + for tr in table_rows: + cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) - if cols: - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = a.get_text() if a else cols[i].get_text() + if cols and len(cols) >= len(ADVANCED_HEADERS): + s = 
defaultdict(lambda: defaultdict(list)) - try: - key = get_key(f'{cols[0] if cols[0] else cols[1]}')[0] - data = dict(zip(ADVANCED_HEADERS, cols)) + for i, c in enumerate(cols[1:]): + a = c.find('a') + cols[i] = a.get_text() if a else cols[i].get_text() - crn = data['CRN'] - if s[key][crn]: - comb = set(s[key][crn][0].items()) ^ set(data.items()) - if not comb: - continue + try: + data = dict(zip(ADVANCED_HEADERS, cols)) - data['units'] = data['units'].lstrip() + key = data['course'] + crn = data['CRN'] + data['units'] = data['units'].lstrip() - s[key][crn].append(data) - except KeyError: - continue + s[key][crn].append(data) + except KeyError: + continue - j = dict(s) - db.table(f'{dept}').insert(j) + j = dict(s) + db.table(f'{key}').insert(j) if __name__ == '__main__': diff --git a/scrape_term.py b/scrape_term.py index 4bcc7ab..bb2bef7 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -1,7 +1,7 @@ -from collections import defaultdict from os import makedirs, rename, remove from os.path import join, exists from re import match +from collections import defaultdict # 3rd party import requests diff --git a/selenium_login.py b/selenium_login.py new file mode 100644 index 0000000..8ab581d --- /dev/null +++ b/selenium_login.py @@ -0,0 +1,52 @@ +import os + +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from selenium.webdriver.chrome.options import Options + +chrome_options = Options() +chrome_options.add_argument("--window-size=300,400") + +driver = webdriver.Chrome(chrome_options=chrome_options) +driver.get("https://myportal.fhda.edu/cp/home/displaylogin") + + +def scrape_cookies(): + driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'") + driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'") + + try: + driver.execute_script("doLogin()") + WebDriverWait(driver, 10).until( + EC.title_is("MyPortal / Foothill-De Anza College District") + ) + + driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys% \ + 3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") + driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") + + WebDriverWait(driver, 10).until( + EC.title_is("MyPortal / Foothill-De Anza College District") + ) + finally: + cookies_list = driver.get_cookies() + return get_cookies(cookies_list) + + +def get_cookies(cookies_list): + cookies_dict = {} + for cookie in cookies_list: + cookies_dict[cookie['name']] = cookie['value'] + print(cookies_dict) + return cookies_dict + + +def kill_driver(): + driver.quit() + + +if __name__ == '__main__': + scrape_cookies() diff --git a/settings.py b/settings.py index d8f3a74..d45a26f 100644 --- a/settings.py +++ b/settings.py @@ -13,6 +13,7 @@ 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') +ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', + 'seats_cap', 'seats_act', 'seats', 
'wait_cap', 'wait_act', 'wait_seats', 'instructor', + 'date_range', 'location') PAST_TERM_CODES = {'fh': '201841'} From 564a09faf8cdf4bdd9fb8ec1dc70a4ed83d9f96a Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 11:59:56 -0700 Subject: [PATCH 03/19] scraper working for each quarter's data --- scrape_advanced.py | 259 +++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 128 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 4b4920b..2a3608c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,5 +1,6 @@ from os import makedirs, rename, remove from os.path import join, exists +from re import compile from collections import defaultdict from selenium_login import scrape_cookies, kill_driver @@ -15,25 +16,24 @@ def main(): if not exists(DB_DIR): makedirs(DB_DIR, exist_ok=True) - for term in PAST_TERM_CODES.values(): - temp_path = join(DB_DIR, 'temp.json') - temp = TinyDB(temp_path) + cookies = scrape_cookies() + try: + for term in PAST_TERM_CODES.values(): + temp_path = join(DB_DIR, 'temp.json') + temp = TinyDB(temp_path) - try: - cookies = scrape_cookies() content = mine(term, cookies) - except KeyboardInterrupt: - kill_driver() - return - finally: - kill_driver() advanced_parse(content, db=temp) - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): - remove(temp_path) + if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) - print(term, db.tables()) + db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + print(term, db.tables()) + except KeyboardInterrupt: + kill_driver() + finally: + kill_driver() def mine(term, cookies, write=False): ''' @@ -59,115 +59,82 @@ def mine(term, cookies, write=False): 'DNT': '1', } - # data = [ - # ('rsts', 'dummy'), - # ('crn', 'dummy'), - # ('term_in', f'{term}'), - # ('sel_subj', 'dummy'), - # ('sel_subj', 'ACTG'), - # ('sel_subj', 'ALCB'), - # ('sel_subj', 'ALTW'), - # ('sel_subj', 'AHS'), - # ('sel_subj', 'ANTH'), - # ('sel_subj', 'APSM'), - # ('sel_subj', 'ART'), - # ('sel_subj', 'ASTR'), - # ('sel_subj', 'ATHL'), - # ('sel_subj', 'BIOL'), - # ('sel_subj', 'BUSI'), - # ('sel_subj', 'CRLP'), - # ('sel_subj', 'CHEM'), - # ('sel_subj', 'CHLD'), - # ('sel_subj', 'COMM'), - # ('sel_subj', 'C S'), - # ('sel_subj', 'CNSL'), - # ('sel_subj', 'CRWR'), - # ('sel_subj', 'DANC'), - # ('sel_subj', 'D A'), - # ('sel_subj', 'D H'), - # ('sel_subj', 'DMS'), - # ('sel_subj', 'ECON'), - # ('sel_subj', 'EMS'), - # ('sel_subj', 'ENGR'), - # ('sel_subj', 'ENGL'), - # ('sel_subj', 'ESLL'), - # ('sel_subj', 'HORT'), - # ('sel_subj', 'JFS'), - # ('sel_subj', 'GEOG'), - # ('sel_subj', 'GIST'), - # ('sel_subj', 'GID'), - # ('sel_subj', 'HLTH'), - # ('sel_subj', 'HIST'), - # ('sel_subj', 'HUMN'), - # ('sel_subj', 'ITRN'), - # ('sel_subj', 'JAPN'), - # ('sel_subj', 'KINS'), - # ('sel_subj', 'L A'), - # ('sel_subj', 'LINC'), - # ('sel_subj', 'LIBR'), - # ('sel_subj', 'MATH'), - # ('sel_subj', 'MDIA'), - # ('sel_subj', 'MTEC'), - # ('sel_subj', 'MUS'), - # ('sel_subj', 'NCBS'), - # ('sel_subj', 'NCBH'), - # ('sel_subj', 'NCEL'), - # ('sel_subj', 'NCLA'), - # ('sel_subj', 'NCP'), - # ('sel_subj', 'NCSV'), - # ('sel_subj', 'P A'), - # ('sel_subj', 'PHT'), - # ('sel_subj', 'PHIL'), - # ('sel_subj', 'PHOT'), - # ('sel_subj', 'PHED'), - # ('sel_subj', 'PHDA'), - # ('sel_subj', 'PSE'), - # ('sel_subj', 'PHYS'), - # ('sel_subj', 'POLI'), - # ('sel_subj', 'PCA'), - # ('sel_subj', 'PSYC'), - # ('sel_subj', 'R T'), - # 
('sel_subj', 'RSPT'), - # ('sel_subj', 'SOSC'), - # ('sel_subj', 'SOC'), - # ('sel_subj', 'SPAN'), - # ('sel_subj', 'THTR'), - # ('sel_subj', 'V T'), - # ('sel_subj', 'VITI'), - # ('sel_subj', 'WMN'), - # ('sel_day', 'dummy'), - # ('sel_schd', 'dummy'), - # ('sel_insm', 'dummy'), - # ('sel_camp', 'dummy'), - # ('sel_camp', '%'), - # ('sel_levl', 'dummy'), - # ('sel_sess', 'dummy'), - # ('sel_sess', '%'), - # ('sel_instr', 'dummy'), - # ('sel_instr', '%'), - # ('sel_ptrm', 'dummy'), - # ('sel_ptrm', '%'), - # ('sel_attr', 'dummy'), - # ('sel_crse', ''), - # ('sel_title', ''), - # ('sel_from_cred', ''), - # ('sel_to_cred', ''), - # ('begin_hh', '0'), - # ('begin_mi', '0'), - # ('begin_ap', 'a'), - # ('end_hh', '0'), - # ('end_mi', '0'), - # ('end_ap', 'a'), - # ('SUB_BTN', 'Section Search'), - # ('path', '1'), - # ] - data = [ ('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy'), ('sel_subj', 'ACTG'), + ('sel_subj', 'ALCB'), + ('sel_subj', 'ALTW'), + ('sel_subj', 'AHS'), + ('sel_subj', 'ANTH'), + ('sel_subj', 'APSM'), + ('sel_subj', 'ART'), + ('sel_subj', 'ASTR'), + ('sel_subj', 'ATHL'), + ('sel_subj', 'BIOL'), + ('sel_subj', 'BUSI'), + ('sel_subj', 'CRLP'), + ('sel_subj', 'CHEM'), + ('sel_subj', 'CHLD'), + ('sel_subj', 'COMM'), + ('sel_subj', 'C S'), + ('sel_subj', 'CNSL'), + ('sel_subj', 'CRWR'), + ('sel_subj', 'DANC'), + ('sel_subj', 'D A'), + ('sel_subj', 'D H'), + ('sel_subj', 'DMS'), + ('sel_subj', 'ECON'), + ('sel_subj', 'EMS'), + ('sel_subj', 'ENGR'), + ('sel_subj', 'ENGL'), + ('sel_subj', 'ESLL'), + ('sel_subj', 'HORT'), + ('sel_subj', 'JFS'), + ('sel_subj', 'GEOG'), + ('sel_subj', 'GIST'), + ('sel_subj', 'GID'), + ('sel_subj', 'HLTH'), + ('sel_subj', 'HIST'), + ('sel_subj', 'HUMN'), + ('sel_subj', 'ITRN'), + ('sel_subj', 'JAPN'), + ('sel_subj', 'KINS'), + ('sel_subj', 'L A'), + ('sel_subj', 'LINC'), + ('sel_subj', 'LIBR'), + ('sel_subj', 'MATH'), + ('sel_subj', 'MDIA'), + ('sel_subj', 'MTEC'), + ('sel_subj', 'MUS'), + ('sel_subj', 'NCBS'), + ('sel_subj', 'NCBH'), + ('sel_subj', 'NCEL'), + ('sel_subj', 'NCLA'), + ('sel_subj', 'NCP'), + ('sel_subj', 'NCSV'), + ('sel_subj', 'P A'), + ('sel_subj', 'PHT'), + ('sel_subj', 'PHIL'), + ('sel_subj', 'PHOT'), + ('sel_subj', 'PHED'), + ('sel_subj', 'PHDA'), + ('sel_subj', 'PSE'), + ('sel_subj', 'PHYS'), + ('sel_subj', 'POLI'), + ('sel_subj', 'PCA'), + ('sel_subj', 'PSYC'), + ('sel_subj', 'R T'), + ('sel_subj', 'RSPT'), + ('sel_subj', 'SOSC'), + ('sel_subj', 'SOC'), + ('sel_subj', 'SPAN'), + ('sel_subj', 'THTR'), + ('sel_subj', 'V T'), + ('sel_subj', 'VITI'), + ('sel_subj', 'WMN'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), @@ -195,6 +162,39 @@ def mine(term, cookies, write=False): ('path', '1'), ] + # data = [ + # ('rsts', 'dummy'), + # ('crn', 'dummy'), + # ('term_in', f'{term}'), + # ('sel_subj', 'dummy'), + # ('sel_subj', 'ACTG'), + # ('sel_day', 'dummy'), + # ('sel_schd', 'dummy'), + # ('sel_insm', 'dummy'), + # ('sel_camp', 'dummy'), + # ('sel_camp', '%'), + # ('sel_levl', 'dummy'), + # ('sel_sess', 'dummy'), + # ('sel_sess', '%'), + # ('sel_instr', 'dummy'), + # ('sel_instr', '%'), + # ('sel_ptrm', 'dummy'), + # ('sel_ptrm', '%'), + # ('sel_attr', 'dummy'), + # ('sel_crse', ''), + # ('sel_title', ''), + # ('sel_from_cred', ''), + # ('sel_to_cred', ''), + # ('begin_hh', '0'), + # ('begin_mi', '0'), + # ('begin_ap', 'a'), + # ('end_hh', '0'), + # ('end_mi', '0'), + # ('end_ap', 'a'), + # ('SUB_BTN', 'Section Search'), + # ('path', '1'), + # ] + res = 
requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) res.raise_for_status() @@ -226,23 +226,26 @@ def advanced_parse(content, db): if cols and len(cols) >= len(ADVANCED_HEADERS): s = defaultdict(lambda: defaultdict(list)) - for i, c in enumerate(cols[1:]): + for i, c in enumerate(cols): a = c.find('a') - cols[i] = a.get_text() if a else cols[i].get_text() + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - try: - data = dict(zip(ADVANCED_HEADERS, cols)) + data = dict(zip(ADVANCED_HEADERS, cols)) - key = data['course'] - crn = data['CRN'] - data['units'] = data['units'].lstrip() + subject = data['subject'] + key = data['course'] + crn = data['CRN'] - s[key][crn].append(data) - except KeyError: - continue + s[key][crn].append(data) j = dict(s) - db.table(f'{key}').insert(j) + db.table(f'{subject}').insert(j) + + +def get_parsed_text(tag): + text = tag.get_text() + p = compile(r'<.*?>') + return p.sub('', text) if __name__ == '__main__': From b8e6ddf084f63901f1001762aea1b92c44cebc4d Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 13:39:07 -0700 Subject: [PATCH 04/19] scraping working for one quarter. hybrid classes handles --- scrape_advanced.py | 112 +++++++++++++++++++++------------------------ scrape_term.py | 4 +- settings.py | 3 +- 3 files changed, 56 insertions(+), 63 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 2a3608c..749a80e 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -5,36 +5,38 @@ from selenium_login import scrape_cookies, kill_driver from scrape_term import get_key -from settings import DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE +from settings import OLD_DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB + def main(): - if not exists(DB_DIR): - makedirs(DB_DIR, exist_ok=True) + if not exists(OLD_DB_DIR): + makedirs(OLD_DB_DIR, exist_ok=True) cookies = scrape_cookies() try: for term in PAST_TERM_CODES.values(): - temp_path = join(DB_DIR, 'temp.json') + temp_path = join(OLD_DB_DIR, 'temp.json') temp = TinyDB(temp_path) - content = mine(term, cookies) + content = mine(term, cookies, write=False) advanced_parse(content, db=temp) - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) except KeyboardInterrupt: kill_driver() finally: kill_driver() + def mine(term, cookies, write=False): ''' Mine will hit the database for foothill's class listings @@ -162,39 +164,6 @@ def mine(term, cookies, write=False): ('path', '1'), ] - # data = [ - # ('rsts', 'dummy'), - # ('crn', 'dummy'), - # ('term_in', f'{term}'), - # ('sel_subj', 'dummy'), - # ('sel_subj', 'ACTG'), - # ('sel_day', 'dummy'), - # ('sel_schd', 'dummy'), - # ('sel_insm', 'dummy'), - # ('sel_camp', 'dummy'), - # ('sel_camp', '%'), - # ('sel_levl', 'dummy'), - # ('sel_sess', 'dummy'), - # ('sel_sess', '%'), - # ('sel_instr', 'dummy'), - # ('sel_instr', '%'), - # ('sel_ptrm', 'dummy'), - # ('sel_ptrm', '%'), - # ('sel_attr', 'dummy'), - # ('sel_crse', ''), - # ('sel_title', ''), - # ('sel_from_cred', ''), - # ('sel_to_cred', ''), - # ('begin_hh', '0'), - # ('begin_mi', '0'), - # ('begin_ap', 'a'), - # ('end_hh', '0'), - # ('end_mi', '0'), - # ('end_ap', 'a'), - # 
('SUB_BTN', 'Section Search'), - # ('path', '1'), - # ] - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', headers=headers, cookies=cookies, data=data) res.raise_for_status() @@ -208,6 +177,10 @@ def mine(term, cookies, write=False): return res.content +class BlankRow(Exception): + pass + + def advanced_parse(content, db): ''' Advanced parse takes the content from the request and then populates the database with the data @@ -220,26 +193,45 @@ def advanced_parse(content, db): table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') - for tr in table_rows: - cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) - - if cols and len(cols) >= len(ADVANCED_HEADERS): - s = defaultdict(lambda: defaultdict(list)) - - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - - data = dict(zip(ADVANCED_HEADERS, cols)) - - subject = data['subject'] - key = data['course'] - crn = data['CRN'] - - s[key][crn].append(data) - - j = dict(s) - db.table(f'{subject}').insert(j) + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + try: + cols = tr.find_all('td', {'class': 'dddefault'}) + + if len(cols) > 0: + s = defaultdict(lambda: defaultdict(list)) + + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 + + if num_blank > len(cols) - num_blank: + raise BlankRow + + data = dict(zip(table_headers, cols)) + + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] + + s[key][crn].append(data) + + j = dict(s) + db.table(f'{subject}').insert(j) + except BlankRow: + continue def get_parsed_text(tag): diff --git a/scrape_term.py b/scrape_term.py index bb2bef7..6fab54e 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -78,8 +78,8 @@ def parse(content, db): rows = t.find_all('tr', {'class': 'CourseRow'}) s = defaultdict(lambda: defaultdict(list)) - for r in rows: - cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) + for tr in rows: + cols = tr.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace()) if cols: for i, c in enumerate(cols): diff --git a/settings.py b/settings.py index d45a26f..fbc3ba2 100644 --- a/settings.py +++ b/settings.py @@ -2,6 +2,7 @@ ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) DB_DIR = os.path.join(ROOT_DIR, 'db') +OLD_DB_DIR = os.path.join(DB_DIR, 'old') TEST_DIR = os.path.join(ROOT_DIR, 'tests') TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db') @@ -16,4 +17,4 @@ ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', 'seats_cap', 'seats_act', 'seats', 'wait_cap', 'wait_act', 'wait_seats', 'instructor', 'date_range', 'location') -PAST_TERM_CODES = {'fh': '201841'} +PAST_TERM_CODES = {'fh': '201831'} From e442b8fbaaa76ed0b82e1a7ff7028b088e09af00 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 6 Jun 2018 22:20:31 -0700 Subject: [PATCH 05/19] refined scraper --- scrape_advanced.py | 121 +++++++++++++++++++++++++++------------------ selenium_login.py | 1 - 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 749a80e..e0dc6eb 
100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -4,35 +4,44 @@ from collections import defaultdict from selenium_login import scrape_cookies, kill_driver -from scrape_term import get_key -from settings import OLD_DB_DIR, ADVANCED_HEADERS, PAST_TERM_CODES, SCHEDULE +from settings import OLD_DB_DIR, SCHEDULE # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB +CAMPUS_RANGE = (1, 2) +YEAR_RANGE = (8, 0) +QUARTER_RANGE = (4, 1) def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) + codes = generate_term_codes() + print(codes) + cookies = scrape_cookies() + print(cookies) + + temp_path = join(OLD_DB_DIR, 'temp.json') + try: - for term in PAST_TERM_CODES.values(): - temp_path = join(OLD_DB_DIR, 'temp.json') + for term in codes: temp = TinyDB(temp_path) content = mine(term, cookies, write=False) - advanced_parse(content, db=temp) + if not advanced_parse(content,db=temp): + continue - if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): - remove(temp_path) + rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) print(term, db.tables()) except KeyboardInterrupt: kill_driver() + remove(temp_path) finally: kill_driver() @@ -190,48 +199,62 @@ def advanced_parse(content, db): ''' soup = BeautifulSoup(content, 'html5lib') - table = soup.find('table', {'class': 'datadisplaytable'}) - table_rows = table.find_all('tr') - - table_headers = list() - start_idx = 0 - for i, tr in enumerate(table_rows): - header_cols = tr.find_all('th', {'class': 'ddheader'}) - for th in header_cols: - table_headers.append(get_parsed_text(th)) - if table_headers: - start_idx = i - break - - for tr in table_rows[start_idx:]: - try: - cols = tr.find_all('td', {'class': 'dddefault'}) - - if len(cols) > 0: - s = defaultdict(lambda: defaultdict(list)) - - num_blank = 0 - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - if cols[i].isspace(): - num_blank += 1 - - if num_blank > len(cols) - num_blank: - raise BlankRow - - data = dict(zip(table_headers, cols)) - - subject = data['Subj'] - key = data['Crse'] - crn = data['CRN'] - - s[key][crn].append(data) - - j = dict(s) - db.table(f'{subject}').insert(j) - except BlankRow: - continue + try: + table = soup.find('table', {'class': 'datadisplaytable'}) + table_rows = table.find_all('tr') + + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + try: + cols = tr.find_all('td', {'class': 'dddefault'}) + + if len(cols) > 0: + s = defaultdict(lambda: defaultdict(list)) + + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 + + if num_blank > len(cols) - num_blank: + raise BlankRow + + data = dict(zip(table_headers, cols)) + + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] + + s[key][crn].append(data) + + j = dict(s) + db.table(f'{subject}').insert(j) + except BlankRow: + continue + except AttributeError as e: + print(e) + return False + return True + + +def generate_term_codes(): + codes = [] + for i in range(YEAR_RANGE[0], YEAR_RANGE[1], -1): + for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1], -1): + for k in 
range(CAMPUS_RANGE[0], CAMPUS_RANGE[1], 1): + codes.append(f'201{i}{j}{k}') + return codes def get_parsed_text(tag): diff --git a/selenium_login.py b/selenium_login.py index 8ab581d..3c09c02 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -40,7 +40,6 @@ def get_cookies(cookies_list): cookies_dict = {} for cookie in cookies_list: cookies_dict[cookie['name']] = cookie['value'] - print(cookies_dict) return cookies_dict From da81de16a4680c73b2debe24ea1e80f0a05afb06 Mon Sep 17 00:00:00 2001 From: phi-line Date: Thu, 7 Jun 2018 22:21:44 -0700 Subject: [PATCH 06/19] added some feedback to the scraping process --- scrape_advanced.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index e0dc6eb..0369e16 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,3 +1,4 @@ +import sys from os import makedirs, rename, remove from os.path import join, exists from re import compile @@ -12,8 +13,8 @@ from tinydb import TinyDB CAMPUS_RANGE = (1, 2) -YEAR_RANGE = (8, 0) -QUARTER_RANGE = (4, 1) +YEAR_RANGE = (0, 8) +QUARTER_RANGE = (1, 4) def main(): if not exists(OLD_DB_DIR): @@ -29,16 +30,20 @@ def main(): try: for term in codes: + sys.stdout.write(f'[{term}] | Scraping…\r') + sys.stdout.flush() + temp = TinyDB(temp_path) content = mine(term, cookies, write=False) - if not advanced_parse(content,db=temp): + if not advanced_parse(content, db=temp, term=term): continue rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) - print(term, db.tables()) + print(f'[{term}] | ', db.tables()) + except KeyboardInterrupt: kill_driver() remove(temp_path) @@ -190,7 +195,7 @@ class BlankRow(Exception): pass -def advanced_parse(content, db): +def advanced_parse(content, db, term=''): ''' Advanced parse takes the content from the request and then populates the database with the data :param content: (html) The html containing the courses @@ -243,16 +248,16 @@ def advanced_parse(content, db): except BlankRow: continue except AttributeError as e: - print(e) + print(f'[{term}] | ERROR: {e}') return False return True def generate_term_codes(): codes = [] - for i in range(YEAR_RANGE[0], YEAR_RANGE[1], -1): - for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1], -1): - for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1], 1): + for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): + for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): + for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1): codes.append(f'201{i}{j}{k}') return codes From 920cab8b883a9cbea991b8ba9c3633785dfacbf7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 8 Jun 2018 17:14:52 -0700 Subject: [PATCH 07/19] refined scraper to use dynamic department codes --- scrape_advanced.py | 174 ++++++++++++--------------------------------- selenium_login.py | 8 +-- settings.py | 11 +-- 3 files changed, 57 insertions(+), 136 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 0369e16..c86852c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -5,7 +5,7 @@ from collections import defaultdict from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE +from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA # 3rd party import requests @@ -13,9 +13,10 @@ from tinydb import TinyDB CAMPUS_RANGE = (1, 2) -YEAR_RANGE = (0, 8) +YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) + def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) @@ -35,7 
+36,11 @@ def main(): temp = TinyDB(temp_path) - content = mine(term, cookies, write=False) + dept_data = mine_dept_data(term, cookies, write=False) + sys.stdout.write(f'[{term}] | Mining Depts… {[dept[1] for dept in dept_data]}\r') + sys.stdout.flush() + + content = mine_table_data(term, dept_data, cookies, write=False) if not advanced_parse(content, db=temp, term=term): continue @@ -51,7 +56,32 @@ def main(): kill_driver() -def mine(term, cookies, write=False): +def mine_dept_data(term, cookies, write=False): + import requests + + data = [('p_calling_proc', 'bwckschd.p_disp_dyn_sched'), ('p_term', f'{term}')] + + res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) + res.raise_for_status() + + if write: + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if not chunk: + break + + file.write(chunk) + file.flush() + + soup = BeautifulSoup(res.content, "html5lib") + select = soup.find('select', {'id': 'subj_id'}) + options = select.find_all('option') + + data = [('sel_subj', o['value']) for o in options] + return data + + +def mine_table_data(term, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine @@ -60,133 +90,23 @@ def mine(term, cookies, write=False): :return res.content: (json) the html body ''' - headers = { - 'Connection': 'keep-alive', - 'Pragma': 'no-cache', - 'Cache-Control': 'no-cache', - 'Origin': 'https://banssb.fhda.edu', - 'Upgrade-Insecure-Requests': '1', - 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Referer': 'https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9,la;q=0.8', - 'DNT': '1', - } - - data = [ - ('rsts', 'dummy'), - ('crn', 'dummy'), - ('term_in', f'{term}'), - ('sel_subj', 'dummy'), - ('sel_subj', 'ACTG'), - ('sel_subj', 'ALCB'), - ('sel_subj', 'ALTW'), - ('sel_subj', 'AHS'), - ('sel_subj', 'ANTH'), - ('sel_subj', 'APSM'), - ('sel_subj', 'ART'), - ('sel_subj', 'ASTR'), - ('sel_subj', 'ATHL'), - ('sel_subj', 'BIOL'), - ('sel_subj', 'BUSI'), - ('sel_subj', 'CRLP'), - ('sel_subj', 'CHEM'), - ('sel_subj', 'CHLD'), - ('sel_subj', 'COMM'), - ('sel_subj', 'C S'), - ('sel_subj', 'CNSL'), - ('sel_subj', 'CRWR'), - ('sel_subj', 'DANC'), - ('sel_subj', 'D A'), - ('sel_subj', 'D H'), - ('sel_subj', 'DMS'), - ('sel_subj', 'ECON'), - ('sel_subj', 'EMS'), - ('sel_subj', 'ENGR'), - ('sel_subj', 'ENGL'), - ('sel_subj', 'ESLL'), - ('sel_subj', 'HORT'), - ('sel_subj', 'JFS'), - ('sel_subj', 'GEOG'), - ('sel_subj', 'GIST'), - ('sel_subj', 'GID'), - ('sel_subj', 'HLTH'), - ('sel_subj', 'HIST'), - ('sel_subj', 'HUMN'), - ('sel_subj', 'ITRN'), - ('sel_subj', 'JAPN'), - ('sel_subj', 'KINS'), - ('sel_subj', 'L A'), - ('sel_subj', 'LINC'), - ('sel_subj', 'LIBR'), - ('sel_subj', 'MATH'), - ('sel_subj', 'MDIA'), - ('sel_subj', 'MTEC'), - ('sel_subj', 'MUS'), - ('sel_subj', 'NCBS'), - ('sel_subj', 'NCBH'), - ('sel_subj', 'NCEL'), - ('sel_subj', 'NCLA'), - ('sel_subj', 'NCP'), - ('sel_subj', 'NCSV'), - ('sel_subj', 'P A'), - ('sel_subj', 'PHT'), - ('sel_subj', 'PHIL'), - ('sel_subj', 'PHOT'), - ('sel_subj', 'PHED'), - ('sel_subj', 'PHDA'), - ('sel_subj', 'PSE'), - ('sel_subj', 'PHYS'), - ('sel_subj', 'POLI'), - ('sel_subj', 'PCA'), - ('sel_subj', 'PSYC'), - ('sel_subj', 'R 
T'), - ('sel_subj', 'RSPT'), - ('sel_subj', 'SOSC'), - ('sel_subj', 'SOC'), - ('sel_subj', 'SPAN'), - ('sel_subj', 'THTR'), - ('sel_subj', 'V T'), - ('sel_subj', 'VITI'), - ('sel_subj', 'WMN'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_camp', '%'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_sess', '%'), - ('sel_instr', 'dummy'), - ('sel_instr', '%'), - ('sel_ptrm', 'dummy'), - ('sel_ptrm', '%'), - ('sel_attr', 'dummy'), - ('sel_crse', ''), - ('sel_title', ''), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1'), - ] - - res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', - headers=headers, cookies=cookies, data=data) + data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] + + data.extend(dept_data) + + data.extend(ADVANCED_FORM_DATA) + + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() if write: - with open(f'{join(DB_DIR, SCHEDULE)}', "wb") as file: + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: for chunk in res.iter_content(chunk_size=512): - if chunk: - file.write(chunk) + if not chunk: + break + + file.write(chunk) + file.flush() return res.content diff --git a/selenium_login.py b/selenium_login.py index 3c09c02..b100a24 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -20,16 +20,14 @@ def scrape_cookies(): try: driver.execute_script("doLogin()") - WebDriverWait(driver, 10).until( + WebDriverWait(driver, 5).until( EC.title_is("MyPortal / Foothill-De Anza College District") ) - driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys% \ - 3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") - WebDriverWait(driver, 10).until( - EC.title_is("MyPortal / Foothill-De Anza College District") + WebDriverWait(driver, 5).until( + EC.presence_of_element_located((By.ID, "ssbbackurl")) ) finally: cookies_list = driver.get_cookies() diff --git a/settings.py b/settings.py index fbc3ba2..1201dcb 100644 --- a/settings.py +++ b/settings.py @@ -10,11 +10,14 @@ DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" SCHEDULE = 'schedule.html' +SEARCH = 'search.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_HEADERS = ('select', 'CRN', 'subject', 'section', 'course', 'campus', 'units', 'title', 'days', 'time', - 'seats_cap', 'seats_act', 'seats', 'wait_cap', 'wait_act', 'wait_seats', 'instructor', - 'date_range', 'location') -PAST_TERM_CODES = {'fh': '201831'} +ADVANCED_FORM_DATA = [('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), + ('sel_camp', '%'), ('sel_levl', 'dummy'), ('sel_sess', 'dummy'), ('sel_sess', '%'), + ('sel_instr', 'dummy'), ('sel_instr', '%'), ('sel_ptrm', 'dummy'), ('sel_ptrm', '%'), + ('sel_attr', 'dummy'), ('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), + 
('sel_to_cred', ''), ('begin_hh', '0'), ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), + ('end_mi', '0'), ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] From 5119fc72916bfb6b8a7b9a39da49a408b44f5a77 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 8 Jun 2018 18:26:36 -0700 Subject: [PATCH 08/19] made things look pretty (important) --- Pipfile | 1 + Pipfile.lock | 21 +++++++---- scrape_advanced.py | 86 +++++++++++++++++++++++++++------------------- selenium_login.py | 4 +-- 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/Pipfile b/Pipfile index 5122131..1e5b2c2 100644 --- a/Pipfile +++ b/Pipfile @@ -17,6 +17,7 @@ pytest = "*" flask = "*" pylint = "*" selenium = "*" +colorama = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 532c0c6..97f0feb 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "6098e2ee7f95b6c25b6293ccd2279203e4a83cbe3fe4660b3d9e3a39ea93de71" + "sha256": "4811f7bec1a856e27bc2c2fff11d68d045e2c5eaa2d2a0c04e1e3bfc7125ea58" }, "host-environment-markers": { "implementation_name": "cpython", @@ -31,10 +31,10 @@ "default": { "astroid": { "hashes": [ - "sha256:032f6e09161e96f417ea7fad46d3fac7a9019c775f202182c22df0e4f714cb1c", - "sha256:dea42ae6e0b789b543f728ddae7ddb6740ba33a49fb52c4a4d9cb7bb4aa6ec09" + "sha256:0ef2bf9f07c3150929b25e8e61b5198c27b0dca195e156f0e4d5bdd89185ca1a", + "sha256:fc9b582dba0366e63540982c3944a9230cbc6f303641c51483fa547dcc22393a" ], - "version": "==1.6.4" + "version": "==1.6.5" }, "atomicwrites": { "hashes": [ @@ -79,6 +79,13 @@ ], "version": "==6.7" }, + "colorama": { + "hashes": [ + "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", + "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" + ], + "version": "==0.3.9" + }, "dateparser": { "hashes": [ "sha256:b452ef8b36cd78ae86a50721794bc674aa3994e19b570f7ba92810f4e0a2ae03", @@ -235,10 +242,10 @@ }, "pylint": { "hashes": [ - "sha256:aa519865f8890a5905fa34924fed0f3bfc7d84fc9f9142c16dac52ffecd25a39", - "sha256:c353d8225195b37cc3aef18248b8f3fe94c5a6a95affaf885ae21a24ca31d8eb" + "sha256:a48070545c12430cfc4e865bf62f5ad367784765681b3db442d8230f0960aa3c", + "sha256:fff220bcb996b4f7e2b0f6812fd81507b72ca4d8c4d05daf2655c333800cb9b3" ], - "version": "==1.9.1" + "version": "==1.9.2" }, "pytest": { "hashes": [ diff --git a/scrape_advanced.py b/scrape_advanced.py index c86852c..6e0a6ef 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -4,14 +4,16 @@ from re import compile from collections import defaultdict -from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA - # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB +from selenium_login import scrape_cookies, kill_driver +from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA + +from colorama import init, Fore, Style + CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) @@ -22,23 +24,23 @@ def main(): makedirs(OLD_DB_DIR, exist_ok=True) codes = generate_term_codes() - print(codes) + print(f'Loaded {len(codes)} term codes') + + print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() - print(cookies) + print(f"Scraped session cookie {cookies['CPSESSID']}", end=f"\n{'-'*79}\n") temp_path = join(OLD_DB_DIR, 'temp.json') try: for term in codes: - sys.stdout.write(f'[{term}] | Scraping…\r') - sys.stdout.flush() + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Scraping…\r") temp = TinyDB(temp_path) - 
dept_data = mine_dept_data(term, cookies, write=False) - sys.stdout.write(f'[{term}] | Mining Depts… {[dept[1] for dept in dept_data]}\r') - sys.stdout.flush() + dept_data = mine_dept_data(term, write=False) + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") content = mine_table_data(term, dept_data, cookies, write=False) if not advanced_parse(content, db=temp, term=term): @@ -47,7 +49,9 @@ def main(): rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) - print(f'[{term}] | ', db.tables()) + + num_courses = sum([len(db.table(t).all()) for t in db.tables()]) + print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") except KeyboardInterrupt: kill_driver() @@ -56,22 +60,19 @@ def main(): kill_driver() -def mine_dept_data(term, cookies, write=False): - import requests - +def mine_dept_data(term, write=False): + ''' + Mine dept data will grab the department IDs for a given quarter. + :param term: (str) the term to mine + :param write: (bool) write to file? + :return data (list(tuple)) the html body + ''' data = [('p_calling_proc', 'bwckschd.p_disp_dyn_sched'), ('p_term', f'{term}')] res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - if write: - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: - for chunk in res.iter_content(chunk_size=512): - if not chunk: - break - - file.write(chunk) - file.flush() + write and write_to_file(res) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -89,7 +90,6 @@ def mine_table_data(term, dept_data, cookies, write=False): :param write: (bool) write to file? :return res.content: (json) the html body ''' - data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] data.extend(dept_data) @@ -99,22 +99,11 @@ def mine_table_data(term, dept_data, cookies, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() - if write: - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: - for chunk in res.iter_content(chunk_size=512): - if not chunk: - break - - file.write(chunk) - file.flush() + write and write_to_file(res) return res.content -class BlankRow(Exception): - pass - - def advanced_parse(content, db, term=''): ''' Advanced parse takes the content from the request and then populates the database with the data @@ -168,7 +157,7 @@ def advanced_parse(content, db, term=''): except BlankRow: continue except AttributeError as e: - print(f'[{term}] | ERROR: {e}') + print(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] {e}") return False return True @@ -182,11 +171,36 @@ def generate_term_codes(): return codes +class BlankRow(Exception): + pass + + def get_parsed_text(tag): text = tag.get_text() p = compile(r'<.*?>') return p.sub('', text) +def print_c(message): + sys.stdout.write('\x1b[2K') + sys.stdout.write(message) + sys.stdout.flush() + + +def color(c, word): + return f'{c}{word}{Style.RESET_ALL}' + + +def write_to_file(res): + with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: + for chunk in res.iter_content(chunk_size=512): + if not chunk: + break + + file.write(chunk) + file.flush() + + if __name__ == '__main__': + init() main() diff --git a/selenium_login.py b/selenium_login.py index b100a24..c6d654d 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -20,13 +20,13 @@ def 
scrape_cookies(): try: driver.execute_script("doLogin()") - WebDriverWait(driver, 5).until( + WebDriverWait(driver, 3).until( EC.title_is("MyPortal / Foothill-De Anza College District") ) driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search") - WebDriverWait(driver, 5).until( + WebDriverWait(driver, 3).until( EC.presence_of_element_located((By.ID, "ssbbackurl")) ) finally: From d5e1d9f36d46d1f07d3199fc2696baacdbedc79b Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 16:15:47 -0700 Subject: [PATCH 09/19] added alternate form field format. fixes 98% of cases --- scrape_advanced.py | 42 +++++++++++++++++++---------- settings.py | 67 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 20 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 6e0a6ef..10c3f77 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -10,7 +10,7 @@ from tinydb import TinyDB from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, SCHEDULE, ADVANCED_FORM_DATA +from settings import OLD_DB_DIR, ADVANCED_FORM_DATA from colorama import init, Fore, Style @@ -23,6 +23,9 @@ def main(): if not exists(OLD_DB_DIR): makedirs(OLD_DB_DIR, exist_ok=True) + if not exists(join(OLD_DB_DIR, 'html')): + makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) + codes = generate_term_codes() print(f'Loaded {len(codes)} term codes') @@ -42,16 +45,25 @@ def main(): dept_data = mine_dept_data(term, write=False) print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") - content = mine_table_data(term, dept_data, cookies, write=False) - if not advanced_parse(content, db=temp, term=term): - continue + failed = False + for idx, variant in enumerate(ADVANCED_FORM_DATA): + content = mine_table_data(term, variant, dept_data, cookies, write=True) + if advanced_parse(content, db=temp, term=term): + break + elif idx == len(ADVANCED_FORM_DATA) - 1: + failed = True - rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')) + if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')): + remove(temp_path) db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) - print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") + + if failed: + print_c(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] Payload failed…\n") + else: + print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") except KeyboardInterrupt: kill_driver() @@ -72,7 +84,7 @@ def mine_dept_data(term, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - write and write_to_file(res) + write and write_to_file(res, term) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -82,7 +94,7 @@ def mine_dept_data(term, write=False): return data -def mine_table_data(term, dept_data, cookies, write=False): +def mine_table_data(term, payload, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine @@ -90,16 +102,18 @@ def mine_table_data(term, dept_data, cookies, write=False): :param write: (bool) write to file? 
:return res.content: (json) the html body ''' - data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}'), ('sel_subj', 'dummy')] + data = [('rsts', 'dummy'), ('crn', 'dummy'), ('term_in', f'{term}')] + + data.extend(payload[0]) data.extend(dept_data) - data.extend(ADVANCED_FORM_DATA) + data.extend(payload[1]) res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) res.raise_for_status() - write and write_to_file(res) + write and write_to_file(res, term) return res.content @@ -157,7 +171,6 @@ def advanced_parse(content, db, term=''): except BlankRow: continue except AttributeError as e: - print(f" [{term}] [{color(Fore.RED, 'ERROR!!')}] {e}") return False return True @@ -191,8 +204,9 @@ def color(c, word): return f'{c}{word}{Style.RESET_ALL}' -def write_to_file(res): - with open(f'{join(OLD_DB_DIR, SCHEDULE)}', "wb") as file: +def write_to_file(res, term): + + with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file: for chunk in res.iter_content(chunk_size=512): if not chunk: break diff --git a/settings.py b/settings.py index 1201dcb..155682d 100644 --- a/settings.py +++ b/settings.py @@ -15,9 +15,64 @@ 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} -ADVANCED_FORM_DATA = [('sel_day', 'dummy'), ('sel_schd', 'dummy'), ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), - ('sel_camp', '%'), ('sel_levl', 'dummy'), ('sel_sess', 'dummy'), ('sel_sess', '%'), - ('sel_instr', 'dummy'), ('sel_instr', '%'), ('sel_ptrm', 'dummy'), ('sel_ptrm', '%'), - ('sel_attr', 'dummy'), ('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), - ('sel_to_cred', ''), ('begin_hh', '0'), ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), - ('end_mi', '0'), ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] +ADVANCED_FORM_DATA = [ + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_sess', '%'), + ('sel_instr', '%'), + ('sel_ptrm', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] + ], + + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_schd', '%'), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_instr', '%'), + ('sel_sess', '%'), + ('sel_ptrm', '%'), + ('sel_attr', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] + ] +] From c9edb41b6ba46755a02914a3b9d46277c90774f3 Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 16:20:10 -0700 Subject: [PATCH 10/19] removed html files from .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 56ac861..316e78c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 
@@ __pycache__/ .idea/ *.DS_Store *.json -schedule.html +*.html frontend/static/README.md # pytest From 21a397bc0716dac6fdc5e3829ce6ad3cf894987a Mon Sep 17 00:00:00 2001 From: phi-line Date: Sun, 10 Jun 2018 18:53:52 -0700 Subject: [PATCH 11/19] fixed edgecase for past quarters --- settings.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/settings.py b/settings.py index 155682d..821401e 100644 --- a/settings.py +++ b/settings.py @@ -74,5 +74,37 @@ ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + + [ + [('sel_subj', 'dummy'), + ('sel_day', 'dummy'), + ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), + ('sel_camp', 'dummy'), + ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), + ('sel_instr', 'dummy'), + ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), + ('sel_title', ''), + ('sel_schd', '%'), + ('sel_from_cred', ''), + ('sel_to_cred', ''), + ('sel_camp', '%'), + ('sel_levl', '%'), + ('sel_ptrm', '%'), + ('sel_instr', '%'), + ('sel_sess', '%'), + ('sel_attr', '%'), + ('begin_hh', '0'), + ('begin_mi', '0'), + ('begin_ap', 'a'), + ('end_hh', '0'), + ('end_mi', '0'), + ('end_ap', 'a'), + ('SUB_BTN', 'Section Search'), + ('path', '1')] ] ] From 2d498971d42a99a78f15b6081fbf99391dd1d3b5 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 00:44:56 -0700 Subject: [PATCH 12/19] increased pylint score --- scrape_advanced.py | 27 +++++++++++++++------------ selenium_login.py | 4 +++- server.py | 6 +++--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 10c3f77..fc98fd6 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -1,18 +1,17 @@ import sys from os import makedirs, rename, remove from os.path import join, exists -from re import compile from collections import defaultdict +import re # 3rd party import requests from bs4 import BeautifulSoup from tinydb import TinyDB - +from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, ADVANCED_FORM_DATA -from colorama import init, Fore, Style +from settings import OLD_DB_DIR, ADVANCED_FORM_DATA CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) @@ -27,12 +26,12 @@ def main(): makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) codes = generate_term_codes() - print(f'Loaded {len(codes)} term codes') + print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() - print(f"Scraped session cookie {cookies['CPSESSID']}", end=f"\n{'-'*79}\n") + print_c(f"Scraped session cookie {color(Fore.YELLOW, cookies['CPSESSID'])}\n{'-'*79}\n") temp_path = join(OLD_DB_DIR, 'temp.json') @@ -43,7 +42,8 @@ def main(): temp = TinyDB(temp_path) dept_data = mine_dept_data(term, write=False) - print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] Parsing {len(dept_data)} departments…\r") + print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] \ + Parsing {len(dept_data)} departments…\r") failed = False for idx, variant in enumerate(ADVANCED_FORM_DATA): @@ -84,7 +84,8 @@ def mine_dept_data(term, write=False): res = requests.post('https://banssb.fhda.edu/PROD/bwckgens.p_proc_term_date', data=data) res.raise_for_status() - write and write_to_file(res, term) + if write: + write_to_file(res, term) soup = BeautifulSoup(res.content, "html5lib") select = soup.find('select', {'id': 'subj_id'}) @@ -110,10 +111,12 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): data.extend(payload[1]) - res = 
requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', cookies=cookies, data=data) + res = requests.post('https://banssb.fhda.edu/PROD/bwskfcls.P_GetCrse_Advanced', + cookies=cookies, data=data) res.raise_for_status() - write and write_to_file(res, term) + if write: + write_to_file(res, term) return res.content @@ -145,7 +148,7 @@ def advanced_parse(content, db, term=''): try: cols = tr.find_all('td', {'class': 'dddefault'}) - if len(cols) > 0: + if cols: s = defaultdict(lambda: defaultdict(list)) num_blank = 0 @@ -190,7 +193,7 @@ class BlankRow(Exception): def get_parsed_text(tag): text = tag.get_text() - p = compile(r'<.*?>') + p = re.compile(r'<.*?>') return p.sub('', text) diff --git a/selenium_login.py b/selenium_login.py index c6d654d..617e435 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -18,6 +18,7 @@ def scrape_cookies(): driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'") driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'") + cookies_list = list() try: driver.execute_script("doLogin()") WebDriverWait(driver, 3).until( @@ -31,7 +32,8 @@ def scrape_cookies(): ) finally: cookies_list = driver.get_cookies() - return get_cookies(cookies_list) + + return get_cookies(cookies_list) def get_cookies(cookies_list): diff --git a/server.py b/server.py index ffaac31..23504c1 100644 --- a/server.py +++ b/server.py @@ -5,14 +5,14 @@ import itertools as itr import typing as ty -from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST - # 3rd party from flask import Flask, jsonify, request, render_template from tinydb import TinyDB from maya import when, MayaInterval -# Quart config +from settings import COURSE_PATTERN, DAYS_PATTERN, CAMPUS_LIST + +# Flask config def add_cors_headers(response): response.headers['Access-Control-Allow-Origin'] = '*' return response From 93ce5c6102a4bf7186d19bfd0cc08fddb0c533b7 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 01:37:08 -0700 Subject: [PATCH 13/19] cleaned up code and added docstrings --- scrape_advanced.py | 32 ++++++++++++++++++++++++++++++-- scrape_term.py | 19 +++---------------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index fc98fd6..4cbf5ef 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -47,7 +47,7 @@ def main(): failed = False for idx, variant in enumerate(ADVANCED_FORM_DATA): - content = mine_table_data(term, variant, dept_data, cookies, write=True) + content = mine_table_data(term, variant, dept_data, cookies, write=False) if advanced_parse(content, db=temp, term=term): break elif idx == len(ADVANCED_FORM_DATA) - 1: @@ -99,6 +99,8 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): ''' Mine will hit the database for foothill's class listings :param term: (str) the term to mine + :param payload: (str) data payload for request + :param dept_data: (str) department data payload :param cookies: (dict) cookies to send with POST :param write: (bool) write to file? 
:return res.content: (json) the html body @@ -179,6 +181,11 @@ def advanced_parse(content, db, term=''): def generate_term_codes(): + """ + This helper generates a list of term codes based on the ranges set by: + YEAR_RANGE, QUARTER_RANGE, CAMPUS_RANGE + :return: (list(str)) list of term codes + """ codes = [] for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): @@ -192,23 +199,44 @@ class BlankRow(Exception): def get_parsed_text(tag): + """ + Regex that strips all html tags and their contents + :param tag: (str) inner contents of parent tag + :return: (str) isolated text + """ text = tag.get_text() p = re.compile(r'<.*?>') return p.sub('', text) def print_c(message): + """ + Clears last carriage returned line and writes a new one + :param message: (str) + :return: None + """ sys.stdout.write('\x1b[2K') sys.stdout.write(message) sys.stdout.flush() def color(c, word): + """ + Format template that inserts a color for a given word + :param c: (Color) Color to format to + :param word: (str) Word to format + :return: (str) Formatted String + """ return f'{c}{word}{Style.RESET_ALL}' def write_to_file(res, term): - + """ + Writes a bytestream to a nested file directory + :param res: response object + :param term: term code + :return: None + """ with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file: for chunk in res.iter_content(chunk_size=512): if not chunk: diff --git a/scrape_term.py b/scrape_term.py index 6fab54e..ef12a45 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -36,22 +36,9 @@ def mine(term, write=False): :param write: (bool) write to file? :return res.content: (json) the html body ''' - headers = { - 'Origin': 'https://banssb.fhda.edu', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9', - 'User-Agent': 'FoothillAPI', - 'Content-Type': 'application/x-www-form-urlencoded', - 'Accept': 'text/html, */*; q=0.01', - 'Referer': 'https://banssb.fhda.edu/PROD/fhda_opencourses.P_Application', - 'X-Requested-With': 'XMLHttpRequest', - 'Connection': 'keep-alive', - } - - data = [('termcode', f'{term}'), ] - - res = requests.post('https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList', - headers=headers, data=data) + data = [('termcode', f'{term}')] + + res = requests.post('https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList', data=data) res.raise_for_status() if write: From 5227f68787eab7b7aa0e9358ee3cd6f21afbafc4 Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 02:04:07 -0700 Subject: [PATCH 14/19] removed need for 'old' subdirectory --- scrape_advanced.py | 12 ++++++------ settings.py | 3 --- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 4cbf5ef..64a15c9 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -11,7 +11,7 @@ from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import OLD_DB_DIR, ADVANCED_FORM_DATA +from settings import DB_DIR, ADVANCED_FORM_DATA CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) @@ -19,11 +19,11 @@ def main(): - if not exists(OLD_DB_DIR): - makedirs(OLD_DB_DIR, exist_ok=True) + if not exists(DB_DIR): + makedirs(DB_DIR, exist_ok=True) - if not exists(join(OLD_DB_DIR, 'html')): - makedirs(join(OLD_DB_DIR, 'html'), exist_ok=True) + if not exists(join(DB_DIR, 'html')): + makedirs(join(DB_DIR, 'html'), exist_ok=True) codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') @@ -237,7 +237,7 
@@ def write_to_file(res, term):
     :param term: term code
     :return: None
     """
-    with open(f"{join(OLD_DB_DIR, 'html', term+'.html')}", "wb") as file:
+    with open(f"{join(DB_DIR, 'html', term+'.html')}", "wb") as file:
         for chunk in res.iter_content(chunk_size=512):
             if not chunk:
                 break
diff --git a/settings.py b/settings.py
index 821401e..3093a0b 100644
--- a/settings.py
+++ b/settings.py
@@ -2,7 +2,6 @@
 ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
 DB_DIR = os.path.join(ROOT_DIR, 'db')
-OLD_DB_DIR = os.path.join(DB_DIR, 'old')
 TEST_DIR = os.path.join(ROOT_DIR, 'tests')
 TEST_DB_DIR = os.path.join(TEST_DIR, 'test_db')
@@ -10,7 +9,6 @@
 DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$"
 SCHEDULE = 'schedule.html'
-SEARCH = 'search.html'
 HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end',
            'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap')
 CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'}
@@ -44,7 +42,6 @@
          ('SUB_BTN', 'Section Search'),
          ('path', '1')]
     ],
-
     [
         [('sel_subj', 'dummy'),
          ('sel_day', 'dummy'),

From 363b9e3f624de8014c2f75c679e0cffe1384b3cf Mon Sep 17 00:00:00 2001
From: phi-line
Date: Mon, 11 Jun 2018 03:22:46 -0700
Subject: [PATCH 15/19] fixed bug created when reformatting

---
 scrape_advanced.py | 10 +++++-----
 selenium_login.py  |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scrape_advanced.py b/scrape_advanced.py
index 64a15c9..b2df6d4 100644
--- a/scrape_advanced.py
+++ b/scrape_advanced.py
@@ -33,7 +33,7 @@ def main():
     cookies = scrape_cookies()
     print_c(f"Scraped session cookie {color(Fore.YELLOW, cookies['CPSESSID'])}\n{'-'*79}\n")

-    temp_path = join(OLD_DB_DIR, 'temp.json')
+    temp_path = join(DB_DIR, 'temp.json')

     try:
         for term in codes:
@@ -42,8 +42,8 @@ def main():
             temp = TinyDB(temp_path)

             dept_data = mine_dept_data(term, write=False)
-            print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] \
-                    Parsing {len(dept_data)} departments…\r")
+            print_c(f" [{term}] [{color(Fore.YELLOW, 'MINING…')}] " +
+                    f"Parsing {len(dept_data)} departments…\r")

             failed = False
             for idx, variant in enumerate(ADVANCED_FORM_DATA):
@@ -53,10 +53,10 @@ def main():
                 elif idx == len(ADVANCED_FORM_DATA) - 1:
                     failed = True

-            if rename(temp_path, join(OLD_DB_DIR, f'old_{term}_database.json')):
+            if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')):
                 remove(temp_path)

-            db = TinyDB(join(OLD_DB_DIR, f'old_{term}_database.json'))
+            db = TinyDB(join(DB_DIR, f'old_{term}_database.json'))

             num_courses = sum([len(db.table(t).all()) for t in db.tables()])

diff --git a/selenium_login.py b/selenium_login.py
index 617e435..1487172 100644
--- a/selenium_login.py
+++ b/selenium_login.py
@@ -18,17 +18,17 @@ def scrape_cookies():
     driver.execute_script(f"document.getElementById('user').value='{os.environ['MP_USER']}'")
     driver.execute_script(f"document.getElementById('pass').value='{os.environ['MP_PASS']}'")

-    cookies_list = list()
     try:
         driver.execute_script("doLogin()")
         WebDriverWait(driver, 3).until(
             EC.title_is("MyPortal / Foothill-De Anza College District")
         )
-        driver.get("https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search")
+        driver.get(
+            "https://myportal.fhda.edu/render.UserLayoutRootNode.uP?uP_tparam=utf&utf=%2fcp%2fip%2flogin%3fsys%3dsctssb%26url%3dhttps%3A%2F%2Fbanssb.fhda.edu%2FPROD%2Fbwskfcls.p_sel_crse_search")

         WebDriverWait(driver, 3).until(
-
EC.presence_of_element_located((By.ID, "ssbbackurl")) + EC.title_is("MyPortal / Foothill-De Anza College District") ) finally: cookies_list = driver.get_cookies() From 1872b6c510419564fd86a0dbdf8ad7b2bdc5f16d Mon Sep 17 00:00:00 2001 From: phi-line Date: Mon, 11 Jun 2018 19:58:23 -0700 Subject: [PATCH 16/19] added debug mode --- scrape_advanced.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index b2df6d4..50ceb37 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -17,6 +17,9 @@ YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) +PREFIX = 'old' +DEBUG = False + def main(): if not exists(DB_DIR): @@ -28,6 +31,10 @@ def main(): codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') + if DEBUG: + PREFIX = 'debug' + codes = codes[:5] + print_c(f'Scraping session cookie…\r') cookies = scrape_cookies() @@ -53,10 +60,10 @@ def main(): elif idx == len(ADVANCED_FORM_DATA) - 1: failed = True - if rename(temp_path, join(DB_DIR, f'old_{term}_database.json')): + if rename(temp_path, join(DB_DIR, f'{PREFIX}_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'old_{term}_database.json')) + db = TinyDB(join(DB_DIR, f'{PREFIX}_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) @@ -109,6 +116,9 @@ def mine_table_data(term, payload, dept_data, cookies, write=False): data.extend(payload[0]) + if DEBUG: + dept_data = dept_data[:1] + data.extend(dept_data) data.extend(payload[1]) From 787cd3b385edd4d5e6947ff899bab607ef3bee18 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 13 Jun 2018 11:50:50 -0700 Subject: [PATCH 17/19] increased pylint score and made debug mode better --- scrape_advanced.py | 9 ++++----- selenium_login.py | 1 - settings.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scrape_advanced.py b/scrape_advanced.py index 50ceb37..cb7041c 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -11,13 +11,12 @@ from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver -from settings import DB_DIR, ADVANCED_FORM_DATA +from settings import DB_DIR, ADVANCED_FORM_DATA, PREFIXES CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) -PREFIX = 'old' DEBUG = False @@ -31,8 +30,8 @@ def main(): codes = generate_term_codes() print_c(f'Loaded {color(Fore.CYAN, len(codes))} term codes\n') + prefix = PREFIXES[0] if not DEBUG else PREFIXES[1] if DEBUG: - PREFIX = 'debug' codes = codes[:5] print_c(f'Scraping session cookie…\r') @@ -60,10 +59,10 @@ def main(): elif idx == len(ADVANCED_FORM_DATA) - 1: failed = True - if rename(temp_path, join(DB_DIR, f'{PREFIX}_{term}_database.json')): + if rename(temp_path, join(DB_DIR, f'{prefix}_{term}_database.json')): remove(temp_path) - db = TinyDB(join(DB_DIR, f'{PREFIX}_{term}_database.json')) + db = TinyDB(join(DB_DIR, f'{prefix}_{term}_database.json')) num_courses = sum([len(db.table(t).all()) for t in db.tables()]) diff --git a/selenium_login.py b/selenium_login.py index 1487172..ea35661 100644 --- a/selenium_login.py +++ b/selenium_login.py @@ -1,7 +1,6 @@ import os from selenium import webdriver -from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC diff --git a/settings.py b/settings.py index 3093a0b..2134631 100644 --- a/settings.py +++ b/settings.py @@ -8,6 +8,7 @@ COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' 
DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" +PREFIXES = ('old', 'debug') SCHEDULE = 'schedule.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') From 363b07b1b68ae9c048750b656d4b492f7e6ce2b2 Mon Sep 17 00:00:00 2001 From: phi-line Date: Wed, 13 Jun 2018 23:03:40 -0700 Subject: [PATCH 18/19] addressed TryExceptElse's changes --- .gitignore | 2 +- scrape_advanced.py | 133 ++++++++++++++++++++++++++++++--------------- settings.py | 95 +------------------------------- 3 files changed, 92 insertions(+), 138 deletions(-) diff --git a/.gitignore b/.gitignore index 316e78c..63e00fa 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ __pycache__/ .idea/ *.DS_Store *.json -*.html +db/html/ frontend/static/README.md # pytest diff --git a/scrape_advanced.py b/scrape_advanced.py index cb7041c..5b93eea 100644 --- a/scrape_advanced.py +++ b/scrape_advanced.py @@ -2,6 +2,7 @@ from os import makedirs, rename, remove from os.path import join, exists from collections import defaultdict +from itertools import product import re # 3rd party @@ -10,14 +11,55 @@ from tinydb import TinyDB from colorama import init, Fore, Style from selenium_login import scrape_cookies, kill_driver +from selenium.common.exceptions import TimeoutException -from settings import DB_DIR, ADVANCED_FORM_DATA, PREFIXES +from settings import DB_DIR CAMPUS_RANGE = (1, 2) YEAR_RANGE = (1, 8) QUARTER_RANGE = (1, 4) -DEBUG = False +DEBUG = True + +PREFIXES = ('old', 'debug') + +ADVANCED_FORM_DATA = [ + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_from_cred', ''), + ('sel_to_cred', ''), ('sel_camp', '%'), ('sel_sess', '%'), + ('sel_instr', '%'), ('sel_ptrm', '%'), ('begin_hh', '0'), + ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'), + ('sel_from_cred', ''), ('sel_to_cred', ''), ('sel_camp', '%'), + ('sel_instr', '%'), ('sel_sess', '%'), ('sel_ptrm', '%'), + ('sel_attr', '%'), ('begin_hh', '0'), ('begin_mi', '0'), + ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ], + [ + [('sel_subj', 'dummy'), ('sel_day', 'dummy'), ('sel_schd', 'dummy'), + ('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'), + ('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'), + ('sel_attr', 'dummy')], + [('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'), + ('sel_from_cred', ''), ('sel_to_cred', ''), ('sel_camp', '%'), + ('sel_levl', '%'), ('sel_ptrm', '%'), ('sel_instr', '%'), + ('sel_sess', '%'), ('sel_attr', '%'), ('begin_hh', '0'), + ('begin_mi', '0'), ('begin_ap', 'a'), ('end_hh', '0'), ('end_mi', '0'), + ('end_ap', 'a'), ('SUB_BTN', 'Section Search'), ('path', '1')] + ] +] def main(): @@ -71,7 +113,8 @@ def main(): else: print_c(f" [{term}] [{color(Fore.GREEN, 'SUCCESS')}] Mined {num_courses} courses\n") - except 
KeyboardInterrupt: + except (KeyboardInterrupt, TimeoutException) as e: + print_c(f"{color(Fore.GREEN, e)}\n") kill_driver() remove(temp_path) finally: @@ -140,53 +183,58 @@ def advanced_parse(content, db, term=''): :return: None ''' soup = BeautifulSoup(content, 'html5lib') + table_rows = None try: table = soup.find('table', {'class': 'datadisplaytable'}) table_rows = table.find_all('tr') + except AttributeError as e: + return False - table_headers = list() - start_idx = 0 - for i, tr in enumerate(table_rows): - header_cols = tr.find_all('th', {'class': 'ddheader'}) - for th in header_cols: - table_headers.append(get_parsed_text(th)) - if table_headers: - start_idx = i - break + table_headers = list() + start_idx = 0 + for i, tr in enumerate(table_rows): + header_cols = tr.find_all('th', {'class': 'ddheader'}) + for th in header_cols: + table_headers.append(get_parsed_text(th)) + if table_headers: + start_idx = i + break + + for tr in table_rows[start_idx:]: + parse_row(tr, table_headers, db) + return True - for tr in table_rows[start_idx:]: - try: - cols = tr.find_all('td', {'class': 'dddefault'}) - if cols: - s = defaultdict(lambda: defaultdict(list)) +def parse_row(tr, th, db): + try: + cols = tr.find_all('td', {'class': 'dddefault'}) - num_blank = 0 - for i, c in enumerate(cols): - a = c.find('a') - cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) - if cols[i].isspace(): - num_blank += 1 + if cols: + s = defaultdict(lambda: defaultdict(list)) - if num_blank > len(cols) - num_blank: - raise BlankRow + num_blank = 0 + for i, c in enumerate(cols): + a = c.find('a') + cols[i] = get_parsed_text(a) if a else get_parsed_text(cols[i]) + if cols[i].isspace(): + num_blank += 1 - data = dict(zip(table_headers, cols)) + if num_blank > len(cols) - num_blank: + raise BlankRow - subject = data['Subj'] - key = data['Crse'] - crn = data['CRN'] + data = dict(zip(th, cols)) - s[key][crn].append(data) + subject = data['Subj'] + key = data['Crse'] + crn = data['CRN'] - j = dict(s) - db.table(f'{subject}').insert(j) - except BlankRow: - continue - except AttributeError as e: - return False - return True + s[key][crn].append(data) + j = dict(s) + + db.table(f'{subject}').insert(j) + except BlankRow: + return def generate_term_codes(): @@ -195,11 +243,10 @@ def generate_term_codes(): YEAR_RANGE, QUARTER_RANGE, CAMPUS_RANGE :return: (list(str)) list of term codes """ - codes = [] - for i in range(YEAR_RANGE[0], YEAR_RANGE[1] + 1): - for j in range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1): - for k in range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1): - codes.append(f'201{i}{j}{k}') + i = range(YEAR_RANGE[0], YEAR_RANGE[1] + 1) + j = range(QUARTER_RANGE[0], QUARTER_RANGE[1] + 1) + k = range(CAMPUS_RANGE[0], CAMPUS_RANGE[1] + 1) + codes = [f'201{x[0]}{x[1]}{x[2]}' for x in product(i, j, k)] return codes @@ -256,5 +303,5 @@ def write_to_file(res, term): if __name__ == '__main__': - init() + init() #colorama main() diff --git a/settings.py b/settings.py index 2134631..3a8bdf3 100644 --- a/settings.py +++ b/settings.py @@ -8,101 +8,8 @@ COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?' 
DAYS_PATTERN = f"^{'(M|T|W|Th|F|S|U)?'*7}$" -PREFIXES = ('old', 'debug') + SCHEDULE = 'schedule.html' HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} - -ADVANCED_FORM_DATA = [ - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_sess', '%'), - ('sel_instr', '%'), - ('sel_ptrm', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ], - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_schd', '%'), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_instr', '%'), - ('sel_sess', '%'), - ('sel_ptrm', '%'), - ('sel_attr', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ], - - [ - [('sel_subj', 'dummy'), - ('sel_day', 'dummy'), - ('sel_schd', 'dummy'), - ('sel_insm', 'dummy'), - ('sel_camp', 'dummy'), - ('sel_levl', 'dummy'), - ('sel_sess', 'dummy'), - ('sel_instr', 'dummy'), - ('sel_ptrm', 'dummy'), - ('sel_attr', 'dummy')], - [('sel_crse', ''), - ('sel_title', ''), - ('sel_schd', '%'), - ('sel_from_cred', ''), - ('sel_to_cred', ''), - ('sel_camp', '%'), - ('sel_levl', '%'), - ('sel_ptrm', '%'), - ('sel_instr', '%'), - ('sel_sess', '%'), - ('sel_attr', '%'), - ('begin_hh', '0'), - ('begin_mi', '0'), - ('begin_ap', 'a'), - ('end_hh', '0'), - ('end_mi', '0'), - ('end_ap', 'a'), - ('SUB_BTN', 'Section Search'), - ('path', '1')] - ] -] From 4a5f9f24db46a4b7746d132c24a4f3ade627b8c1 Mon Sep 17 00:00:00 2001 From: phi-line Date: Fri, 15 Jun 2018 18:34:23 -0700 Subject: [PATCH 19/19] fixed bug with wrong value in settings.py --- scrape_term.py | 4 +++- settings.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scrape_term.py b/scrape_term.py index ef12a45..27aab7d 100644 --- a/scrape_term.py +++ b/scrape_term.py @@ -8,7 +8,9 @@ from bs4 import BeautifulSoup from tinydb import TinyDB -from settings import DB_DIR, CURRENT_TERM_CODES, COURSE_PATTERN, HEADERS, SCHEDULE +from settings import DB_DIR, COURSE_PATTERN, HEADERS, SCHEDULE + +CURRENT_TERM_CODES = {'fh': '201911', 'da': '201912'} def main(): diff --git a/settings.py b/settings.py index 3a8bdf3..7f370ff 100644 --- a/settings.py +++ b/settings.py @@ -10,6 +10,6 @@ SCHEDULE = 'schedule.html' -HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', - 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') -CURRENT_TERM_CODES = CAMPUS_LIST = {'fh': '201911', 'da': '201912', 'test': 'test'} +HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end', 'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap') + +CAMPUS_LIST = 
{'fh': '201911', 'da': '201912', 'test': 'test'}
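
Note: as a quick sanity check of the per-term databases these patches produce, the TinyDB files can be read back directly. The sketch below is illustrative only and rests on assumptions: it uses the non-debug 'old' prefix and the db/old_<term>_database.json naming from the patches above, assumes one table per subject with each record mapping course number to CRN to a list of row dicts (as advanced_parse() inserts them), and the term code '201811' is a made-up example.

from os.path import join

from tinydb import TinyDB

from settings import DB_DIR

term = '201811'  # example code from generate_term_codes(): 2018, quarter 1, campus 1 (illustrative)
db = TinyDB(join(DB_DIR, f'old_{term}_database.json'))  # assumes the non-debug 'old' prefix

# advanced_parse() writes one table per subject; each record maps
# course number -> CRN -> list of row dicts keyed by the table headers.
for subject in db.tables():
    for record in db.table(subject).all():
        for course, sections in record.items():
            for crn, rows in sections.items():
                print(subject, course, crn, len(rows))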