# Beautiful Soup Sandbox
A notebook for investigating how to pull class information out of the PCC class schedule webpages.

In [30]:
import requests
from bs4 import BeautifulSoup
import urllib
from bs4_functions import get_instr_sec_lst

In [31]:
# Download the credit-class topic index and parse it with the built-in HTML parser.
# NOTE(review): the query string asks for thisTerm=200701, yet the department URLs
# printed further down carry thisTerm=201802 -- presumably the site substitutes the
# current term; confirm before relying on the term in this URL.
base_url = 'https://www.pcc.edu/schedule/default.cfm?fa=dspTopic&thisTerm=200701&type=Credit'
page = requests.get(base_url)
soup = BeautifulSoup(page.content, 'html.parser')

In [32]:
depts = ['Civil and Mechanical Engineering Technology', 'Engineering', 'Electronic Engineering Technology']

In [33]:
# Resolve the link of every department named in `depts` against the schedule
# root so that each entry in dept_url_lst is an absolute URL.
base_url = 'https://www.pcc.edu/schedule/'
dept_url_lst = [
    urllib.parse.urljoin(base_url, anchor.get("href"))
    for anchor in soup.find_all('a', href=True)
    if anchor.text in depts
]

for dept_url in dept_url_lst:
    print(dept_url)

https://www.pcc.edu/schedule/default.cfm?fa=dspTopicDetails&thisTerm=201802&topicid=CMET&type=Credit
https://www.pcc.edu/schedule/default.cfm?fa=dspTopicDetails&thisTerm=201802&topicid=EET&type=Credit
https://www.pcc.edu/schedule/default.cfm?fa=dspTopicDetails&thisTerm=201802&topicid=GE&type=Credit


In [34]:
# Visit every department page and collect the absolute URL of each course
# linked from a <dd> element.
class_url_lst = []
base_url = 'https://www.pcc.edu/schedule/'
for dept_url in dept_url_lst:
    # NOTE: `page` and `soup` are deliberately rebound here, as in the original
    # cell; after the loop they refer to the last department page fetched.
    page = requests.get(dept_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for course in soup.find_all('dd'):
        anchor = course.a
        # A <dd> without a nested <a> gives None; skip it rather than
        # failing with AttributeError on `.get`.
        if anchor is None:
            continue
        ext = anchor.get("href")
        if ext:
            # Append directly instead of rebinding the outer loop variable.
            class_url_lst.append(urllib.parse.urljoin(base_url, ext))

In [35]:
# Flatten the per-course results of get_instr_sec_lst into a single list.
instr_section_list = []
for class_url in class_url_lst:
    instr_section_list += get_instr_sec_lst(class_url)

In [36]:
# Distinct instructor names across every scraped section.
instructor_set = {section.instructor for section in instr_section_list}
print(instructor_set)

{'Chuck W Fenning', 'Greg  Gerstne', 'Doug S Fal', 'Matthew Q Grave', 'Matt J Distad', 'Sanda N William', 'Richard E Fesle', 'Corneliu S Boghea', 'Mark R Brazie', 'Adam  Scheible', 'Donald D Bell', 'Yunhui  Ch', 'Chuck T Litchfield', 'Catherine  Stark', 'Kevin T Buck', 'David P Goldma', 'Tara M Nel', 'Curtis L Lipski', 'Spencer L Poff', 'Todd M Sande', 'David S Smith', 'Mark A Hage', 'Dan A Kruge', 'Trung H Le', 'Peter  Kazarinoff', 'Scott W Lowrey', 'Walter  Lara', 'Jenna F Bell', 'Ray M Schmelze', 'Linda M Browning', 'Russell B Eng'}


In [87]:
class instructorObj():
    """An instructor together with contact details and assigned sections.

    Attributes:
        name: the instructor's name, as passed to the constructor.
        classes: list of section objects; starts empty.
        departments: list of department identifiers; starts empty.
        office, phone, email: contact strings; each starts as ''.
    """

    def __init__(self,name):
        self.name = name
        self.classes, self.departments = [], []
        self.office = self.phone = self.email = ''

    def __str__(self):
        """Return the instructor's name."""
        return self.name

    def print_schedule(self):
        """Print the name, a blank line, then one block per entry in `classes`.

        Each block lists, one value per line, the section's course number,
        course name, day, start time, end time, building and room number,
        and is followed by a blank line.
        """
        print(self.name)
        print()
        shown_fields = (
            'course_number',
            'course_name',
            'day',
            'start_time',
            'end_time',
            'building',
            'room_num',
        )
        for section in self.classes:
            for field in shown_fields:
                print(getattr(section, field))
            print()
        

In [88]:
# Hand-built example instructor. The name keeps its two consecutive spaces so
# that it compares equal to the scraped `instructor` strings.
peter = instructorObj('Peter  Kazarinoff')
peter.classes = [section for section in instr_section_list if section.instructor == peter.name]
peter.departments = ['CMET','ENGR']
peter.email = 'peter.kazarinoff@pcc.edu'
peter.phone = '971-722-8056'
peter.office = 'ST200'

In [89]:
peter.classes

[<bs4_functions.InstrSect at 0x18bddbc9668>,
 <bs4_functions.InstrSect at 0x18bddb92a58>,
 <bs4_functions.InstrSect at 0x18bddb928d0>,
 <bs4_functions.InstrSect at 0x18bddb92780>,
 <bs4_functions.InstrSect at 0x18bdd8b7f60>,
 <bs4_functions.InstrSect at 0x18bdd8b7ef0>,
 <bs4_functions.InstrSect at 0x18bddc15080>,
 <bs4_functions.InstrSect at 0x18bddc15978>,
 <bs4_functions.InstrSect at 0x18bdd9c5668>,
 <bs4_functions.InstrSect at 0x18bdd9cea58>,
 <bs4_functions.InstrSect at 0x18bdd9ce080>,
 <bs4_functions.InstrSect at 0x18bdd9ce940>,
 <bs4_functions.InstrSect at 0x18bdd9ce7f0>,
 <bs4_functions.InstrSect at 0x18bddbe9e10>]

In [90]:
# A list of instructor objects: one instructorObj per unique name, each holding
# that instructor's section objects and the distinct departments they fall under.
instr_Obj_list = []
for instructor in instructor_set:
    inst_Obj = instructorObj(instructor)
    inst_Obj.classes = [section for section in instr_section_list if section.instructor == instructor]
    # The departments come from the sections just selected for this instructor.
    inst_Obj.departments = list({section.department for section in inst_Obj.classes})
    instr_Obj_list.append(inst_Obj)

In [91]:
instr_Obj_list[-4].print_schedule()


Jenna F Bell

CMET222
Thermodynamics II
MW
01:00 PM
04:50 PM
AM
125

CMET223
Project Management
Tu
01:00 PM
03:50 PM
AM
107

ENGR213
Strength of Materials
MW
06:00 PM
08:20 PM
ST
100



In [92]:
instr_Obj_list[-6].print_schedule()

Scott W Lowrey

EET123
Digital Systems 3
None
12:30 PM
02:20 PM
AM
112

EET123
Digital Systems 3
None
12:30 PM
02:20 PM
AM
112

EET223
RF Communications Circuits
Tu
02:30 PM
05:20 PM
ST
313

EET223
RF Communications Circuits
None
08:30 AM
10:20 AM
ST
316

ENGR101
Engineering Fundamentals
W
09:00 AM
11:50 AM
AM
125



In [93]:
### Next step: build the Excel sheets from each person's schedule.
### Need a function that takes in an InstrSectionObject and puts it in an Excel doc object.

In [None]:
import bs4

In [None]:
import bs4
class courseTable(bs4.element.Tag):
    """Experimental subclass of bs4.element.Tag built from an existing object.

    NOTE(review): __init__ does not call bs4.element.Tag.__init__, so
    `contents` is the only attribute set on a new instance -- confirm that
    this is intended before using the class beyond this sandbox.
    """
    def __init__(self,bs4elementtage):
        # Store the passed-in object as this instance's `contents`.
        self.contents = bs4elementtage
        
        
class SuperMan(bs4.element.Tag): # empty subclass of bs4.element.Tag (no SuperHero class appears in this notebook)
    pass

In [None]:
dir(table)

In [None]:
type(coursetableobject)

In [None]:
#for td in td_lst:
   #print(td.text)
# NOTE(review): binding the name `str` shadows Python's built-in str type for
# every cell run afterwards in this kernel.
# NOTE(review): `td_lst` is not assigned in any cell visible here -- presumably
# it came from a cell that was since deleted; confirm.
str = td_lst[1].text

In [None]:
str

In [None]:
str.strip().isalnum()

In [None]:
# Keep the stripped text of every cell that consists of exactly five digits.
CRN_lst = [
    cell_text
    for cell_text in (td.text.strip() for td in td_lst)
    if cell_text.isdigit() and len(cell_text) == 5
]

CRN_lst

In [None]:
# Keep the stripped text of every cell that contains the " / " separator.
loc_lst = [td.text.strip() for td in td_lst if " / " in td.text.strip()]
loc_lst

In [None]:
# Keep the stripped text of every cell that contains both ":" and "-".
time_lst = []
for td in td_lst:
    cell_text = td.text.strip()
    if ":" not in cell_text or "-" not in cell_text:
        continue
    time_lst.append(cell_text)
time_lst

In [None]:
course_dict = {}

In [None]:
course_sec_dict ={}

In [None]:
course_sec_dict['ID']="".join([CRN_lst[0],'-00'])

In [None]:
course_sec_dict

In [None]:
course_sec_dict['CRN']=CRN_lst[0]

In [None]:
course_sec_dict

In [None]:
time_str = time_lst[0]
time_str

In [None]:
start_time = time_str.split('-')[0]
start_time

In [None]:
end_time = time_str.split('-')[1]
end_time

In [None]:
course_sec_dict['start_time']=time_lst[0].split('-')[0]

In [None]:
course_sec_dict['end_time']=time_lst[0].split('-')[1]

In [None]:
course_sec_dict

In [None]:
# Keep the stripped text of every cell that is a single alphabetic character.
day_lst = []
for td in td_lst:
    cell_text = td.text.strip()
    # isalpha has to be *called*: the bare bound method is always truthy, so
    # the original test also accepted single digits and punctuation.
    if len(cell_text) == 1 and cell_text.isalpha():
        day_lst.append(cell_text)
day_lst

In [None]:
course_sec_dict['day']=day_lst[0].strip()

In [None]:
course_sec_dict

In [None]:
# Pull the instructor name out of every cell that carries an "Instructor: " label.
instr_lst = []
instr_marker = "Instructor: "
for td in td_lst:
    cell_text = td.text.strip()
    if instr_marker in cell_text:
        # Take the rest of the line after the label. str.strip('Instructor: ')
        # must not be used for this: it removes any of those *characters* from
        # both ends, so names ending in letters such as n/o/r/s/t get truncated.
        name = cell_text.split(instr_marker, 1)[1].split('\n')[0].strip()
        instr_lst.append(name)
instr_lst

In [None]:
course_sec_dict['instructor'] = instr_lst[0].strip()

In [None]:
course_sec_dict

In [None]:
loc_lst

In [None]:
course_sec_dict['building'] = loc_lst[0].split(' / ')[1].strip()
course_sec_dict

In [None]:
course_sec_dict['room_num'] = loc_lst[0].split(' / ')[2].strip()
course_sec_dict

In [None]:
class InstrSect():
    """Plain record describing one instructor section of a course.

    Every attribute starts as the empty string and is filled in afterwards
    by assigning to it directly.
    """

    def __init__(self):
        field_names = (
            # identifiers
            'ID', 'CRN',
            # meeting place and time
            'building', 'day', 'start_time', 'end_time', 'room_num',
            # people and course
            'instructor', 'campus', 'course_name', 'course_number', 'dept',
            # term dates and costs
            'start_date', 'end_date', 'textbook_cost', 'tuition', 'fees',
        )
        for field_name in field_names:
            setattr(self, field_name, '')

In [None]:
CMET235_01=InstrSect()

In [None]:
CMET235_01.ID

In [None]:
CMET235_01.ID=course_sec_dict['ID']

In [None]:
CMET235_01.ID

In [None]:
# Copy the values collected in course_sec_dict onto the section object.
for field in ('CRN', 'building', 'day', 'start_time', 'end_time'):
    setattr(CMET235_01, field, course_sec_dict[field])

# These three values are typed in as literals rather than read from the dict.
CMET235_01.start_date = '02-Apr-2018'
CMET235_01.end_date = '11-Jun-2018'
CMET235_01.fees = '$18.00'

In [None]:
#instr_lst=[]
for td in td_lst:
    print(td.text.strip())
    #if "Instructor: " in td.text.strip():
        #instr_lst.append(td.text.strip().rstrip().lstrip().split('\n')[0].strip('Instructor: '))
#instr_lst

In [None]:
tr_lst = table.find_all("tr")
for row in tr_lst:
    print(row)

In [None]:
## This works. Next, the rows need to be paired so that each section is:
#Instr sect 1
#<data-row>
#<info-row>

#Instr sect 2
#<data-row>
#<info-row>

#Instr sect 3
#<data-row alt-color>
#<info-row alt-color>

#Instr sect 4
#<data-row alt-color>
#<info-row alt-color>


# Select every <tr> whose class matches one of the data-row / info-row values.
# find_all is used instead of the legacy findAll alias, matching the rest of
# this notebook.
rows = soup.find_all('tr', attrs={'class': ['data-row ','info-row ','data-row alt-color','info-row alt-color']})
#print(rows)
print(rows[6].text)
print(rows[7].text)

In [None]:
def get_course_rows(soupObject):
    """Return the course data and info rows found in a parsed schedule page.

    Args:
        soupObject: an object exposing BeautifulSoup's ``find_all`` method.

    Returns:
        Whatever ``find_all`` returns for ``<tr>`` elements whose ``class``
        matches one of the data-row / info-row values listed below.
    """
    row_classes = ['data-row ','info-row ','data-row alt-color','info-row alt-color']
    # find_all rather than the legacy findAll alias, as elsewhere in this notebook.
    return soupObject.find_all('tr', attrs={'class': row_classes})

In [None]:
row_lst = get_course_rows(soup)

In [None]:
#type(row_lst)
two_rows = rows[0:2]

In [None]:
def get_instructor(soupRowObject):
    """Return the instructor name found in a table row, or None.

    Every ``<td>`` of the row is examined in order. The first cell whose
    stripped text contains the label ``"Instructor: "`` supplies the result:
    the text after the label up to the end of that line, with surrounding
    whitespace removed.

    Args:
        soupRowObject: an object exposing ``find_all('td')`` whose items have
            a ``text`` attribute.

    Returns:
        The instructor name as a string, or None when no cell has the label.
    """
    marker = "Instructor: "
    for td in soupRowObject.find_all('td'):
        cell_text = td.text.strip()
        if marker in cell_text:
            # Cut at the label instead of calling str.strip('Instructor: '),
            # which strips a *set of characters* from both ends and truncates
            # names that start or end with any of those letters.
            return cell_text.split(marker, 1)[1].split('\n')[0].strip()
    # Reached only after all cells were checked; returning inside the loop's
    # else branch would give up after the first cell.
    return None

                

In [None]:
inst= get_instructor(row_lst[0])
print(inst)

In [None]:
row_lst[0]

In [None]:
type(row_lst)

In [None]:
dir(row_lst)

In [None]:
soup

In [None]:
soup.table

In [None]:
type(table)

In [None]:
soup.table.tr

In [None]:
type(soup.table.tr)

In [None]:
soup.table.find_all("tr")

In [None]:
type(soup.table.find_all("tr"))

In [None]:
type(soup.table.children)

In [None]:
soup.tr.next_sibling

In [None]:
soup.tr.next_sibling

In [None]:
soup.tr.next_sibling

In [None]:
# `contents` (the list of a tag's children) was misspelled as `contnets`.
soup.tr.contents

In [None]:
type(soup.tr.contents)

In [None]:
soup.table.tr.contents

In [None]:
soup.tr.next_sibling

In [None]:
soup.tr.next_sibling

In [None]:
soup.tr.next_sibling

In [None]:
# NOTE(review): the loop variable `i` is never used, so the same expression,
# `soup.table.children`, is printed on each of the ten passes. If the goal was
# to look at the individual children, iterate over `soup.table.children`
# itself -- confirm the intent.
for i in range(10):
    print(soup.table.children)