In [2]:
import pprint
import selenium
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
from bs4.element import Comment
import urllib.request
import sys
import os
from os import path
from datetime import datetime
import csv
import time
class CollegeCrawl(object):
    allUrls={}
    visitedUrls=set()
    rejectedUrls=set()
    n_allUrls=0
    n_sitedUrls=0
    n_rejectedUrls=0
    
    """
        collegename: name
        rooturl: www.university.edu
        prioritykeywords: ['apply','adimission'...] etc. if None then every page 
        respectrobottxt: True
    """
    def __init__(self,_collegename, _rooturl, _prioritykeywords,gap_Insecond=5,max_pages=150, _respectrobottxt=True):
        self.college=_collegename
        self.rootUrl=_rooturl
        self.priorityKeywords=_prioritykeywords
        self.respectRobottext=_respectrobottxt
        self.gap_Insecond=gap_Insecond
        self.max_Pages=max_pages

    """
        get all urls starting from rootUrl
    """
    def all_pages(self, links_only=True):
        headers={'User-Agent':'Mozilla/5.0'}
        response=requests.get(self.rootUrl,headers=headers)
        allUrls={self.rootUrl}
        visitedUrls=set()
        rejectedUrls=set()
        while len(allUrls)>len(visitedUrls)+len(rejectedUrls) and len(visitedUrls) < self.max_Pages:
            soup=BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                try:
                    url=link['href']                
                    parsed_uri = urlparse(url )
                    if parsed_uri.netloc=='':
                        absolute_url=self.rootUrl+url    
                    elif parsed_uri.netloc==base_domain:
                        absolute_url=url
                    else:
                        continue
                    clean=re.sub(r'[_+!@#$?\\\s]+$', '', absolute_url).replace('.edu','.edu/').replace('.edu//','.edu/').replace('.edu//','.edu/')
                    allUrls.add(clean)               
                except:
                    continue
            unvisited_url=(allUrls-visitedUrls-rejectedUrls).pop()
            response=requests.get(unvisited_url)
            if response.status_code!=200:
                rejectedUrls.add(unvisited_url)  
            else:
                visitedUrls.add(unvisited_url) #only creates csv files for visited urls
            time.sleep(self.gap_Insecond)
        return [allUrls,visitedUrls,rejectedUrls]
    
    """
        read one page
    """
    def Read_OneUrl(self, oneUrl):
        response=requests.get(oneUrl)  
        if response.status_code==200: 
            return self.get_pagetext(response)
        else: 
            return [[None, None, None, None]]
    def tag_visible(self,element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True 
    def get_pagetext(self,body):
        soup = BeautifulSoup(body.text, 'html.parser')
        texts = soup.findAll(text=True) 
        visible_texts = filter(self.tag_visible, texts)       
    
        return [ [t.parent.name,   
                 t.parent.previousSibling.name if t.parent.previousSibling!=None else None, 
                 t.nextSibling.name if t.nextSibling!=None else None,
                 re.sub(r'[\s+\t]',' ',t) ]  for t in visible_texts if len(t.strip())>2] 
    """
        Save One Page
    """
    def Save_OnePage(self, url,folder=None,filename=None,format='csv'):
        url=url.strip()
        content=self.Read_OneUrl(url) #in format of (a,a,a,a)
     
        if folder==None:
            folder=os.getcwd()
        if path.isdir(folder)==False:
            print('folder does not exist')
            return
        if filename==None:
            filename=url.replace('.', '_dot_').replace('/', '_').replace(':', '_').replace('?','_q_')+'_'+datetime.now().strftime("%m_%d_%Y_%H_%M_%S")+'.csv'
    
        fullname=path.join(folder, filename)
        with open(fullname, 'w', newline='',encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['url', 'parent', 'ps', 'ns', 'text'])
        
            for lll in content: 
                lll.insert(0, url)
                writer.writerow(lll)  
        return fullname           
    
    """
        Save summaries
    """
    def Save_Summaries(self,urls):
        if(len(list(urls[1]))==150):
            print('There were at least ',len(list(urls[0])),' total urls in ',self.rootUrl)
        else:
            print('There were ',len(list(urls[0])),' total urls in ',self.rootUrl)
        print('There were ',len(list(urls[1])),' visited urls in ',self.rootUrl)
        print('There were ',len(list(urls[2])),' rejected urls in ',self.rootUrl)
        if(len(list(urls[2]))>0):
            print('These were the rejected urls: ',list(urls[2]))
     

In [3]:
names=['MIT','Yale','University of Washington','Stanford','Harvard']
urls=['https://mit.edu/','https://yale.edu/','https://washington.edu/','https://stanford.edu/','https://harvard.edu/']
keywords=['apply,admission']
wait_time=0 #can change this to 5 when we need. I set it to 0 to speed things up. Default value is 5 if you don't input it
max_pages=150#default value is 150 if you don't input it
for college in range(len(names)):
    c=CollegeCrawl(names[college],urls[college],keywords,wait_time,max_pages)
    pages=c.all_pages()
    for page in list(pages[1]):
        c.Save_OnePage(page)
    c.Save_Summaries(pages)

There were  34  total urls in  https://mit.edu/
There were  27  visited urls in  https://mit.edu/
There were  7  rejected urls in  https://mit.edu/
These were the rejected urls:  ['https://mit.edu/mailto:admissions@mit.edu/', 'https://mit.edu/ http://global.mit.edu/', 'https://mit.edu/tel:617-253-1000', 'https://mit.edu/mailto:dataprotection@mit.edu/', 'https://mit.edu/mailto:aacomments@mit.edu/', 'https://mit.edu/mailto:dataprotection@mit.edu/.', 'https://mit.edu/mailto:campus-map@mit.edu/']
There were  55  total urls in  https://yale.edu/
There were  52  visited urls in  https://yale.edu/
There were  3  rejected urls in  https://yale.edu/
These were the rejected urls:  ['https://yale.edu/mailto:titleix@yale.edu/', 'https://yale.edu/mailto:ocr.boston@ed.gov', 'https://yale.edu/mailto:employee.services@yale.edu/']
There were  2  total urls in  https://washington.edu/
There were  2  visited urls in  https://washington.edu/
There were  0  rejected urls in  https://washington.edu/
There w