In [1]:
import csv
import os
import re
import sys
import time
from datetime import datetime
from os import path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from IPython.core.debugger import set_trace

In [2]:
class CollegeCrawl:
    """Small same-host crawler that saves the visible text of a college site.

    Starting from a root URL it follows ``<a href>`` links that stay on the
    same host, visits at most ``max_Pages`` pages, ranks the visited URLs by
    how many priority keywords they contain, and writes the visible text of
    each page to a tab-delimited file in the current working directory.

    Class attributes (defaults shared by all instances):
        gap_Insecond: pause in seconds between two consecutive crawl requests.
        max_Pages: maximum number of pages visited by ``getAllUrls``.
        timeout_Insecond: per-request timeout in seconds.
        headers: HTTP headers sent with every request.
    """

    gap_Insecond = 5
    max_Pages = 5
    timeout_Insecond = 10
    headers = {'User-Agent': 'Mozilla/5.0'}

    def __init__(self, _collegename, _rooturl, _prioritykeywords, _respectrobottxt=True):
        """Create a crawler for one college.

        Args:
            _collegename: display name, used only in the summary output.
            _rooturl: start URL, e.g. ``http://www.university.edu``. A value
                without a scheme (``www.university.edu``) gets ``http://``
                prepended so that it can actually be requested.
            _prioritykeywords: list of substrings such as
                ``['apply', 'admission']`` used to rank URLs; ``None`` or an
                empty list means no ranking preference.
            _respectrobottxt: stored but not acted upon yet (robots.txt is
                currently NOT consulted).
        """
        self.college = _collegename
        root = _rooturl.strip()
        if '://' not in root:
            # requests refuses URLs without a scheme, so add a default one.
            root = 'http://' + root.lstrip('/')
        self.rootUrl = root
        self.priorityKeywords = _prioritykeywords
        self.respectRobottext = _respectrobottxt  # currently not doing anything
        self.allUrls = set()
        self.allRankedUrls = []
        self.visitedUrls = set()  # every URL requested, whatever the outcome
        self.rejectedUrls = set()  # URLs that failed or answered non-200
        self.base_domain = self.__getDomainFromUrl(self.rootUrl)

    def __str__(self):
        return "************Crawl {}({})************".format(self.college, self.rootUrl)

    def __getDomainFromUrl(self, url):
        """Return the lower-cased host (netloc) part of ``url``."""
        netloc = urlparse(url).netloc
        if not netloc and '://' not in url:
            # Scheme-less input such as "www.mit.edu/path": urlparse only
            # recognises a netloc after a leading "//".
            netloc = urlparse('//' + url.lstrip('/')).netloc
        return netloc.lower()

    def __normalizeUrl(self, url):
        """Return a canonical form of ``url`` so duplicates collapse in the sets.

        Drops the fragment, trailing junk characters, repeated slashes inside
        the path, and trailing slashes.
        """
        trimmed = re.sub(r'[_+!@#$?\\\s]+$', '', url)
        parts = urlparse(trimmed)
        # Only the path is collapsed; "//" inside a query string is left alone.
        collapsed_path = re.sub(r'/{2,}', '/', parts.path)
        rebuilt = parts._replace(path=collapsed_path, fragment='').geturl()
        return rebuilt.rstrip('/')

    def __addAllUrlsInOnePage(self, response):
        """Add every same-host link found in ``response`` to ``allUrls``.

        Relative links are resolved against the URL of the page itself.
        Links with a non-HTTP scheme (mailto:, javascript:, tel:, ...) and
        links to other hosts are ignored.

        Returns:
            The newly discovered URLs, in document order.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        page_url = getattr(response, 'url', None) or self.rootUrl
        discovered = []

        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].strip()
            if not href:
                continue
            try:
                absolute_url = urljoin(page_url, href)
                parts = urlparse(absolute_url)
                if parts.scheme not in ('http', 'https'):
                    continue
                if parts.netloc.lower() != self.base_domain:
                    continue
                clean = self.__normalizeUrl(absolute_url)
            except ValueError:
                # urllib rejects a few malformed hrefs (e.g. broken IPv6 hosts).
                continue
            if clean and clean not in self.allUrls:
                self.allUrls.add(clean)
                discovered.append(clean)
        return discovered

    def getAllUrls(self):
        """Crawl from ``rootUrl`` in discovery order, visiting at most ``max_Pages`` pages.

        Fills ``allUrls`` (everything discovered), ``visitedUrls`` (everything
        requested) and ``rejectedUrls`` (requests that failed or returned a
        status other than 200).
        """
        start_url = self.__normalizeUrl(self.rootUrl)
        self.allUrls.add(start_url)

        # Discovery-ordered work list; sorted seeding keeps runs reproducible
        # even if allUrls was pre-populated by the caller.
        pending = [start_url] + sorted(self.allUrls - self.visitedUrls - {start_url})
        cursor = 0
        while cursor < len(pending) and len(self.visitedUrls) < self.max_Pages:
            current_url = pending[cursor]
            cursor += 1
            if current_url in self.visitedUrls:
                continue

            if self.visitedUrls:
                # Politeness gap between consecutive requests.
                time.sleep(self.gap_Insecond)

            self.visitedUrls.add(current_url)
            try:
                response = requests.get(current_url,
                                        headers=self.headers,
                                        timeout=self.timeout_Insecond)
            except requests.exceptions.RequestException as err:
                self.rejectedUrls.add(current_url)
                print("Request failed for {}: {}".format(current_url, err))
                continue

            if response.status_code != 200:
                self.rejectedUrls.add(current_url)
                continue

            # The header may be absent; treat that as "not HTML".
            content_type = response.headers.get('content-type') or ''
            if 'text/html' in content_type:
                pending.extend(self.__addAllUrlsInOnePage(response))

    def read_OneUrl(self, url):
        """Download ``url`` and return its visible text as a list of rows.

        Returns:
            * the rows produced by ``get_pagetext`` for an HTML page,
            * ``[]`` for a successful non-HTML response (pdf, image, ...),
            * ``[[None, None, None, None]]`` when the status is not 200.

        Raises:
            requests.exceptions.RequestException: if the request itself fails.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout_Insecond)
        if response.status_code != 200:
            return [[None, None, None, None]]

        content_type = response.headers.get('content-type') or ''
        if 'text/html' in content_type:
            return self.get_pagetext(response)
        return []  # non-HTML content is not extracted

    def get_pagetext(self, body):
        """Extract the visible text nodes of an HTML response.

        Args:
            body: a response object exposing the HTML as ``body.text``.

        Returns:
            A list of ``[parent tag name, name of the parent's previous
            sibling, name of the node's next sibling, text]`` rows, one per
            visible text node longer than two non-blank characters. Sibling
            names are ``None`` when there is no sibling or it is not a tag.
        """
        soup = BeautifulSoup(body.text, 'html.parser')
        rows = []
        for node in soup.find_all(string=True):
            if not self.tag_visible(node):
                continue
            if len(node.strip()) <= 2:
                continue
            parent = node.parent
            rows.append([
                parent.name,
                getattr(parent.previous_sibling, 'name', None),
                getattr(node.next_sibling, 'name', None),
                # Flatten newlines/tabs to spaces; literal '+' is preserved.
                re.sub(r'\s', ' ', node),
            ])
        return rows

    def tag_visible(self, element):
        """Return True when a text node would be rendered to the reader."""
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def save_OnePage(self, url, format='csv'):
        """Fetch ``url`` and write its visible text to a file in the working directory.

        The file is tab-delimited with the header ``url, parent, ps, ns, text``
        and a timestamped name derived from the URL.

        Args:
            url: page to download.
            format: reserved; only the tab-delimited ``.csv`` output exists.

        Returns:
            The full path of the written file, or ``None`` if the working
            directory does not exist.
        """
        url = url.strip()
        content = self.read_OneUrl(url) or []

        folder = os.getcwd()
        if not path.isdir(folder):
            print('folder doesnot exist')
            return None

        # Keep only filesystem-safe characters and cap the length so query
        # strings or very long paths cannot produce an invalid file name.
        safe_name = re.sub(r'[^A-Za-z0-9_\-]', '_', url.replace('.', '_dot_'))[:150]
        filename = safe_name + '_' + datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + '.csv'

        fullname = path.join(folder, filename)
        with open(fullname, 'w', newline='', encoding="utf-8") as newFile:
            writer = csv.writer(newFile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['url', 'parent', 'ps', 'ns', 'text'])
            for row in content:
                writer.writerow([url] + list(row))
        return fullname

    def save_Summaries(self):
        """Print the crawl summary (counts and the ranked URL list)."""
        print(self)
        print("Total number of URLs visisted: ", len(self.visitedUrls))
        print("Total number of URLs rejected: ", len(self.rejectedUrls))
        print("URLs visited (ranked with priority words):\n", self.allRankedUrls)

    def getRankedVisitedUrls(self):
        """Rank the visited URLs by the number of priority keywords they contain.

        URLs with more keyword hits come first; ties are broken alphabetically
        so the result is reproducible. With no keywords (``None`` or empty)
        the URLs are simply sorted alphabetically.

        Returns:
            The ranked list, which is also stored in ``allRankedUrls``.
        """
        keywords = self.priorityKeywords or []

        def keyword_hits(url):
            return sum(1 for word in keywords if word in url)

        self.allRankedUrls = sorted(self.visitedUrls,
                                    key=lambda url: (-keyword_hits(url), url))
        return self.allRankedUrls

    def setMaxpage(self, maxPages):
        """Set the page limit. NOTE: changes the class-wide default for all crawlers."""
        CollegeCrawl.max_Pages = maxPages

    def setGap(self, gap):
        """Set the request gap in seconds. NOTE: changes the class-wide default."""
        CollegeCrawl.gap_Insecond = gap

    def crawl(self):
        """Run the whole pipeline: discover URLs, rank them, save each page, summarise."""
        self.getAllUrls()
        for url in self.getRankedVisitedUrls():
            try:
                self.save_OnePage(url)
            except (requests.exceptions.RequestException, OSError) as err:
                # One unreachable page or unwritable file must not abort the run.
                print("Could not save {}: {}".format(url, err))

        self.save_Summaries()

In [None]:
# Crawl targets: one [display name, root URL, priority keywords] entry per
# university. Every entry gets its own keyword list.
colleges = [
    [name, root_url, ["apply", "admission", "research"]]
    for name, root_url in (
        ("UCLA", "http://www.ucla.edu"),
        ("Stanford", "https://www.stanford.edu/"),
        ("Yale", "https://www.yale.edu/"),
        ("UW", "http://www.washington.edu/"),
    )
]

# Crawl the sites one after another; each run saves its pages and prints a summary.
for college in colleges:
    cr = CollegeCrawl(*college)
    cr.crawl()