# Check URLs for schools with URLs that didn't webscrape successfully

Authors: Tina Nguyen, Jaren Haber, Thao ... <br>
Project Manager: Jaren Haber, PhD Candidate <br>
Contact: jhaber@berkeley.edu

Institution: University of California, Berkeley <br>
Program: Undergraduate Research Apprentice Program (URAP) <br>

Date created: Fall 2017 <br>
Last modified: November 14th, 2018

Description: 

## Initialize

In [1]:
# Import packages
import pandas as pd
import numpy as np
import gc # For managing garbage collector

# Old imports (not sure what from here we need)
from bs4 import BeautifulSoup
import re
import os, csv
import shutil
import urllib
from urllib.request import urlopen
from socket import error as SocketError
import errno
import pickle
import pandas as pd
import lxml
# import httplib2
import requests, contextlib
from urllib.parse import urljoin, urlparse
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
from os.path import splitext

In [2]:
# Load data
pd.set_option('display.max_colwidth', -1)
df = pd.read_csv("../data/charters_unscraped_hasURL_2015.csv", low_memory=False)

print("# rows, # cols: ", df.shape)

# rows, # cols:  (717, 13)


In [3]:
df["NCESSCH"] = df["NCESSCH"].astype(float) # Convert to float type (just in case)

In [4]:
# Sanity check: Show number of duplicates by NCESSCH:
print("# duplicates by NCESSCH: ", sum(df.duplicated(subset=["NCESSCH"])))

# duplicates by NCESSCH:  0


In [5]:
school_urls = df["URL"]
school_names = df["SCHNAM16"]

print(str(len(school_urls)))
school_urls

717


0      https://education.alaska.gov/DOE_Rolodex/SchoolCalendar/Home/SchoolDetails/319010
1      https://education.alaska.gov/DOE_Rolodex/SchoolCalendar/Home/SchoolDetails/56010 
2      https://education.alaska.gov/DOE_Rolodex/SchoolCalendar/Home/SchoolDetails/59090 
3      https://education.alaska.gov/DOE_Rolodex/SchoolCalendar/Home/SchoolDetails/249010
4      http://nome.nosd.schoolaccess.net/~acsa/                                         
5      http://ekc.k12northstar.org/                                                     
6      http://kaolaz.org/calendar.html                                                  
7      http://www.canyonviewprep.org/                                                   
8      http://www.pecschools.org/                                                       
9      http://pecschools.org/                                                           
10     http://www.compasshighschool.com/                                                
11     https://www.cr

## Define helper functions

In [6]:
def check(url):
    """ Helper function, check if url is a valid list <- our backup plan
    This functions helps to check the url that has service unavailable issues
    Since status code fails to check this."""
    
    try:
        urlopen(url)
        
    except urllib.error.URLError:
        print(url + " :URLError")
        return False
    except urllib.error.HTTPError:
        print(url +' :HTTPError')
        return False
    except SocketError:
        print(url + 'SocketError')
        return False
    print("Valid link found!")
    return True

In [7]:
def check_url(url):
    """This functions uses the status code to determine if the link is valid. This resolves
    the links that redirects and most cases of authentication problems"""
    code = "[no code collected]"
    if url == "":
        return False
    try:
        r = requests.get(url, auth=HTTPDigestAuth('user', 'pass'), headers= {'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"})
        code = r.status_code
        #backup plan for service unavailable issues
        if code == 503:
            return check(url)
        if code < 400:
            print("Valid link found!")
            return True   
    except:
        pass
    print("Encountered this invalid link: " + str(url) +" ---Error code: " + str(code))
    return False    

In [8]:
def get_children_links(url_parent, hostname, visited, depth, useless):
    """This function recursively gets the children links of a given link"""
    #we have gone through enough levels or visited this link already 
    if depth == 0 or url_parent in visited or url_parent in useless:
        return set()
    if not check_url(url_parent):
        useless.add(url_parent)
        return set()
    
    #get the html page
    #parse into a BS object
    html_page = requests.get(url_parent, auth=HTTPDigestAuth('user', 'pass'), headers= {'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"})
    # TO DO: try time library? or one link from each site at a time? round-robin/ throttle
    # check out SCRAPY--reading "Learning Scrapy"
    # PDF Labs, PDFtk free library (w/ API?), pdfrw (the other python library)
    soup = BeautifulSoup(html_page.text, "lxml")

    #we visited url_parent, updated into the set
    visited.add(url_parent)
    
    #now checking its children
    for link in soup.findAll('a'):
        #running recursively in a try-except block to prevent broken links break the code
        try:
            pattern = re.compile("((http|ftp)s?://.*?)")
            current_link = link.get('href')
#             print(current_link)
            if not pattern.match(current_link):
                current_link = urljoin(url_parent, current_link)
            
            #check if the link is within the domain (hostname)
            if hostname in current_link:
#                 print(current_link)
                #combine results from its children's links
                get_children_links(current_link, hostname, visited, depth -1, useless)
        except:
            pass
    return visited
#     print(count)

In [9]:
def getLinks(url, depth):
    text,useless = set(), set()
    hostname = urlparse(url).hostname
    return get_children_links(url, hostname, text, depth, useless)
#delete 

## Use check_url() on bad URL list

In [17]:
bad_links = []
for url in school_urls:
#    check_url(url)
    if not check_url(url):
        bad_links.append(url)

Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://nome.nosd.schoolaccess.net/~acsa/ ---Error code: [no code collected]
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://www.pecschools.org/ ---Error code: [no code collected]
Encountered this invalid link: http://pecschools.org/ ---Error code: [no code collected]
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://caminomontessori.com/ ---Error code: [no code collected]
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://www.foundationspublicschools.org/ ---Error code: 404
Valid link found!
Valid link found!
Val

Encountered this invalid link: http://www.mbacs.org/?_escaped_fragment_=history/c62f ---Error code: 404
Encountered this invalid link: http://www.wdp-llpcs.org/ ---Error code: [no code collected]
Valid link found!
Valid link found!
Encountered this invalid link: http://webgui.phila.k12.pa.us/offices/c/charter_schools/schools/arise-academy-high-charter-school ---Error code: [no code collected]
Encountered this invalid link: http://www.phila.k12.pa.us/ ---Error code: [no code collected]
Encountered this invalid link: http://www.innovativeartslv.com// ---Error code: 500
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://bridgewateracademy.org/ ---Error code: [no code collected]
Encountered this invalid link: https://www.scscienceacademy.org/ ---Error code: 404
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://coleman.aspirememphis.org/ ---Error code: [no code collected]
Valid link found!
V

Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: https://mcpa.renewschools.org/ ---Error code: [no code collected]
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Encountered this invalid link: http://mcpa.renewschools.org/ ---Error code: [no code collected]
Encountered this invalid link: http://opwchargers.org/ ---Error code: 404
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link found!
Valid link fo

In [22]:
len(bad_links) #there are 101 invalid links

101

In [23]:
bad_links

['http://nome.nosd.schoolaccess.net/~acsa/',
 'http://www.pecschools.org/',
 'http://pecschools.org/',
 'http://caminomontessori.com/',
 'http://www.foundationspublicschools.org/',
 'http://www.acaciaelementary.org/about',
 'https://acaciamiddle.org/',
 'http://edline.pusd.org/pages/PUSD',
 'http://ib.sbusd.org/',
 'http://nestor.sbusd.org/pages/Nestor_Language_Academy_Charte',
 'http://www.krecr.org/',
 'https://main.dorothyheightes.org/',
 'https://excelpcs.org/',
 'https://excelpcs.org/',
 'http://www.imaginenl.com/',
 'https://andrewshighschool.com/',
 'http://www.pathwayscharter.net/',
 'http://schoolofsuccessjax.com/',
 'http://grady.k12.ga.us/',
 'http://www.kualapuu.k12.hi.us/',
 'http://healthacademy.idpl.org/',
 'http://www.andrewacademy.org/',
 'http://www.careeracademybr.org/',
 'http://morehouse.nls.k12.la.us/beekman/',
 'http://www.bacademy.org/',
 'http://www.mta.matchbooklearning.com/',
 'http://www.ulspsa.org/',
 'http://upsm.uprepschools.com/upsm-middle-school/',
 'ht

## Filtering charter data to remove bad URLs

In [27]:
# Load data
gc.disable()
charterdf = pd.read_pickle('../../nowdata/charters_2015.pkl')
gc.enable()

print(charterdf.shape)

(10965, 702)


In [30]:
"http://hce.whitnallschools.com/" in bad_links

True

In [72]:
# finding the index of rows that contain invalid links
index = 0 
remove = []
for i in charterdf["URL"]:
    if i in bad_links:
        remove.append(index)
    index += 1

In [75]:
# dropping rows that have invalid links
result = np.array(remove)
charterdf.reset_index()
valid_df = charterdf.drop(result)
valid_df.head()

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
0,10019700000.0,http://www.maef.net/,,,,,,,,,...,-3.049363,18.0,-2.997944,51.0,-2.545622,0.007424,0.000446,0.000893,0.001005,0.002847
1,20000100000.0,https://education.alaska.gov/DOE_Rolodex/SchoolCalendar/Home/SchoolDetails/319010,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0


## Deprecated (for reference only)

In [25]:
schools = dict()
cap = len(school_urls)
#cap = 10
for i in range(cap):
    link, name = school_urls[i], school_names[i]
    #sanity check make sure all links are valid
    if not check_url(link):
        print("WARNING! " + name + " doesnt have a valid link! " + link)
    print("\nGetting links for " + str(i) + " " + school_names[i] + "...")
    #saving school names as keys and list of links as values
    schools.update({name: list(getLinks(link, 3))})
    # TO DO: PRINT NUMBER LINKS GATHERED (as in a "success" kind of statement)


Getting links for 0 AYAPRUN ELITNAURVIK...

Getting links for 1 FAMILY PARTNERSHIP CHARTER SCHOOL...

Getting links for 2 RILKE SCHULE CHARTER SCHOOL...

Getting links for 3 AURORA BOREALIS CHARTER SCHOOL...
Encountered this invalid link: http://nome.nosd.schoolaccess.net/~acsa/ ---Error code: [no code collected]

Getting links for 4 ANVIL CITY SCIENCE ACADEMY...
Encountered this invalid link: http://nome.nosd.schoolaccess.net/~acsa/ ---Error code: [no code collected]

Getting links for 5 EFFIE KOKRINE CHARTER SCHOOL...

Getting links for 6 KINGMAN ACADEMY OF LEARNING - HIGH SCHOOL...
Encountered this invalid link: mailto:schan@kaolaz.org ---Error code: [no code collected]
Encountered this invalid link: http://kaolaz.org/home/AZ088620204/high%20school/KAHS_SchoolSiteCouncil.pdf ---Error code: 404
Encountered this invalid link: mailto:elillis@kaolaz.org ---Error code: [no code collected]
Encountered this invalid link: mailto:msuchowierski@kaolaz.org ---Error code: [no code collected]
E

TypeError: must be str, not float

## Save output

In [86]:
# Save DataFrame as CSV to data-management, web-scraping, and scrapy-cluster repos:
#df.to_csv('../../data_management/data/charters_unscraped_hasURL_2015.csv', index=False)
#df.to_csv('../../web_scraping/data/charters_unscraped_hasURL_2015.csv', index=False)
#df.to_csv('../../scrapy-cluster/kafka-monitor/charter_urls_2016_unscraped.csv', index=False)

In [26]:
with open("../../web_scraping/data/badurls.txt", "w") as output:
    output.write(str(bad_links))

In [78]:
#saving df to test with wayback function
valid_df.to_pickle("../../web_scraping/data/charters_valid_urls_2015.pkl")