# Wget using accept

In [1]:
# import necessary libraries
import csv
import errno
import os
import shutil
import subprocess
import urllib
import urllib.error
from socket import error as SocketError
from urllib.request import urlopen


In [2]:
# Input file and working folders for this notebook.
micro_sample_cvs, wget_folder, no_dir_folder, learning_wget = (
    # CSV holding the sample of schools (read with csv.DictReader below)
    "/Users/anhnguyen/Desktop/research/scraping_Python/micro-sample_Feb17.csv",
    # parent folder that receives one sub-folder of wget output per school
    "/Users/anhnguyen/Desktop/research/scraping_Python/wget_accept",
    # not referenced elsewhere in this section of the notebook
    "/Users/anhnguyen/Desktop/research/scraping_Python/no_dir",
    # not referenced elsewhere in this section of the notebook
    "/Users/anhnguyen/Desktop/research/scraping_Python/learning_wget",
)

In [3]:
# Load the sample CSV into a list of rows.
# Each row, sample[i], is a dictionary with the column names as keys and the
# cell contents as values.
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are handled by the csv parser rather than by open();
# the Windows-1252 encoding looks weird but works for this file.
with open(micro_sample_cvs, 'r', encoding='Windows-1252', newline='') as csvfile:
    sample = list(csv.DictReader(csvfile))

In [4]:
# Pull the columns we need out of the sample as parallel lists (one entry
# per school, same order as the CSV) so they can be zipped into tuples for wget.
url_list = [school["URL"] for school in sample]
name_list = [school["SCHNAM"] for school in sample]
terms_list = [school["ADDRESS"] for school in sample]  # filled from the ADDRESS column

In [5]:
# Pair each URL with the school name found at the same position.
tuple_list = [(url, name) for url, name in zip(url_list, name_list)]
# Peek at the first three pairs, then at one school name in title case.
print(tuple_list[:3])
print("\n", tuple_list[1][1].title())

[('https://www.richland2.org/charterhigh/', 'RICHLAND TWO CHARTER HIGH'), ('https://www.polk.edu/lakeland-gateway-to-college-high-school/', 'POLK STATE COLLEGE COLLEGIATE HIGH SCHOOL'), ('https://www.nhaschools.com/schools/rivercity/Pages/default.aspx', 'RIVER CITY SCHOLARS CHARTER ACADEMY')]

 Polk State College Collegiate High School


### Helper Functions

In [18]:
def format_folder_name(k, name):
    """Return "<k> <name>", left-padding k with zeros so output folders sort well.

    k below 10 gets two leading zeros, k below 100 gets one, and any other
    k is used as is (e.g. 30, "name me" -> "030 name me").
    """
    padding = "00" if k < 10 else ("0" if k < 100 else "")
    return padding + str(k) + " " + name

def run_wget_command(link, parent_folder, my_folder):
    """Run wget on ``link`` and store the output in ``parent_folder/my_folder``.

    The target folder is created if it doesn't exist yet. wget is started
    with that folder as its working directory; the working directory of the
    notebook process itself is left untouched.

    Args:
        link: URL to download recursively (3 levels deep, .html files only).
        parent_folder: folder that holds one sub-folder per download.
        my_folder: name of the sub-folder for this download.

    Returns:
        The exit status of wget (0 means wget reported success).

    Raises:
        FileNotFoundError: if the ``wget`` executable cannot be found.
        OSError: if the target folder cannot be created.
    """
    target_dir = os.path.join(parent_folder, my_folder)
    os.makedirs(target_dir, exist_ok=True)

    # The arguments are passed as a list, so no shell is involved and
    # characters such as '&', ';' or '?' in the link reach wget unchanged.
    # No fixed "Host:" header is sent: wget derives the right one from each URL.
    command = [
        "wget",
        "--no-parent",
        "--show-progress",
        "--progress=dot",
        "--recursive",
        "--level=3",
        "--convert-links",
        "--retry-connrefused",
        "--random-wait",
        "--no-cookies",
        "--secure-protocol=auto",
        "--no-check-certificate",
        "--execute", "robots=off",
        "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "--accept", ".html",
        link,
    ]
    return subprocess.run(command, cwd=target_dir).returncode
    

def contains_html(my_folder):
    """Tell whether a wget run succeeded by looking for a downloaded page.

    Returns True as soon as any file anywhere under ``my_folder`` has a name
    ending in '.html', and False when no such file is found.
    """
    return any(
        filename.endswith('.html')
        for _root, _dirs, filenames in os.walk(my_folder)
        for filename in filenames
    )
def count_with_file_ext(folder, ext):
    """Count the files under ``folder`` whose name ends with ``ext``.

    Args:
        folder: directory to search, including all of its sub-directories.
        ext: suffix to look for, e.g. '.html'.

    Returns:
        Number of matching files (0 if there are none).
    """
    return sum(
        1
        for _root, _dirs, filenames in os.walk(folder)
        for filename in filenames
        if filename.endswith(ext)
    )
def write_to_file(num, link, file_name):
    """Append one line of the form "<num><TAB><link>" to ``file_name``."""
    with open(file_name, "a") as out:
        record = "\t".join((str(num), link)) + "\n"
        out.write(record)
        
def write_file(str, file_name):
    """Append the text ``str`` to ``file_name`` exactly as given.

    Nothing is added to the text, so callers supply their own newlines.
    Note that the first parameter shadows the built-in ``str`` inside this
    function.
    """
    with open(file_name, "a") as out:
        out.write(str)
        
def reset(folder, text_file_1, text_file_2):
    """Delete everything in ``folder`` and set two text files to blank.

    Args:
        folder: directory to empty; it is removed with all of its contents
            and recreated, or simply created if it does not exist yet.
        text_file_1: first text file to blank out.
        text_file_2: second text file to blank out.

    The text files are handled by reset_text_file, which leaves files that
    do not exist untouched.
    """
    # Removing the whole tree and recreating the folder leaves it empty,
    # so no per-file cleanup is needed afterwards.
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)
    for file_name in (text_file_1, text_file_2):
        reset_text_file(file_name)
        
def reset_text_file(file_name):
    """Blank out ``file_name`` if it exists; do nothing if it does not."""
    if not os.path.exists(file_name):
        return
    # Opening in "w" mode truncates the file; nothing needs to be written.
    with open(file_name, "w"):
        pass

In [7]:
# Quick check of format_folder_name: 30 is below 100, so one zero is
# prepended and the expected output is "030 name me".
print(format_folder_name(30, "name me"))



030 name me


### Running wget

In [14]:
# Log files filled while crawling: the first collects the URLs whose
# download folder ends up containing an .html file, the second the others.
success_file, fail_file = (
    "/Users/anhnguyen/Desktop/research/scraping_Python/success.txt",
    "/Users/anhnguyen/Desktop/research/scraping_Python/fail.txt",
)

In [52]:
#reset(wget_folder, success_file, fail_file)

In [9]:

# Crawl one slice of the sample. School numbers are 1-based positions in
# tuple_list, so the slice [200:300] covers schools #201 through #300.
first_index, last_index = 200, 300
tuple_test = tuple_list[first_index:last_index]

# k keeps track of which entry in the sample we are on.
for k, tup in enumerate(tuple_test, start=first_index + 1):
    school_title = tup[1].title()
    print("Capturing website data for", school_title + ", which is school #" + str(k), "of " + str(last_index) + "...")

    # use the number and the school name to create a name for the folder
    dirname = format_folder_name(k, school_title)

    run_wget_command(tup[0], wget_folder, dirname)

    # A download counts as a success when its folder contains an .html file.
    school_folder = os.path.join(wget_folder, dirname)
    log_file = success_file if contains_html(school_folder) else fail_file
    # write_file adds nothing to the text, so terminate each URL with a
    # newline; the logs are read back one line per link.
    write_file(tup[0] + "\n", log_file)
print("done!")
    

Capturing website data for City On A Hill Charter Public School Ii, which is school #201 of 300...
Capturing website data for Circles Of Success Learning Academy, which is school #202 of 300...
Capturing website data for Chicago Park Community Charter, which is school #203 of 300...
Capturing website data for Chandler Park Academy - Elementary, which is school #204 of 300...
Capturing website data for Cedar Tree Academy Pcs, which is school #205 of 300...
Capturing website data for Capital City Lower Pcs, which is school #206 of 300...
Capturing website data for Cardinal Community Academy Charter School, which is school #207 of 300...
Capturing website data for Camino Nuevo Charter Academy No. 4, which is school #208 of 300...
Capturing website data for Camarillo Academy Of Progressive Education, which is school #209 of 300...
Capturing website data for Bridgewater Academy Charter, which is school #210 of 300...
Capturing website data for The James And Grace Lee Boggs School, which is 

### Limitation of wget

- wget only retrieves static HTML; it does not execute JavaScript, so any element generated by JavaScript will not be captured.

More info:

https://www.petekeen.net/archiving-websites-with-wget

http://askubuntu.com/questions/411540/how-to-get-wget-to-download-exact-same-web-page-html-as-browser

https://www.reddit.com/r/linuxquestions/comments/3tb7vu/wget_specify_dns_server/

Example of the error reported when a host name could not be resolved: `failed: nodename nor servname provided, or not known.`


In [9]:
def check(url):
    """Helper function, check if url is a valid link.

    Args:
        url: address to open with urlopen.

    Returns:
        True if the URL could be opened; False if the request failed with a
        URL/HTTP error or a socket error, or if ``url`` is not something
        urlopen can interpret as a URL.
    """
    try:
        # Only reachability matters: close the response straight away
        # instead of leaving the connection open.
        with urlopen(url):
            pass
    # HTTPError is a subclass of URLError, so URLError covers both.
    # ValueError is what urlopen raises for a malformed URL (e.g. no scheme).
    except (urllib.error.URLError, SocketError, ValueError):
        return False
    return True
#want to see how many html files?
# want to see how many links are invalid?
#read txt file

def read_txt(txt_file):
    """Read a text file that holds one link per line.

    Returns:
        A pair (links, count): the lines of the file with trailing
        whitespace removed, and the number of lines read.
    """
    with open(txt_file) as handle:
        links = [raw_line.rstrip() for raw_line in handle]
    return links, len(links)

In [10]:
# Read back the URLs logged in the success file and report how many there are.
success_links, count = read_txt(success_file)
print("There are {} links in success file.".format( count))
# print(success_links)

There are 243 links in success file.


In [11]:
# Read back the URLs logged in the fail file; note that this reuses (and
# overwrites) the name `count` from the success-file cell.
fail_links, count = read_txt(fail_file)
print("There are {} links in fail file.".format( count))

There are 57 links in fail file.


In [23]:
# counting # of html files
# def count_html(file):
    
def count_valid_links(list_of_links, valid_file, invalid_file):
    """Split links by whether check() accepts them, log both groups, count them.

    Links for which check() returns True are appended, one per line, to
    ``valid_file``; all others are appended to ``invalid_file``. Both files
    are written once, after every link has been checked.

    Returns:
        A pair (number of valid links, number of invalid links).
    """
    reachable, unreachable = [], []
    for link in list_of_links:
        (reachable if check(link) else unreachable).append(link)
    write_file("".join(link + '\n' for link in reachable), valid_file)
    write_file("".join(link + '\n' for link in unreachable), invalid_file)
    return len(reachable), len(unreachable)



In [24]:
# Output files for the re-check of the failed links. They are blanked
# first (only if they already exist) because write_file appends to them.
valid_list, invalid_list = (
    '/Users/anhnguyen/Desktop/research/scraping_Python/valid_links.txt',
    '/Users/anhnguyen/Desktop/research/scraping_Python/invalid_links.txt',
)
reset_text_file(valid_list)
reset_text_file(invalid_list)

In [25]:

# Re-check every URL from the fail file: the ones that can be opened are
# logged to valid_list, the rest to invalid_list.
count_success, count_fail = count_valid_links(fail_links, valid_list, invalid_list)


In [26]:
# Summarize the re-check of the links that wget failed on.
print("There are {} valid links and {} invalid links".format(count_success, count_fail))

There are 31 valid links and 26 invalid links


In [None]:
# recheck links without "/"
