# State Assessment Data Retrieval

## Objective:

Retrieve assessment data for a charter school network by using a spider that follows hyperlinks and downloads the files containing testing results.

In [1]:
# importing packages
import scrapy
from scrapy import Selector
import requests as r
import pandas as pd
from scrapy.crawler import CrawlerProcess
import string as str

In [2]:
# instantiating web scraper objects via GET request
url = 'https://www.nj.gov/education/assessment/results/reports/2223/index.shtml' #--> url of site
page_response = r.get(url, timeout = 30) #--> GET request; the timeout keeps a stalled connection from hanging the cell
page_response.raise_for_status() #--> raise on a 4xx/5xx status instead of silently parsing an error page
html = page_response.content #--> raw HTML bytes returned by the GET request
sel = Selector(text = html) #--> instantiating Selector object used for the XPath queries

In [3]:
# retrieving links for assessment results
from urllib.parse import urljoin #--> stdlib helper that resolves an href against the page url

anchors = sel.xpath('//*[@id="section_0"]/ul/li/a') #--> <a> elements of the results list
links = anchors.extract() #--> raw <a> markup of every link
base_url = 'https://www.nj.gov' #--> site root (kept for reference; urljoin below derives it from url)

# Reading the @href attribute directly does not depend on href being the first
# quoted attribute in the markup, and urljoin copes with root-relative,
# page-relative and already-absolute hrefs alike.
new_links = [urljoin(url, href) for href in anchors.xpath('@href').extract()] #--> absolute links to be used in spider

# displaying links
for link in new_links:
    print(link)

https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA03%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA04%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA05%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA06%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA07%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA08%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA09%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT03%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT04%20NJSLA%20DATA%202022-23.xlsx
https://www.nj.gov/education/assessment/results/reports/2223/spr

In [4]:
# creating spider class
class SpiderClass(scrapy.Spider):
    """Spider that downloads the 2022-23 NJSLA assessment result workbooks.

    Every url in ``start_urls`` is requested once and the response body is
    written, byte for byte, to a file in the current working directory.

    ``start_urls`` is a class attribute rather than a local list, so another
    set of files can be fetched without editing the class, e.g.
    ``process.crawl(SpiderClass, start_urls=[...])`` (Scrapy copies spider
    keyword arguments onto the spider instance).
    """

    name = 'assessment_spider' #--> spider name

    # default download targets: one workbook per subject/grade
    start_urls = [
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA03%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA04%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA05%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA06%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA07%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA08%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ELA09%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT03%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT04%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT05%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT06%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT07%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/MAT08%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ALG01%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/GEO01%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/ALG02%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/SC05%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/SC08%20NJSLA%20DATA%202022-23.xlsx',
        'https://www.nj.gov/education/assessment/results/reports/2223/spring/SC11%20NJSLA%20DATA%202022-23.xlsx',
    ]

    def start_requests(self): #--> start request
        """Yield one download request per url in ``start_urls``.

        Yields:
            scrapy.Request: a GET request whose response is handled by ``parse``.
        """
        for link in self.start_urls: #--> for loop to iterate over links
            yield scrapy.Request(url = link, callback = self.parse)

    def parse(self, response): # --> start parse
        """Write the downloaded workbook to disk.

        The file is opened in binary write mode so the response body is stored
        without any encoding or decoding.

        Args:
            response: the Scrapy response for one of the requested urls.
        """
        # The file name is the last path segment of the url. It is used as-is,
        # so percent-escapes such as %20 stay in the name on disk.
        file_name = response.url.split('/')[-1]

        with open(file_name, 'wb') as f: #--> opens file for writing in binary mode
            f.write(response.body) #--> response.body contains the binary data which is written to the file

        # logs a message per saved file, which aids in errors and debugging
        self.log(f'Saved file {file_name}')
        

# Starting the crawler: build a crawler process, register the spider on it,
# then run the crawl. The three calls must stay in this order.
process = CrawlerProcess() #--> no settings are passed, so nothing is overridden
process.crawl(SpiderClass) #--> the spider class itself is handed over, not an instance
process.start() #--> runs the crawl; NOTE(review): presumably blocks until the spider finishes -- confirm

2024-08-30 14:37:09 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2024-08-30 14:37:09 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.10.9 (main, Mar  1 2023, 12:33:47) [Clang 14.0.6 ], pyOpenSSL 23.0.0 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 39.0.1, Platform macOS-10.16-x86_64-i386-64bit
2024-08-30 14:37:09 [scrapy.crawler] INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-08-30 14:37:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-08-30 14:37:09 [scrapy.extensions.telnet] INFO: Telnet Password: 70337e7aa254db37
2024-08-30 14:37:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage

2024-08-30 14:37:11 [assessment_spider] DEBUG: Saved file SC05%20NJSLA%20DATA%202022-23.xlsx
2024-08-30 14:37:11 [assessment_spider] DEBUG: Saved file SC11%20NJSLA%20DATA%202022-23.xlsx
2024-08-30 14:37:11 [assessment_spider] DEBUG: Saved file SC08%20NJSLA%20DATA%202022-23.xlsx
2024-08-30 14:37:11 [scrapy.core.engine] INFO: Closing spider (finished)
2024-08-30 14:37:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 6724,
 'downloader/request_count': 19,
 'downloader/request_method_count/GET': 19,
 'downloader/response_bytes': 27399296,
 'downloader/response_count': 19,
 'downloader/response_status_count/200': 19,
 'elapsed_time_seconds': 1.584619,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2024, 8, 30, 18, 37, 11, 168481),
 'log_count/DEBUG': 41,
 'log_count/INFO': 10,
 'memusage/max': 156680192,
 'memusage/startup': 156680192,
 'response_received_count': 19,
 'scheduler/dequeued': 19,
 'scheduler/dequeued/memory': 19,
 'scheduler