In [None]:
import os
from os import listdir

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
class VAM():
    '''Extract object records from the Victoria and Albert Museum API.

    The requested accession-year range is split into windows of
    ``chunksize`` years. Each window is paged through the search endpoint
    and written to its own CSV file; ``compile_files`` then merges those
    files into a single CSV.

    See more information in:
    https://developers.vam.ac.uk/guide/v2/search/introduction.html

    https://api.vam.ac.uk/docs#/

    Attributes
    ----------

    object_type : str
        Name of the object type that will be searched

    year_accession_from : int
        First accession year (inclusive) of the range to extract

    year_accession_to : int
        Last accession year (inclusive) of the range to extract

    n_page : int, between 1 and 100
        Maximum number of pages requested for each window of years

    page_size : int, between 1 and 100
        Number of records requested per page

    chunksize : int, at least 1
        Range of years for each window (breaking a big request into
        smaller pieces)

    cursor_from : int
        First year of the window currently being extracted. Defaults to
        ``year_accession_from``; pass a later year to resume an
        interrupted extraction.

    cursor_to : int
        Last year of the window currently being extracted. Defaults to
        ``cursor_from + chunksize``.

    output : str
        Output directory for the extracted data
    '''

    # Search endpoint; the query string is built from a params dict so that
    # values such as object types containing spaces or '&' are encoded.
    BASE_URL = 'https://api.vam.ac.uk/v2/objects/search'

    # Seconds to wait for the API before giving up on a request, so that a
    # stalled connection cannot hang the extraction forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, object_type = None, year_accession_from = None, year_accession_to = None, n_page = None, page_size = None, chunksize = None, cursor_from = None, cursor_to = None, output = None):
        self.object_type = object_type
        self.year_accession_from = year_accession_from
        self.year_accession_to = year_accession_to
        self.n_page = n_page
        self.page_size = page_size
        self.chunksize = chunksize
        self.output = output

        # Honour explicitly supplied cursors (previously they were accepted
        # but discarded); otherwise start at the beginning of the range.
        if cursor_from is None:
            cursor_from = year_accession_from
        if cursor_to is None and cursor_from is not None and chunksize is not None:
            cursor_to = cursor_from + chunksize
        self.cursor_from = cursor_from
        self.cursor_to = cursor_to

    def _validate(self):
        '''Raise ValueError if a setting required by ``extract`` is unusable.'''
        required = ('object_type', 'year_accession_from', 'year_accession_to',
                    'n_page', 'page_size', 'chunksize', 'output')
        missing = [name for name in required if getattr(self, name) is None]
        if missing:
            raise ValueError(f'Missing required settings: {", ".join(missing)}')

        # A non-positive chunksize would never move the cursor past the end
        # of the range and the extraction loop would not terminate.
        if self.chunksize < 1:
            raise ValueError('chunksize must be at least 1')

    def extract(self):
        '''
        Execute the requests to the API to extract the data.

        One CSV file named ``VAM_<from>-<to>.csv`` is written to ``output``
        for every window of years. ``cursor_from`` and ``cursor_to`` are
        advanced as windows complete.

        Returns
        -------
        str
            The literal ``'Files created'``.

        Raises
        ------
        ValueError
            If a required setting is missing or ``chunksize`` is below 1.
        requests.HTTPError
            If the API answers with an error status.
        '''
        self._validate()

        # Cursors may still be unset when attributes were assigned after
        # construction; derive them the same way __init__ does.
        if self.cursor_from is None:
            self.cursor_from = self.year_accession_from
        if self.cursor_to is None:
            self.cursor_to = self.cursor_from + self.chunksize

        # Fail early (before any download) if the directory cannot be made.
        os.makedirs(self.output, exist_ok=True)

        headers = {'accept': 'application/json'}

        # Loop over the selected years, one window of `chunksize` at a time
        while self.cursor_from <= self.year_accession_to:

            # Never query or label years beyond the requested upper bound
            self.cursor_to = min(self.cursor_to, self.year_accession_to)

            records = []

            # Loop through the pages of the current window
            for page in tqdm(range(1, self.n_page + 1)):
                params = {
                    'q_object_type': self.object_type,
                    'year_accessioned_from': self.cursor_from,
                    'year_accessioned_to': self.cursor_to,
                    'order_sort': 'asc',
                    'page': page,
                    'page_size': self.page_size,
                    'cluster_size': 20,
                    'response_format': 'json',
                }

                resp = requests.get(
                    self.BASE_URL,
                    params=params,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # Surface HTTP errors directly instead of failing later on a
                # response body that has no 'records' key.
                resp.raise_for_status()

                page_records = resp.json()['records']

                # An empty page means the window is exhausted
                if not page_records:
                    break

                records.extend(page_records)

            # Flatten the nested JSON records into a table and export it
            result = pd.json_normalize(records)
            result.to_csv(
                os.path.join(self.output, f'VAM_{self.cursor_from}-{self.cursor_to}.csv'),
                index=False,
            )

            print(f'\n Concluded extraction From: {self.cursor_from} - To: {self.cursor_to} \n')

            # Move on to the next window of years
            self.cursor_from = self.cursor_to + 1
            self.cursor_to += self.chunksize

        return 'Files created'

    def compile_files(self):
        '''
        Read the output path and aggregate the CSV files into one file.

        Every ``VAM_*.csv`` file found in ``output`` is included, in file
        name order, and the result is written to
        ``Agg_VAM_<year_accession_from>-<year_accession_to>.csv`` in the
        same directory. Files without any content (written for windows that
        returned no records) are skipped.

        Returns
        -------
        str
            The literal ``'Files aggregated'``.
        '''
        frames = []

        # Sorted so the aggregated row order does not depend on the
        # arbitrary order in which the OS lists the directory.
        for archive in sorted(listdir(self.output)):

            # Only the per-window files produced by `extract` are targets
            if not (archive.startswith('VAM_') and archive.endswith('.csv')):
                continue

            try:
                frames.append(pd.read_csv(os.path.join(self.output, archive)))
            except pd.errors.EmptyDataError:
                # Window with no records: nothing to aggregate from it
                continue

        # A single concat replaces repeated DataFrame.append calls, which
        # were quadratic and no longer exist in pandas 2.x.
        df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Exporting aggregated result
        df.to_csv(
            os.path.join(
                self.output,
                f'Agg_VAM_{self.year_accession_from}-{self.year_accession_to}.csv',
            ),
            index=False,
        )

        print('\n Files aggregated \n')

        return 'Files aggregated'

    def run(self):
        '''Run the whole process: extract every window, then aggregate the files.'''
        self.extract()
        self.compile_files()

In [None]:
# Configure the extractor: paintings accessioned between 1850 and 2021,
# requested with a chunksize of 50 years and written under ./Output.
vam = VAM(
    object_type='Painting',
    year_accession_from=1850,
    year_accession_to=2021,
    n_page=100,
    page_size=100,
    chunksize=50,
    output='./Output',
)

In [None]:
%%time
# Run the extraction and then the aggregation of the generated files;
# the %%time cell magic above reports how long the whole cell takes.
vam.run()