In [4]:
import pandas as pd
import csv
import requests
import xml.etree.ElementTree as ET
import zipfile
import boto3
from io import StringIO

In [5]:
 '''
     The structure of the Steeleye class:
     A Steeleye object takes one parameter:
    
     -url: URL of the XML document that is searched for the first download link whose file_type is DLTINS.
     
     Methods:
     -link: Requests the URL to get the XML file, then parses the XML file and returns the first download_link whose file_type is DLTINS.
     -extract: Downloads the zip file and extracts the XML file it contains.
     -xml_to_csv: Parses the XML file and converts it into a DataFrame that can be written out as CSV.
    '''

class Steeleye:
    '''
    Locate, download and flatten a DLTINS instrument file.

    :param url: address of an XML document whose <doc> elements carry
                <str name="file_type"> and <str name="download_link"> children.

    Attributes set by the methods:
      - download_link: link found by the last call to link() ('' if none).
      - xml_file:      name of the first archive member, set by extract().
      - df:            DataFrame built by the last call to xml_to_csv().
    '''

    def __init__(self, url = None) -> None:
        self.url = url
        # Stored under its own name so the link() method is never overwritten
        # by its own result and stays callable on every invocation.
        self.download_link = ''

    @staticmethod
    def _local_name(tag):
        '''Return an element tag without its '{namespace}' prefix ('' for non-string tags).'''
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def link(self):
        '''
        Request self.url, save the response body to 'registers.xml' (binary mode),
        parse that file and return the download link of the first <doc> whose
        file_type is 'DLTINS'.

        :return: the download link, or '' when no DLTINS entry with a link exists.
        :raises requests.HTTPError: when the server answers with an error status.
        '''
        response = requests.get(self.url)
        # Fail on HTTP errors instead of trying to parse an error page as XML.
        response.raise_for_status()
        with open('registers.xml', 'wb') as f:
            f.write(response.content)
        root = ET.parse('registers.xml').getroot()

        self.download_link = ''
        # Search every <doc> rather than relying on the result element being
        # the second child of the root.
        for doc in root.iter('doc'):
            if doc.findtext("str[@name='file_type']") != 'DLTINS':
                continue
            found = doc.findtext("str[@name='download_link']")
            if found:
                self.download_link = found
                break
        return self.download_link

    def extract(self, link = None):
        '''
        Download the zip archive at `link`, store it as 'zip_file.zip', extract
        all of its members into the current directory and return the name of
        the first member.

        :param link: URL of the zip archive. When omitted, the link found by
                     the last call to link() is used.
        :return: name of the first file listed in the archive.
        :raises ValueError: when no link was given and none has been found yet.
        :raises requests.HTTPError: when the server answers with an error status.
        '''
        target = link if link else self.download_link
        if not target:
            raise ValueError("No download link available: pass `link` or call link() first")

        response = requests.get(target)
        response.raise_for_status()
        with open('zip_file.zip', 'wb') as f:
            f.write(response.content)

        with zipfile.ZipFile('zip_file.zip', 'r') as archive:
            self.xml_file = archive.namelist()[0]
            archive.extractall()
        return self.xml_file

    def xml_to_csv(self, xml = None):
        '''
        :param xml: xml file (path or file object) which is to be converted

        Parse the xml file and, for every 'TermntdRcrd' element, collect the
        values for the following headers
        {FinInstrmGnlAttrbts.Id,
        FinInstrmGnlAttrbts.FullNm,
        FinInstrmGnlAttrbts.ClssfctnTp,
        FinInstrmGnlAttrbts.CmmdtyDerivInd,
        FinInstrmGnlAttrbts.NtnlCcy,
        Issr}

        Tags are compared by their local name (namespace prefix removed).
        Values that are absent from a record are left as missing.

        :return: a DataFrame with the above headers, one row per record.
        '''
        record_tag = 'TermntdRcrd'                 # main node
        attributes_tag = 'FinInstrmGnlAttrbts'     # required child node
        fields = ['Id', 'FullNm', 'ClssfctnTp', 'CmmdtyDerivInd', 'NtnlCcy']
        issuer_tag = 'Issr'                        # required child node

        columns = [attributes_tag + '.' + name for name in fields]
        columns.append(issuer_tag)
        field_position = {name: position for position, name in enumerate(fields)}
        issuer_position = len(fields)              # issuer is the last column

        rows = []
        for element in ET.parse(xml).getroot().iter():
            if self._local_name(element.tag) != record_tag:
                continue
            entry = [None] * len(columns)
            for child in element:
                child_name = self._local_name(child.tag)
                if child_name == attributes_tag:
                    for grandchild in child:
                        position = field_position.get(self._local_name(grandchild.tag))
                        if position is not None:
                            entry[position] = grandchild.text
                elif child_name == issuer_tag:
                    entry[issuer_position] = child.text
            rows.append(entry)

        self.df = pd.DataFrame(rows, columns=columns)
        return self.df

In [6]:
if __name__ == '__main__':

    # Step 1: point a Steeleye object at the XML document listing the files.
    url = "https://registers.esma.europa.eu/solr/esma_registers_firds_files/select?q=*&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D&wt=xml&indent=true&start=0&rows=100"
    data = Steeleye(url)

    # Step 2: find the first download link whose file_type is DLTINS.
    # The result must not be called `zipfile`: that would rebind the imported
    # module and break zipfile.ZipFile inside Steeleye.extract.
    download_link = data.link()

    # Step 3: download the zip and extract the xml from it.
    xmlfile = data.extract(download_link)

    # Step 4: convert the contents of the xml into a CSV.
    # Written without the index column so the local file matches the upload.
    dataframe = data.xml_to_csv(xmlfile)
    dataframe.to_csv('Blank-CSV-Template.csv', header = True, index = False)

    # Step 5: store the csv from step 4 in an AWS S3 bucket.
    # Credentials are resolved by boto3 itself (environment variables, shared
    # credentials file, or an attached role) and are never kept in the notebook.
    # NOTE(review): the access key pair that used to be hard-coded here has been
    # exposed and should be revoked and rotated.
    s3 = boto3.client("s3")
    csv_buf = StringIO()
    dataframe.to_csv(csv_buf, header = True, index = False)
    s3.put_object(Bucket="mysteeleyebucket", Body=csv_buf.getvalue(), Key='Blank-CSV-Template.csv')