### Notebook to parse xml to produce cleaned text of federal regulations

Sean Rehaag

License: Creative Commons Attribution-NonCommercial 4.0 International [(CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/). NOTE: Users must also comply with upstream [licensing](https://www.justice.gc.ca/eng/terms-avis/index.html) for the data source.

Dataset & Code to be cited as: 

    Sean Rehaag, "Federal Regulations Bulk Decisions Dataset" (2024), online: Refugee Law Laboratory <https://refugeelab.ca/bulk-data/regulations-fed/>.

Notes:

(1) Data Source: [Department of Justice Github](https://github.com/justicecanada/laws-lois-xml) & [Department of Justice Website](https://laws-lois.justice.gc.ca).

(2) Unofficial Data: The data are unofficial reproductions of materials available on the Department Justice's Consolidated Acts and Regulations of Canada website. Official versions are available [here](https://laws-lois.justice.gc.ca/eng/regulations/).

(3) Non-Affiliation / Endorsement: The data has been collected and reproduced without any affiliation or endorsement from the Government of Canada.

(4) Non-Commerical Use: As indicated in the license, data may be used for non-commercial use (with attribution) only. For commercial use, see the Department of Justice website's [Terms of Use](https://www.justice.gc.ca/eng/terms-avis/index.html).

(5) Accuracy: Data was collected and processed programmatically for the purposes of academic research. While we make best efforts to ensure accuracy, data gathering of this kind inevitably involves errors. As such the data should be viewed as preliminary information aimed to prompt further research and discussion, rather than as definitive information. 

In [1]:
##############################################
##############################################
# NOTE: Github API does not see files beyond #
# 1000 files in a directory (it truncates).  #
# So, locally clone the repo and point to    #
# the relevant dir_path                      #
##############################################
##############################################


# set paths
dir_path = 'd:/AI-Projects/laws-lois-xml/'  # Local clone of Github repo
en_path = dir_path + 'eng/regulations/'
fr_path = dir_path + 'fra/reglements/'
out_path_hf = 'd:/AI-Projects/canadian-legal-data/REGULATIONS-FED/'



In [2]:
# Process English Regulations

# Tables aren't extracted.

from lxml import etree as ET
import pandas as pd
import os
import time
import re
import numpy as np

def get_root_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        root = ET.parse(f).getroot()
    return root

def fix_errors(text):
    text = text.replace('(   ','(')
    text = text.replace(' )',')')
    text = text.replace(' .','.')
    text = text.replace(' ,',',')
    text = text.replace(' ;',';')
    text = text.replace('    ',' ')
    text = text.replace('   ',' ')
    text = text.replace('  ',' ')
    text = text.replace('  ',' ')
    text = text.replace('  ',' ')
    return text

def extract_text(elem):
    text_parts = []  

    if elem.text:
        if elem.tag == 'DefinedTermEn':  # need to change if FR
            text_parts.append('\n'+'*'+elem.text.strip()+'*')    
        else:     
            text_parts.append(elem.text.strip().replace('\n',''))

    for child in elem:
        child_text = extract_text(child) if child.tag != 'FootnoteRef' else None
        if child.tag == 'FootnoteRef':
            ref_text = '{' + (child.text.strip() if child.text else '') + '} '
            child_text = ref_text + (child.tail.strip().replace('\n', '') if child.tail else '')
        if child_text:
            text_parts.append(' ' + child_text + ' ')
        elif child.tail: 
            tail_text = child.tail.strip().replace('\n', '')
            if tail_text:
                text_parts.append(' ' + tail_text + ' ')
            
    if elem.tail:
        tail_text = elem.tail.strip().replace('\n','')
        if tail_text:
            text_parts.append(' ' + tail_text + ' ')
    return fix_errors(''.join(text_parts))

def get_table_text(elem):

    # Placeholder for table text
    return ''

def extract_ordered_elements(root):
    ordered_tag = []
    ordered_text = []
    
    elements_to_extract = [
        'TitleText',
        'Label',
        'Text',
        'MarginalNote',
        'TableGroup'
    ]

    special_labels = ['DIVISION', 'PART', 'SCHEDULE']
    
    for elem in root.iter():
        if elem.tag in elements_to_extract:       
            full_text = extract_text(elem)
            if full_text is None:
                continue
            if elem.tag == 'MarginalNote':
                ordered_text.append('\n\n### ' + full_text + '\n')
            elif elem.tag == 'Label':
                if ordered_tag and ordered_tag[-1] == 'Label':
                    ordered_text.append(full_text)
                elif any(special_labels in full_text for special_labels in special_labels):
                    ordered_text.append('\n\n# '+full_text)
                else:
                    ordered_text.append('\n' + full_text)
            elif elem.tag == 'TitleText':
                ordered_text.append('\n\n## ' + full_text)
            elif elem.tag == 'Text':
                ordered_text.append(full_text)
            elif elem.tag == 'TableGroup':
                ordered_text.append(get_table_text(elem))
                                    
            ordered_tag.append((elem.tag))

    ordered_text = ' '.join(ordered_text)

    ordered_text = re.sub(r'\n(?=\d)', '\n\n', ordered_text)

    breakpoints = ['# SCHEDULE',
                   '## RELATED PROVISIONS']
    break_applied = False               
    for break_point in breakpoints:
        if break_point in ordered_text:
            ordered_text = ordered_text[:ordered_text.index(break_point)]
            break_applied = True
            break

    repealed_info = root.find('.//Repealed')
    if repealed_info is not None and '[Repealed' not in ordered_text and '[Revoked' not in ordered_text and not break_applied:
        ordered_text = ordered_text + '\n\n' + repealed_info.text.strip()

    return ordered_text

def extract_citation(root, all_info = True):
    consolidated_number_element = root.find('.//InstrumentNumber')
    if consolidated_number_element is not None:
        citation = consolidated_number_element.text
    else:
        citation = ''

    return citation.strip()

def extract_date_registered(root, add_text = True):
    registered = root.find(".//RegistrationDate")
    if registered is not None:
        year = registered.find(".//YYYY").text
        month = registered.find(".//MM").text
        day = registered.find(".//DD").text
        date_registered = f"{year}-{month}-{day}"
    else:
        date_registered = ""
    return date_registered

def fix_doc_date(root, document_date, citation):
    if document_date == '':
        if citation[:3] == 'CRC': 
            if '1949' in citation:
                document_date = '1949-01-01'  # NOTE, JUST FOR YEARS, COULD GET MORE SPECIFIC
            elif '1955' in citation:
                document_date = '1955-01-01'  # NOTE, JUST FOR YEARS, COULD GET MORE SPECIFIC
            else:
                document_date = '1979-08-15'  # Came into force due to SI/79-131
        else:
            registered = root.find(".//RegulationMakerOrder")
            if registered is not None:
                year = registered.find(".//YYYY").text
                month = registered.find(".//MM").text
                day = registered.find(".//DD").text
                document_date = f"{year}-{month}-{day}"
            else:
                # manual fixes
                if citation == "SOR/54-743":
                    document_date = '1954-12-28'
                elif citation == "SOR/56-290":
                    document_date = '1956-07-19'
                elif citation == "SOR/57-176":
                    document_date = '1957-04-11'
                elif citation == "SOR/61-378":
                    document_date = '1961-08-22'
                elif citation == "SOR/67-619":
                    document_date = '1967-01-01' # NOTE JUST YEAR

                else:
                    document_date = ""
    return document_date

def extract_long_title(root):
    long_title_element = root.find(".//LongTitle")
    return ''.join(long_title_element.itertext())

def extract_short_title(root):
    short_title_element = root.find(".//ShortTitle")
    return ''.join(short_title_element.itertext())

def extract_title(root, include_short = True):
    if include_short:
        try:
            title = extract_short_title(root)
        except:
            title = extract_long_title(root)
    else:
        try:
            title = extract_long_title(root)
        except:
            title = ''
    # remove \n and extra spaces
    title = re.sub(r'\s+', ' ', title)
    return title

def extract_Enabling_Authority(root):
    try:
        enabling_authority = extract_text(root.find(".//EnablingAuthority"))
        if enabling_authority is None:
            enabling_authority = ''
        if enabling_authority != '':
            enabling_authority = 'Enabling authority: ' + enabling_authority.strip()
    except:
        enabling_authority = ''

    if enabling_authority is None:
        enabling_authority = ''
    return enabling_authority

def extract_full_text(root):
    title = str(extract_title(root, include_short=False))
    registered_date = str(extract_date_registered(root))
    ordered_text = str(extract_ordered_elements(root))
    enabling_authority = str(extract_Enabling_Authority(root))
    full_text = title + '\n\n' + registered_date + '\n\n' + enabling_authority + '\n\n' + ordered_text
    full_text = re.sub(r'^\s+$', '\n', full_text, flags=re.MULTILINE)
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
    return full_text

# iterate through all files in /acts/ folder and extract text to df
files = os.listdir(en_path)
data = []
for file in files:
    # skip 
    if file == 'regs.txt' or file == 'PLACEHOLDER2.xml':
        continue
    try:
        root = get_root_from_file(en_path+file)
        citation = extract_citation(root, all_info=False).replace('.','')
        document_date = extract_date_registered(root, add_text=False)
        document_date = fix_doc_date(root, document_date, citation)
        title = extract_title(root)
        full_text = extract_full_text(root)
        unofficial_text = '# '+ title + '\n\n' + citation + '\n\n' + full_text
        citation2 = ""
        dataset = "REGULATIONS-FED"
        year = np.int32(int(document_date[:4]))
        language = 'en'
        source_url = 'https://github.com/justicecanada/laws-lois-xml/tree/main/eng/regulations/' + file
        scraped_timestamp = time.strftime('%Y-%m-%d')
        other = ''
        data.append([citation,
                     citation2, 
                     dataset, 
                     year, 
                     title, 
                     language,
                     document_date, 
                     source_url,
                     scraped_timestamp,
                     unofficial_text,
                     other])
    except Exception as e:
        print(f'Error in {file}')
        print(e)

df_regs_en = pd.DataFrame(data, columns=['citation',
                                 'citation2',
                                 'dataset',
                                 'year',
                                 'name',
                                 'language',
                                 'document_date', 
                                 'source_url',
                                 'scraped_timestamp',
                                 'unofficial_text',
                                 'other'
                                 ])

# export to json
df_regs_en.to_json('DATA/df_regs_en.json', orient='records', lines=True)

df_regs_en

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,"CRC, c 10",,REGULATIONS-FED,1979,Flying Accidents Compensation Regulations,en,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Flying Accidents Compensation Regulations\n\...,
1,"CRC, c 100",,REGULATIONS-FED,1979,Ottawa International Airport Zoning Regulations,en,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Ottawa International Airport Zoning Regulati...,
2,"CRC, c 101",,REGULATIONS-FED,1979,Penticton Airport Zoning Regulations,en,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,"# Penticton Airport Zoning Regulations\n\nCRC,...",
3,"CRC, c 1013",,REGULATIONS-FED,1979,Canada Industrial Relations Remuneration Regul...,en,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Canada Industrial Relations Remuneration Reg...,
4,"CRC, c 1015",,REGULATIONS-FED,1979,Fair Wages and Hours of Labour Regulations,en,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Fair Wages and Hours of Labour Regulations\n...,
...,...,...,...,...,...,...,...,...,...,...,...
4713,SOR/99-53,,REGULATIONS-FED,1999,Competency of Operators of Pleasure Craft Regu...,en,1999-1-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Competency of Operators of Pleasure Craft Re...,
4714,SOR/99-7,,REGULATIONS-FED,1998,"Ozone-Depleting Substances Regulations, 1998",en,1998-12-16,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,"# Ozone-Depleting Substances Regulations, 1998...",
4715,SOR/99-86,,REGULATIONS-FED,1999,Proclamation Designating Certain Countries as ...,en,1999-2-10,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Proclamation Designating Certain Countries a...,
4716,SOR/99-93,,REGULATIONS-FED,1999,Tobacco (Access) Regulations,en,1999-2-11,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Tobacco (Access) Regulations\n\nSOR/99-93\n\...,


In [3]:
# Process French Regulations

# Tables aren't extracted.

from lxml import etree as ET
import pandas as pd
import os
import time
import re

def get_root_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        root = ET.parse(f).getroot()
    return root

def fix_errors(text):
    text = text.replace('(   ','(')
    text = text.replace(' )',')')
    text = text.replace(' .','.')
    text = text.replace(' ,',',')
    text = text.replace(' ;',';')
    text = text.replace('    ',' ')
    text = text.replace('   ',' ')
    text = text.replace('  ',' ')
    text = text.replace('  ',' ')
    text = text.replace('  ',' ')
    return text

def extract_text(elem):
    text_parts = []  

    if elem.text:
        if elem.tag == 'DefinedTermFr':  # need to change if FR
            text_parts.append('\n'+'*'+elem.text.strip()+'*')    
        else:     
            text_parts.append(elem.text.strip().replace('\n',''))

    for child in elem:
        child_text = extract_text(child) if child.tag != 'FootnoteRef' else None
        if child.tag == 'FootnoteRef':
            ref_text = '{' + (child.text.strip() if child.text else '') + '} '
            child_text = ref_text + (child.tail.strip().replace('\n', '') if child.tail else '')
        if child_text:
            text_parts.append(' ' + child_text + ' ')
        elif child.tail: 
            tail_text = child.tail.strip().replace('\n', '')
            if tail_text:
                text_parts.append(' ' + tail_text + ' ')
            
    if elem.tail:
        tail_text = elem.tail.strip().replace('\n','')
        if tail_text:
            text_parts.append(' ' + tail_text + ' ')
    return fix_errors(''.join(text_parts))

def get_table_text(elem):

    # Placeholder for table text
    return ''

def extract_ordered_elements(root):
    ordered_tag = []
    ordered_text = []
    
    elements_to_extract = [
        'TitleText',
        'Label',
        'Text',
        'MarginalNote',
        'TableGroup'
    ]

    special_labels = ['SECTION', 'PARTIE', 'ANNEXE']

    
    for elem in root.iter():
        if elem.tag in elements_to_extract:       
            full_text = extract_text(elem)
            if full_text is None:
                continue
            if elem.tag == 'MarginalNote':
                ordered_text.append('\n\n### ' + full_text + '\n')
            elif elem.tag == 'Label':
                if ordered_tag and ordered_tag[-1] == 'Label':
                    ordered_text.append(full_text)
                elif any(special_labels in full_text for special_labels in special_labels):
                    ordered_text.append('\n\n# '+full_text)
                else:
                    ordered_text.append('\n' + full_text)
            elif elem.tag == 'TitleText':
                ordered_text.append('\n\n## ' + full_text)
            elif elem.tag == 'Text':
                ordered_text.append(full_text)
            elif elem.tag == 'TableGroup':
                ordered_text.append(get_table_text(elem))
                                    
            ordered_tag.append((elem.tag))

    ordered_text = ' '.join(ordered_text)

    ordered_text = re.sub(r'\n(?=\d)', '\n\n', ordered_text)

    breakpoints = ['# ANNEX',
                   '#  ANNEX',
                   '#  SCHEDULE',
                   '# SCHEDULE',
                   '## DISPOSITIONS CONNEXES']
                   
    break_applied = False
    for break_point in breakpoints:
        if break_point in ordered_text:
            ordered_text = ordered_text[:ordered_text.index(break_point)]
            break_applied = True
            break

    repealed_info = root.find('.//Repealed')
    if repealed_info is not None and '[Abrogé' not in ordered_text and not break_applied:
        ordered_text = ordered_text + '\n\n' + repealed_info.text.strip()

    return ordered_text

def extract_citation(root, all_info = True):
    consolidated_number_element = root.find('.//InstrumentNumber')
    if consolidated_number_element is not None:
        citation = consolidated_number_element.text
    else:
        citation = ''

    return citation.strip()

def extract_date_registered(root, add_text = True):
    registered = root.find(".//RegistrationDate")
    if registered is not None:
        year = registered.find(".//YYYY").text
        month = registered.find(".//MM").text
        day = registered.find(".//DD").text
        date_registered = f"{year}-{month}-{day}"
    else:
        date_registered = ""
    return date_registered

def fix_doc_date(root, document_date, citation):
    if document_date == '':
        if citation[:3] == 'CRC':
            if '1949' in citation:
                document_date = '1949-01-01' # NOTE, JUST FOR YEARS, COULD GET MORE SPECIFIC
            elif '1955' in citation:
                document_date = '1955-01-01' # NOTE, JUST FOR YEARS, COULD GET MORE SPECIFIC
            else:
                document_date = '1979-08-15'  # Came into force due to SI/79-131
        else:
            registered = root.find(".//RegulationMakerOrder")
            if registered is not None:
                year = registered.find(".//YYYY").text
                month = registered.find(".//MM").text
                day = registered.find(".//DD").text
                document_date = f"{year}-{month}-{day}"
            else:
                # manual fixes
                if citation == "DORS/54-743":
                    document_date = '1954-12-28'
                elif citation == "DORS/56-290":
                    document_date = '1956-07-19'
                elif citation == "DORS/57-176":
                    document_date = '1957-04-11'
                elif citation == "DORS/61-378":
                    document_date = '1961-08-22'
                elif citation == "DORS/67-619":
                    document_date = '1967-01-01' # NOTE JUST YEAR

                else:
                    document_date = ""
    return document_date

def extract_long_title(root):
    long_title_element = root.find(".//LongTitle")
    return ''.join(long_title_element.itertext())

def extract_short_title(root):
    short_title_element = root.find(".//ShortTitle")
    return ''.join(short_title_element.itertext())

def extract_title(root, include_short = True):
    if include_short:
        try:
            title = extract_short_title(root)
        except:
            title = extract_long_title(root)
    else:
        try:
            title = extract_long_title(root)
        except:
            title = ''
    # remove \n and extra spaces
    title = re.sub(r'\s+', ' ', title)
    return title

def extract_Enabling_Authority(root):
    try:
        enabling_authority = extract_text(root.find(".//EnablingAuthority"))
        if enabling_authority is None:
            enabling_authority = ''
        if enabling_authority != '':
            enabling_authority = 'Pouvoir habilitant: ' + enabling_authority.strip()  
    except:
        enabling_authority = ''

    if enabling_authority is None:
        enabling_authority = ''
    return enabling_authority

def extract_full_text(root):
    title = str(extract_title(root, include_short=False))
    registered_date = str(extract_date_registered(root))
    ordered_text = str(extract_ordered_elements(root))
    enabling_authority = str(extract_Enabling_Authority(root))
    full_text = title + '\n\n' + registered_date + '\n\n' + enabling_authority + '\n\n' + ordered_text
    full_text = re.sub(r'^\s+$', '\n', full_text, flags=re.MULTILINE)
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
    return full_text

# iterate through all files in /acts/ folder and extract text to df
files = os.listdir(fr_path)
data = []
for file in files:
    # skip 
    if file == 'regs.txt' or file == 'PLACEHOLDER2.xml':
        continue
    try:
        root = get_root_from_file(fr_path+file)
        citation = extract_citation(root, all_info=False).replace('.','')
        document_date = extract_date_registered(root, add_text=False)
        document_date = fix_doc_date(root, document_date, citation)
        title = extract_title(root)
        full_text = extract_full_text(root)
        unofficial_text = '# '+ title + '\n\n' + citation + '\n\n' + full_text
        citation2 = ""
        dataset = "REGULATIONS-FED"
        year = np.int32(int(document_date[:4]))
        language = 'fr'
        source_url = 'https://github.com/justicecanada/laws-lois-xml/tree/main/fra/reglements/'+file
        scraped_timestamp = time.strftime('%Y-%m-%d')
        other = ''
        data.append([citation,
                     citation2, 
                     dataset, 
                     year, 
                     title, 
                     language,
                     document_date, 
                     source_url,
                     scraped_timestamp,
                     unofficial_text,
                     other])
    except Exception as e:
        print(f'Error in {file}')
        print(e)

df_regs_fr = pd.DataFrame(data, columns=['citation',
                                 'citation2',
                                 'dataset',
                                 'year',
                                 'name',
                                 'language',
                                 'document_date', 
                                 'source_url',
                                 'scraped_timestamp',
                                 'unofficial_text',
                                 'other'
                                 ])

# export to json
df_regs_fr.to_json('DATA/df_regs_fr.json', orient='records', lines=True)

df_regs_fr

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,"CRC, ch 10",,REGULATIONS-FED,1979,Règlement sur l’indemnisation en cas d’acciden...,fr,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Règlement sur l’indemnisation en cas d’accid...,
1,"CRC, ch 100",,REGULATIONS-FED,1979,Règlement de zonage de l’aéroport internationa...,fr,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Règlement de zonage de l’aéroport internatio...,
2,"CRC, ch 101",,REGULATIONS-FED,1979,Règlement de zonage de l’aéroport de Penticton,fr,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Règlement de zonage de l’aéroport de Pentict...,
3,"CRC, ch 1013",,REGULATIONS-FED,1979,Règlement du Canada sur la rémunération dans l...,fr,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Règlement du Canada sur la rémunération dans...,
4,"CRC, ch 1015",,REGULATIONS-FED,1979,Règlement sur les justes salaires et les heure...,fr,1979-08-15,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Règlement sur les justes salaires et les heu...,
...,...,...,...,...,...,...,...,...,...,...,...
4713,TR/99-80,,REGULATIONS-FED,1999,Décret de remise visant le directeur exécutif ...,fr,1999-8-18,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Décret de remise visant le directeur exécuti...,
4714,TR/99-81,,REGULATIONS-FED,1999,Décret de remise visant le directeur de la Com...,fr,1999-8-18,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Décret de remise visant le directeur de la C...,
4715,TR/99-82,,REGULATIONS-FED,1999,Décret de remise visant Télésat Canada,fr,1999-8-18,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Décret de remise visant Télésat Canada\n\nTR...,
4716,TR/99-9,,REGULATIONS-FED,1999,Décret sur la renonciation aux terres réservée...,fr,1999-2-3,https://github.com/justicecanada/laws-lois-xml...,2025-01-05,# Décret sur la renonciation aux terres réserv...,


In [4]:
# Export parquet to HuggingFace

# combaine the two df
df_regs = pd.concat([df_regs_en, df_regs_fr], axis=0)
# reset index
df_regs.reset_index(drop=True, inplace=True)
df_regs

# export to parquet
df_regs.to_parquet(out_path_hf+'train.parquet')


## VERIFICATION

In [None]:
# manually view unofficial_text in en
from IPython.display import clear_output
while True:
    print('Enter row number to print unofficial_text (or exit to exit):')
    clear_output(wait=True)
    os.system('cls')
    row_sought = input()
    if row_sought == 'exit':
        break
    print(df_regs_en.iloc[int(row_sought)].unofficial_text)


In [None]:
# manually view unofficial_text in fr
from IPython.display import clear_output
while True:
    print('Enter row number to print unofficial_text (or exit to exit):')
    clear_output(wait=True)
    os.system('cls')
    row_sought = input()
    if row_sought == 'exit':
        break
    print(df_regs_fr.iloc[int(row_sought)].unofficial_text)

In [None]:
df_regs_en