In [None]:
"""
Fetch the text of every law section listed in the extracted chapter CSV files and save it back into those files under the predefined `Content` and `Metadata` columns.
"""

In [1]:
import pandas as pd
import glob
import regex as re
import requests
from bs4 import BeautifulSoup
# Glob pattern matching the chapter CSV files to enrich
folder_path = 'Extracted_Data/*.csv'  # Change this to your actual folder path

# A file counts as processed only when every one of these columns exists.
PROCESSED_COLUMNS = ('Content', 'Metadata')
# Headings that introduce the notes after the statute text, in lookup order.
METADATA_MARKERS = ('EDITORIAL NOTES', 'Editorial Notes', 'Historical and Revision Notes')
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def read_chapter_csv(path):
    """Read a chapter CSV, retrying as latin-1 when it is not valid UTF-8.

    Only a decoding failure triggers the retry; any other error (missing
    file, malformed CSV, ...) propagates to the caller.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='latin-1')


def is_processed(df):
    """Return True when ``df`` already has both the Content and Metadata columns."""
    return all(column in df.columns for column in PROCESSED_COLUMNS)


def clean_section(section):
    """Rewrite a section heading as "<number>. <title>".

    Whatever precedes the first run of digits (optionally followed by one
    capital letter) is dropped. A heading without digits is returned
    unchanged.
    """
    # Use regex to remove everything before the section number
    return re.sub(r'^.*?(\d+[A-Z]?)\.?\s*(.*)', r'\1. \2', section)


def remove_editorial_notes(text):
    """Return the stripped text that precedes the first "EDITORIAL NOTES" heading."""
    # NOTE(review): only the upper-case heading is cut here, while get_metadata
    # also recognises "Editorial Notes" and "Historical and Revision Notes";
    # confirm whether Content should be trimmed at those headings too.
    return text.partition('EDITORIAL NOTES')[0].strip()


def remove_forwarddata(text, section):
    """Return the stripped text that follows the first occurrence of ``section``.

    ``text`` is returned unchanged when ``section`` is empty or does not occur
    in it. Everything after the first occurrence is kept, even when the
    heading appears again later in the text.
    """
    if not section:
        return text
    _, heading, remainder = text.partition(section)
    return remainder.strip() if heading else text


def get_metadata(text):
    """Return the notes that follow the first recognised notes heading.

    The headings in ``METADATA_MARKERS`` are tried in order. Prints
    "NO METADATA" and returns ``None`` when none of them occurs in ``text``.
    """
    for marker in METADATA_MARKERS:
        _, heading, notes = text.partition(marker)
        if heading:
            return notes.strip()
    print("NO METADATA")
    return None


def get_data_from_url(row):
    """Download the page at ``row['Url']`` and return its text as one line.

    The text before the cleaned ``row['Section']`` heading is dropped when the
    heading is found; newlines and HTML comment remnants are removed.

    Returns:
        The page text, or the string "Error fetching data: <error>" when the
        download or parsing fails (the error is also printed).
    """
    url = row['Url']
    section = clean_section(row['Section'])  # Clean the section text

    try:
        # Make a request to the URL; the timeout keeps a stalled server from
        # blocking the whole batch.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the text data (customize this based on your needs)
        content = soup.get_text().strip()
        print(section)  # progress log: one line per downloaded section
        content = remove_forwarddata(content, section)
        content = content.replace("\n", "")
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
        return str(content.strip())

    except Exception as e:
        # Deliberate best effort: one failing URL must not abort the batch.
        # NOTE(review): the error text ends up in the Content column and the
        # file is then treated as processed on later runs.
        print(f"Error fetching data from {url}: {e}")
        return f"Error fetching data: {e}"


# Get a list of all CSV files in the specified folder
csv_files = glob.glob(folder_path)
# Iterate through each CSV file
for file in csv_files:
    print(file)
    df = read_chapter_csv(file)

    if is_processed(df):
        print(f"{file} is already processed!")
        continue

    # Apply the cleaning function to the Section column
    df['Section'] = df['Section'].apply(clean_section)

    # Go through each URL and fetch the data
    df['Content'] = df.apply(get_data_from_url, axis=1)
    # Extract the notes first, then cut them from the statute text
    df['Metadata'] = df['Content'].apply(get_metadata)
    df['Content'] = df['Content'].apply(remove_editorial_notes)
    df.to_csv(file, index=False)
    print("Data fetching completed and Saved.")


Extracted_Data\Chapter1-3_Robbery_and_Burglary.csv
103. —Front Matter
2111. Special maritime and territorial jurisdiction
2112. Personal property of United States
2113. Bank robbery and incidental crimes
2114. Mail, money, or other property of United States
2115. Post office
2116. Railway or steamboat post office
2117. Breaking or entering carrier facilities
2118. Robberies and burglaries involving controlled substances
2119. Motor vehicles
Data fetching completed and Saved.
Extracted_Data\Chapter101_Records_and_Reports.csv
101. —Front Matter
2071. Concealment, removal, or mutilation generally
2072. False crop reports
2073. False entries and reports of moneys or securities
2074. False weather reports
2075. Officer failing to make returns or reports
2076. Clerk of United States District Court
NO METADATA
Data fetching completed and Saved.
Extracted_Data\Chapter102_Riots.csv
102. —Front Matter
2101. Riots
2102. Definitions
NO METADATA
Data fetching completed and Saved.
Extracted_Data\Cha

In [2]:
folder_path = 'Extracted_Data/*.csv'  # Change this to your actual folder path


def count_nulls_per_file(pattern, column):
    """Print and return the number of missing values of ``column`` per CSV file.

    Args:
        pattern: Glob pattern selecting the CSV files to inspect.
        column: Name of the column whose null values are counted.

    Returns:
        Dict mapping each file path to its null count, or to ``None`` when
        the file has no such column.
    """
    counts = {}
    for file in glob.glob(pattern):
        try:
            df = pd.read_csv(file)
        except UnicodeDecodeError:
            # Only a decoding failure justifies the latin-1 retry
            df = pd.read_csv(file, encoding='latin-1')
        if column not in df.columns:
            # Report the gap instead of aborting the audit with a KeyError
            counts[file] = None
            print(f"{file}: no '{column}' column")
            continue
        null_count = df[column].isnull().sum()
        counts[file] = int(null_count)
        print(f"{file}: {null_count}")
    return counts


content_null_counts = count_nulls_per_file(folder_path, 'Content')

Extracted_Data\Chapter1-3_Robbery_and_Burglary.csv: 0
Extracted_Data\Chapter101_Records_and_Reports.csv: 0
Extracted_Data\Chapter102_Riots.csv: 0
Extracted_Data\Chapter105_Sabotage.csv: 0
Extracted_Data\Chapter107_Seamen_and_Stowaways.csv: 0
Extracted_Data\Chapter109A_Sexual_Abuse.csv: 0
Extracted_Data\Chapter109B_SEX_OFFENDER_AND_CRIMES_AGAINST_CHILDREN_REGISTRY.csv: 0
Extracted_Data\Chapter109_Searches_and_seizures.csv: 0
Extracted_Data\Chapter10_Biological_Weapons.csv: 0
Extracted_Data\Chapter110A_Domestic_Violence_and_stalking.csv: 0
Extracted_Data\Chapter110_SEXUAL_EXPLOITATION_AND_OTHER_ABUSE_OF_CHILDREN.csv: 0
Extracted_Data\Chapter111A_DESTRUCTION_OF_OR_INTERFERENCE_WITH_VESSELS_OR_MARITIME_FACILITIES.csv: 0
Extracted_Data\Chapter111_Shipping.csv: 0
Extracted_Data\Chapter113A_TELEMARKETING_AND_EMAIL_MARKETING_FRAUD.csv: 0
Extracted_Data\Chapter113B_Terrorism.csv: 0
Extracted_Data\Chapter113C_Torture.csv: 0
Extracted_Data\Chapter113_Stolen_property.csv: 0
Extracted_Data\Chapter1

In [3]:
folder_path = 'Extracted_Data/*.csv'  # Change this to your actual folder path


def count_nulls_per_file(pattern, column):
    """Print and return the number of missing values of ``column`` per CSV file.

    Args:
        pattern: Glob pattern selecting the CSV files to inspect.
        column: Name of the column whose null values are counted.

    Returns:
        Dict mapping each file path to its null count, or to ``None`` when
        the file has no such column.
    """
    counts = {}
    for file in glob.glob(pattern):
        try:
            df = pd.read_csv(file)
        except UnicodeDecodeError:
            # Only a decoding failure justifies the latin-1 retry
            df = pd.read_csv(file, encoding='latin-1')
        if column not in df.columns:
            # Report the gap instead of aborting the audit with a KeyError
            counts[file] = None
            print(f"{file}: no '{column}' column")
            continue
        null_count = df[column].isnull().sum()
        counts[file] = int(null_count)
        print(f"{file}: {null_count}")
    return counts


metadata_null_counts = count_nulls_per_file(folder_path, 'Metadata')

Extracted_Data\Chapter1-3_Robbery_and_Burglary.csv: 0
Extracted_Data\Chapter101_Records_and_Reports.csv: 1
Extracted_Data\Chapter102_Riots.csv: 1
Extracted_Data\Chapter105_Sabotage.csv: 1
Extracted_Data\Chapter107_Seamen_and_Stowaways.csv: 1
Extracted_Data\Chapter109A_Sexual_Abuse.csv: 0
Extracted_Data\Chapter109B_SEX_OFFENDER_AND_CRIMES_AGAINST_CHILDREN_REGISTRY.csv: 1
Extracted_Data\Chapter109_Searches_and_seizures.csv: 0
Extracted_Data\Chapter10_Biological_Weapons.csv: 1
Extracted_Data\Chapter110A_Domestic_Violence_and_stalking.csv: 2
Extracted_Data\Chapter110_SEXUAL_EXPLOITATION_AND_OTHER_ABUSE_OF_CHILDREN.csv: 3
Extracted_Data\Chapter111A_DESTRUCTION_OF_OR_INTERFERENCE_WITH_VESSELS_OR_MARITIME_FACILITIES.csv: 4
Extracted_Data\Chapter111_Shipping.csv: 5
Extracted_Data\Chapter113A_TELEMARKETING_AND_EMAIL_MARKETING_FRAUD.csv: 0
Extracted_Data\Chapter113B_Terrorism.csv: 5
Extracted_Data\Chapter113C_Torture.csv: 1
Extracted_Data\Chapter113_Stolen_property.csv: 3
Extracted_Data\Chapter1

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Chapter file inspected by the cells below. The forward slash replaces the
# original backslash, which formed the invalid escape sequence "\C" and only
# worked as a path separator on Windows.
file_name = "Extracted_Data/Chapter11A_Child_Support.csv"
# Keep the path without its extension so it can be reused when saving
file_name = file_name.split('.csv')[0]
print(file_name)
df = pd.read_csv(f'{file_name}.csv')

Extracted_Data\Chapter11A_Child_Support


In [56]:
import regex as re
def clean_section(section):
    """Rewrite a section heading as "<number>. <title>".

    Whatever precedes the first run of digits (optionally followed by one
    capital letter) is dropped, and the number is followed by ". " and the
    rest of the heading. A heading without digits is returned unchanged.
    """
    heading_pattern = r'^.*?(\d+[A-Z]?)\.?\s*(.*)'
    normalised_form = r'\1. \2'
    return re.sub(heading_pattern, normalised_form, section)

# Normalise every heading in the Section column to the "<number>. <title>" form
df['Section'] = df['Section'].map(clean_section)

In [10]:
# Show the stored text of the row labelled 0
print(df.loc[0, 'Content'])

18 USC Ch. 11A: Front Matter            Result 1 of 1                                      Current2018 Ed. and Supplement V (1/3/2024)2018 Ed. and Supplement IV (1/5/2023)2018 Ed. and Supplement III (1/3/2022)2018 Ed. and Supplement II (1/13/2021)2018 Ed. and Supplement I (1/24/2020)2018 Main Ed. (1/14/2019)2012 Ed. and Supplement V (1/12/2018)2012 Ed. and Supplement IV (1/6/2017)2012 Ed. and Supplement III (1/3/2016)2012 Ed. and Supplement II (1/5/2015)2012 Ed. and Supplement I (1/16/2014)2012 Main Ed. (1/15/2013)2006 Ed. and Supplement V (1/3/2012)2006 Ed. and Supplement IV (1/7/2011)2006 Ed. and Supplement III (2/1/2010)2006 Ed. and Supplement II (1/5/2009)2006 Ed. and Supplement I (1/8/2008)2006 Main Ed. (1/3/2007)2000 Ed. and Supplement V (1/2/2006)2000 Ed. and Supplement IV (1/3/2005)2000 Ed. and Supplement III (1/19/2004)2000 Ed. and Supplement II (1/6/2003)2000 Ed. and Supplement I (1/22/2002)2000 Main Ed. (1/2/2001)1994 Ed. and Supplement V (1/23/2000)1994 Ed. and Supplement I

In [57]:
# Show the cleaned section headings
print(df.loc[:, 'Section'])

0                                     5. -Front Matter
1    81. Arson within special maritime and territor...
Name: Section, dtype: object


In [58]:
# Helpers that trim the text extracted from a section page
def remove_editorial_notes(text):
    """Return the part of ``text`` that precedes the first "EDITORIAL NOTES" heading.

    Text without that heading is returned whole; leading and trailing
    whitespace is trimmed in both cases.
    """
    statute_text, _, _ = text.partition('EDITORIAL NOTES')
    return statute_text.strip()

def remove_forwarddata(text, section):
    """Return the stripped text that follows the first occurrence of ``section``.

    Args:
        text: Full text extracted from the page.
        section: Section heading that marks where the relevant text starts.

    Returns:
        Everything after the first occurrence of ``section``, stripped. The
        text is returned unchanged when ``section`` is empty or absent, so
        callers no longer need to catch IndexError/ValueError for those cases.
    """
    if not section:
        return text
    # partition keeps the whole remainder; split(section)[1] stopped at a
    # second occurrence of the heading and silently dropped what followed it.
    _, heading, remainder = text.partition(section)
    if not heading:
        return text
    return remainder.strip()

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def get_metadata(text):
    """Return the notes that follow the first "EDITORIAL NOTES" heading.

    Args:
        text: Page text that still contains the notes.

    Returns:
        Everything after the first heading, stripped, or ``None`` when the
        heading does not occur in ``text``.
    """
    _, heading, notes = text.partition('EDITORIAL NOTES')
    return notes.strip() if heading else None


def _collapse_page_text(text):
    """Drop newlines and HTML comment remnants, then trim both ends."""
    text = text.replace("\n", "")
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    return text.strip()


def _fetch_content_and_metadata(row):
    """Download one section page and split it into statute text and notes.

    Args:
        row: Record providing the page address under 'Url' and the section
            heading under 'Section'.

    Returns:
        Tuple ``(content, metadata)``. ``metadata`` is ``None`` when the page
        has no "EDITORIAL NOTES" heading. When the download or parsing fails
        the error is printed and ``("Error fetching data: <error>", None)``
        is returned.
    """
    url = row['Url']
    section = clean_section(row['Section'])  # Clean the section text

    try:
        # Make a request to the URL; the timeout keeps a stalled server from
        # blocking the run.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the text data (customize this based on your needs)
        page_text = soup.get_text().strip()
        try:
            page_text = remove_forwarddata(page_text, section)
        except (IndexError, ValueError):
            # Heading missing or empty: keep the whole page text
            pass

        # Both values come from the same text, so the notes are still
        # present when the metadata is extracted.
        content = _collapse_page_text(remove_editorial_notes(page_text))
        metadata = get_metadata(page_text)
        if metadata is not None:
            metadata = _collapse_page_text(metadata)
        return content, metadata

    except Exception as e:
        # Deliberate best effort: one failing URL must not abort the run
        print(f"Error fetching data from {url}: {e}")
        return f"Error fetching data: {e}", None


def get_data_from_url(row):
    """Return the statute text of the page at ``row['Url']`` without its notes.

    On failure the string "Error fetching data: <error>" is returned and the
    error is printed.
    """
    return _fetch_content_and_metadata(row)[0]


# Go through each URL once and fill both columns from the same download.
# The original passed whole rows to get_metadata (which expects text) and ran
# it after the notes had already been cut from Content.
fetched = [_fetch_content_and_metadata(row) for _, row in df.iterrows()]
df['Content'] = [content for content, _ in fetched]
df['Metadata'] = [metadata for _, metadata in fetched]

print(df['Content'])

print("Data fetching completed.")

0    18 USC Ch. 5: Front Matter            Result 1...
1    Whoever, within the special maritime and terri...
Name: Content, dtype: object
Data fetching completed.


In [59]:
# Inspect the fetched text of the row labelled 1
df.loc[1, 'Content']

'Whoever, within the special maritime and territorial jurisdiction of the United States, willfully and maliciously sets fire to or burns any building, structure or vessel, any machinery or building materials or supplies, military or naval stores, munitions of war, or any structural aids or appliances for navigation or shipping, or attempts or conspires to do such an act, shall be imprisoned for not more than 25 years, fined the greater of the fine under this title or the cost of repairing or replacing any property that is damaged or destroyed, or both.If the building be a dwelling or if the life of any person be placed in jeopardy, he shall be fined under this title or imprisoned for any term of years or for life, or both.(June 25, 1948, ch. 645, 62 Stat. 688; Pub. L. 103–322, title XXXIII, §330016(1)(H), (K), Sept. 13, 1994, 108 Stat. 2147; Pub. L. 104–132, title VII, §708(b), Apr. 24, 1996, 110 Stat. 1296; Pub. L. 107–56, title VIII, §§810(a), 811(a), Oct. 26, 2001, 115 Stat. 380, 38

In [60]:
# Inspect the fetched text of the row labelled 0
df.loc[0, 'Content']

'18 USC Ch. 5: Front Matter            Result 1 of 1                        \xa0             Current2018 Ed. and Supplement V (1/3/2024)2018 Ed. and Supplement IV (1/5/2023)2018 Ed. and Supplement III (1/3/2022)2018 Ed. and Supplement II (1/13/2021)2018 Ed. and Supplement I (1/24/2020)2018 Main Ed. (1/14/2019)2012 Ed. and Supplement V (1/12/2018)2012 Ed. and Supplement IV (1/6/2017)2012 Ed. and Supplement III (1/3/2016)2012 Ed. and Supplement II (1/5/2015)2012 Ed. and Supplement I (1/16/2014)2012 Main Ed. (1/15/2013)2006 Ed. and Supplement V (1/3/2012)2006 Ed. and Supplement IV (1/7/2011)2006 Ed. and Supplement III (2/1/2010)2006 Ed. and Supplement II (1/5/2009)2006 Ed. and Supplement I (1/8/2008)2006 Main Ed. (1/3/2007)2000 Ed. and Supplement V (1/2/2006)2000 Ed. and Supplement IV (1/3/2005)2000 Ed. and Supplement III (1/19/2004)2000 Ed. and Supplement II (1/6/2003)2000 Ed. and Supplement I (1/22/2002)2000 Main Ed. (1/2/2001)1994 Ed. and Supplement V (1/23/2000)1994 Ed. and Supplement

In [61]:
# file_name already carries the Extracted_Data prefix, so it is not prepended
# again; index=False keeps the row index from being saved as an extra column.
df.to_csv(f'{file_name}.csv', index=False)