In [26]:
import requests
from bs4 import BeautifulSoup

# Dataset landing page; passed to scrape_resource_ids() below, which looks
# for '/resource/<id>' links in its anchors.
webpage_url = 'https://birmingham-city-observatory.datopian.com/dataset/purchase-card-transactions'

# Function to scrape all resource IDs from the webpage
def scrape_resource_ids(url, timeout=30):
    """Return the unique resource IDs linked from a web page.

    Downloads ``url``, scans every ``<a href=...>`` for a ``/resource/``
    path segment and extracts the path component that follows it.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up
            (passed straight to ``requests.get``).

    Returns:
        A list of resource ID strings, de-duplicated, in the order in which
        they first appear on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping an error page and returning nothing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # A dict preserves insertion order, so duplicates are dropped without the
    # run-to-run reshuffling that list(set(...)) would introduce.
    unique_ids = {}

    # Find all 'a' tags with 'href' attributes
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/resource/' not in href:
            continue
        tail = href.split('/resource/', 1)[1]
        # Cut at the next path separator, query string or fragment.
        resource_id = tail.split('/')[0].split('?')[0].split('#')[0]
        # An href ending in '/resource/' yields an empty ID; ignore it.
        if resource_id:
            unique_ids[resource_id] = None

    return list(unique_ids)

# Collect the resource IDs linked from the dataset page
resource_ids = scrape_resource_ids(webpage_url)

# Show what was found
print("Resource IDs found:", resource_ids)

# Persist the IDs to disk, one per line
with open('resource_ids.txt', 'w') as out_file:
    out_file.write("".join(f"{rid}\n" for rid in resource_ids))


Resource IDs found: ['8e0a36f3-02c6-4cac-9750-4d6de2bcd76f', '381f8796-d8e8-4660-9c37-a326232ec09e', '8e0031bb-ddde-4560-8142-1119bc4f222b', '4af96278-1569-4b41-854c-b6af4d0405a4', '7eadc488-bf4e-4a22-a099-96174360160e', 'aab236e7-0485-4d2b-9751-e0a92dc21556', 'ff5e4310-b403-43a9-8ca0-f4e9bdeb863a', '198c6b2b-4362-4e7e-ab49-f450b126b6d9', '6408d4ad-4f9d-4751-ad38-b9306c1629a6', 'a8ebbfdc-6aa1-485c-b8ea-9f63f578ec33', '211b6316-7473-43b7-8461-70cfe4df076e', '6fc51bfe-27e1-4184-a4e0-80dd62f7c1c3', 'd44ebaa1-d865-46d1-9548-2e87f1fe931c', 'e8d58cbb-c67b-4f88-8476-2c648864623c', '4b210058-3f78-45ec-8771-ff0f857229f1', '1941fdfd-6486-4435-8ffc-4db790bbe355', 'b4fa2e44-62b6-43b9-9a4d-cfb7acb18adb', 'd3708796-7f84-4f26-ae9e-aedb452d7e82', 'abec4185-1fc7-4bac-a159-c77a88f0d204', '7e4cdf3e-d004-4ae5-8137-5f1f616135a0', '37abdafd-8c02-4828-957e-8f63f7b392d1', 'f7d4bf39-b514-4cd5-bd27-db19758afd45', 'cd2b2a61-c8ed-40c1-bf7c-97fdad267005', '490c0a7f-23c5-4520-a1e8-4bb1cd8aaa20', '46ad2213-569e-41ff

In [28]:
import requests
import pandas as pd

# Function to fetch data from a resource link and return as a DataFrame
def fetch_data(resource_id, timeout=60):
    """Download all records of one datastore resource as a DataFrame.

    The function is best-effort: any failure is printed and reported to the
    caller as an empty DataFrame rather than raised.

    Args:
        resource_id: Identifier inserted into the ``datastore_search`` query.
        timeout: Seconds to wait for the server before giving up
            (passed straight to ``requests.get``).

    Returns:
        A ``(df, api_url)`` tuple. ``df`` holds the returned records with the
        ``_id`` column removed, or is empty when the fetch failed. ``api_url``
        is the URL that was requested.
    """
    api_url = f'https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id={resource_id}&limit=999999999'

    try:
        response = requests.get(api_url, timeout=timeout)
        # The API describes failures in the JSON body (see the error branch
        # below), so parse the body rather than bailing out on HTTP status.
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network problem, timeout, or a body that is not valid JSON.
        print(f"Failed to fetch data for resource ID {resource_id}: {exc}")
        return pd.DataFrame(), api_url

    if isinstance(data, dict) and data.get('success'):
        result = data['result']
        records = result['records']
        df = pd.DataFrame(records)
        # Drop the '_id' column
        if '_id' in df.columns:
            df = df.drop(columns=['_id'])
        # When the response carries a 'total', use it to detect a server that
        # returned fewer rows than exist despite the very large 'limit'.
        total = result.get('total')
        if isinstance(total, int) and total > len(records):
            print(
                f"Warning: resource ID {resource_id} returned {len(records)} "
                f"of {total} records; the result is truncated."
            )
        return df, api_url

    # Error payloads are not guaranteed to carry a 'message' key, so fall
    # back to printing whatever error information is present.
    error = data.get('error') if isinstance(data, dict) else data
    message = error.get('message', error) if isinstance(error, dict) else error
    print(f"Failed to fetch data for resource ID {resource_id}: {message}")
    return pd.DataFrame(), api_url

# Read resource IDs from the file, skipping blank lines so that no request
# is made with an empty resource ID
with open('resource_ids.txt', 'r') as file:
    resource_ids = [line.strip() for line in file if line.strip()]

# Fetch each resource and collect the non-empty frames; concatenating once at
# the end avoids re-copying the accumulated data on every iteration
frames = []
for resource_id in resource_ids:
    df, api_url = fetch_data(resource_id)
    if not df.empty:
        frames.append(df)
        print(f"Data fetched successfully from {api_url}")

# Combine everything into a single DataFrame (empty if nothing was fetched)
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Collect every row that has at least one exact copy. keep=False marks all
# copies, so this count is larger than the number of rows removed below.
duplicates = all_data[all_data.duplicated(keep=False)]

# Save duplicates to a file
duplicates.to_csv('duplicates.csv', index=False)

# Remove duplicates from the DataFrame (the first occurrence is kept)
cleaned_data = all_data.drop_duplicates()

# Save the cleaned data to a CSV file
cleaned_data.to_csv('data.csv', index=False)

# Save the cleaned data to a pickle file
cleaned_data.to_pickle('data.pkl')

# Print confirmation
print(f"Total records fetched: {all_data.shape[0]}")
print(f"Total duplicates found: {duplicates.shape[0]}")
print(f"Total records after removing duplicates: {cleaned_data.shape[0]}")
print("Data saved to 'data.csv' and 'data.pkl'. Duplicates saved to 'duplicates.csv'.")



Data fetched successfully from https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id=8e0a36f3-02c6-4cac-9750-4d6de2bcd76f&limit=999999999
Data fetched successfully from https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id=381f8796-d8e8-4660-9c37-a326232ec09e&limit=999999999
Failed to fetch data for resource ID 8e0031bb-ddde-4560-8142-1119bc4f222b: Not found: Resource "8e0031bb-ddde-4560-8142-1119bc4f222b" was not found.
Data fetched successfully from https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id=4af96278-1569-4b41-854c-b6af4d0405a4&limit=999999999
Data fetched successfully from https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id=7eadc488-bf4e-4a22-a099-96174360160e&limit=999999999
Data fetched successfully from https://birmingham-city-observatory.datopian.com/api/3/action/datastore_search?resource_id=aab236e7-0485-4d2b-9751-e0a92

In [29]:
# Report row counts for the raw, duplicated and de-duplicated frames
print(
    f"Total records fetched: {len(all_data)}",
    f"Total duplicates found: {len(duplicates)}",
    f"Total records after removing duplicates: {len(cleaned_data)}",
    "Data saved to 'data.csv' and 'data.pkl'. Duplicates saved to 'duplicates.csv'.",
    sep="\n",
)

Total records fetched: 312944
Total duplicates found: 19800
Total records after removing duplicates: 300412
Data saved to 'data.csv' and 'data.pkl'. Duplicates saved to 'duplicates.csv'.
