# Scraping Charity Intelligence Canada

## Bibliotecas

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Paso 1: Investigar la página

Use las DevTools para saber cómo buscar en el código los links que me interesaban, además de notar que las páginas con los links eran la misma dirección y sólo se moodificaba el último número de 20 en 20.

![devtools](./images/devtools_01.png "Click derecho inspect")

## Paso 2: Solicitud a Chat GPT
Después pregunte a ChatGPT (prompt engineering)

![chatGPT](./images/prompt_01.png "pregunta a chatGPT")
![chatGPT](./images/prompt_02.png "pregunta a chatGPT")
![chatGPT](./images/prompt_03.png "pregunta a chatGPT")

## Paso 3. Recopilar links individuales de cada caridad

In [5]:
base_url = "https://www.charityintelligence.ca/charity-profiles/a-z-charity-listing?start="
start_points = range(0, 860, 20)  # generates start points from 0 to 840

all_links = []

for start in start_points:
    url = f"{base_url}{start}"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        charity_links = soup.find_all('a', class_='title lnk')
        for link in charity_links:
            href = link.get('href')
            if href:
                all_links.append(f"https://www.charityintelligence.ca{href}")
    else:
        print(f"Failed to retrieve page with start={start}")

# Print all collected links
for link in all_links:
    print(link)

# Save the links to a file
with open("./data/charity_intelligence_links.txt", "w") as f:
    for link in all_links:
        f.write(link + "\n")

https://www.charityintelligence.ca/charity-details/856-360kids-support-services
https://www.charityintelligence.ca/charity-details/967-a-better-world-canada
https://www.charityintelligence.ca/charity-details/309-acadia-university
https://www.charityintelligence.ca/charity-details/154-acclaim-health
https://www.charityintelligence.ca/charity-details/1084-action-canada-for-sexual-health-rights
https://www.charityintelligence.ca/charity-details/542-adra-canada
https://www.charityintelligence.ca/charity-details/1-aga-khan-foundation-canada
https://www.charityintelligence.ca/charity-details/908-against-malaria-foundation-canada
https://www.charityintelligence.ca/charity-details/652-agence-ometz
https://www.charityintelligence.ca/charity-details/110-agincourt-community-services-association
https://www.charityintelligence.ca/charity-details/926-ahmadiyya-muslim-jama-at-canada
https://www.charityintelligence.ca/charity-details/692-alberta-adolescent-recovery-centre
https://www.charityintellige

## Paso 4: Investigar nuevamente las páginas

![Revisar código de cada elemento](./images/devtools_02.png "Revisar código de cada elemento")

## Paso 5: Solicitud a Chat GPT con una liga de prueba

Primero le doy el código que ya llevo (esto porque lo hice en un día diferente yes para darle contexto):
![chatGPT](./images/prompt_04.png "pregunta a chatGPT")
Luego le digo lo que necesito:
![chatGPT](./images/prompt_05.png "pregunta a chatGPT")
Luego copio el código y lo voy corrigiendo y complementando:
![chatGPT](./images/prompt_06.png "pregunta a chatGPT")

In [6]:
##############################################################################################################
######################## Éste es mi código sucio, lo deje como ejemplo #####################################
#############################################################################################################

# # Define the function to scrape details from the first charity link
# def scrape_charity_details(url):
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to retrieve charity page {url}")
#         return {}

#     soup = BeautifulSoup(response.content, 'html.parser')
#     charity_details = {}

#     # Extract the required fields
#     charity_details['charity'] = soup.find('h1', class_='sppb-addon-title').get_text(strip=True)
#     # charity_type = soup.find('div', id='sppb-addon-1551725103589').get_text(strip=True)
#     # print(charity_type)

#     # Find the div with the specific id
#     addon_div = soup.find('div', id='sppb-addon-1551725103589')

#     # Extract the text from the <a> tags
#     links = addon_div.find_all('a')

#     # Extract the text for each <a> tag
#     charity_details['type']  = links[0].get_text(strip=True)
#     charity_details['status']  = links[1].get_text(strip=True)

#     # Extract address and Charitable Registration
#     address_section = soup.find('div', id="column-wrap-id-1548969998232")
#     if address_section:
#         address_content = address_section.find('div', class_='sppb-addon-content').get_text(separator='\n').split('\n')
#         charity_details['address'] = ' '.join(address_content[:3]).strip()
#         charity_details['Charitable Registration'] = address_content[-1].strip().split(':')[-1].strip()

#     charity_details['stars'] = soup.find('span', class_='rating_stars').get('title').split(':')[-1].strip()
#     charity_details['FINANCIAL TRANSPARENCY_1'] = soup.find('p', style="font-weight: bolder;font-size: 35px;", align="center").get_text(strip=True)
#     charity_details['FINANCIAL TRANSPARENCY_2'] = soup.find('p', style="font-size: 12px;").get_text(strip=True)
#     charity_details['RESULTS REPORTING_1'] = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;;font-weight: bolder;align='center';font-size: 35px; color:rgb(192,0,0)").get_text(strip=True)
#     charity_details['RESULTS REPORTING_2'] = soup.find('p', style="font-size: 12px;").get_text(strip=True)
#     charity_details['DEMONSTRATED IMPACT_1'] = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;font-weight: bolder;align='center';font-size: 35px;").get_text(strip=True)
#     charity_details['DEMONSTRATED IMPACT_2'] = soup.find('p', style="font-size: 12px;").get_text(strip=True)
# ###### FALTA EL SVG #####
#     charity_details['NEED FOR FUNDING'] = soup.find('p', style="font-size: 12px;").get_text(strip=True)
#     charity_details['CENTS TO THE CAUSE_1'] = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;font-weight: bolder;align='center';font-size: 35px;").get_text(strip=True)
#     charity_details['CENTS TO THE CAUSE_2'] = soup.find('p', style="font-size: 12px;").get_text(strip=True)
#     overview = soup.find('div', id="sppb-addon-1549915418046").find_all('p')
#     charity_details['OVERVIEW'] = ' '.join([p.get_text(strip=True) for p in overview])
#     results_impact = soup.find('div', id='sppb-addon-1549916615776').find_all('p')
#     # print(results_impact)
#     charity_details['RESULTS AND IMPACT'] = ' '.join([p.get_text(strip=True) for p in results_impact])
#     finances = soup.find('div', id='column-wrap-id-1551726899554').find_all('p')
#     charity_details['FINANCES'] = ' '.join([p.get_text(strip=True) for p in finances])
#     # print(finances)
#     salary = soup.find('div', id='sppb-addon-1549916615773').find_all('p')
#     # charity_details['Salary Information'] = ' '.join([p.get_text(strip=True) for p in salary])
#     charity_details['Full-time staff']  = salary[0].find('span').get_text(strip=True) if len(salary) > 0 else ''
#     charity_details['Avg. compensation']  = salary[1].find('span').get_text(strip=True) if len(salary) > 1 else ''
#     # print(salary)

#     # Find the div containing the tables
#     stats_div = soup.find('div', id='stats')
    
#     # Extract the first table with class 'stats_table ratios'
#     table1 = stats_div.find('table', class_='stats_table ratios')
#     # print(table1)

#     # Extract the headers and data from the Financial Ratios table
#     headers1 = [th.get_text(strip=True) for th in table1.find_all('th')]
#     rows1 = table1.find_all('tr')[1:]  # Skip the header row
#     # print(headers1,rows1)
# ##############################
#     data1 = []
#     for row in rows1:
#         cols = row.find_all('td')
#         data_row = [col.get_text(strip=True) for col in cols]
#         data1.append(data_row)
#     # print(data1)
#     # print(data1[0][0] + " (" + headers1[0] +")")

#     for j in range(len(data1[0])):
#         # print(data1[j][0])
#         for i in range(len(headers1)):
#             # print(headers1[i])
#             # print(data1[j][0] + " (" + headers1[i] +")")
#             # print(data1[j][i+1])
#             column_name = data1[j][0] + " (" + headers1[i] +")"
#             # print(column_name)
#             charity_details[column_name] = data1[j][i+1]

#     # Extract the second table with class 'stats_table summary'
#     table2 = stats_div.find('table', class_='stats_table summary')
    
#     # Extract the headers and data from the Summary Financial Statements table
#     headers2 = [th.get_text(strip=True) for th in table2.find_all('th')]
#     rows2 = table2.find_all('tr')[1:]  # Skip the header row
#     # print(rows2)
    
#     data2 = []
#     for row in rows2:
#         cols = row.find_all('td')
#         data_row = [col.get_text(strip=True) for col in cols]
#         data2.append(data_row)
#     # print(data2[5][0])

#     for j in range(len(data2)):
#         # print(data2[j])
#         for i in range(len(headers2)):
#             # print(headers1[i])
#             # print(data1[j][0] + " (" + headers1[i] +")")
#             # print(data1[j][i+1])
#             column_name = data2[j][0] + " (" + headers2[i] +")"
#             # print(column_name)
#             charity_details[column_name] = data2[j][i+1]

#     table3 = soup.find('div', id='sppb-addon-1549916615773').find('table')
#     # print(table3)
    
#     # Initialize lists to hold the data
#     salary_ranges = []
#     counts = []

#     # Iterate over all rows in the table
#     for row in table3.find_all('tr'):
#         cells = row.find_all('td')
#         if len(cells) == 2:  # Ensure the row has the correct number of columns
#             salary_range = cells[0].get_text(strip=True)
#             count = cells[1].get_text(strip=True)
#             charity_details[salary_range] = count
#             salary_ranges.append(salary_range)
#             counts.append(count)
#     # print(salary_ranges, counts)

#     comments = soup.find('div', id='sppb-addon-1549917617861').find_all('p')
#     charity_details['COMMENTS'] = ' '.join([p.get_text(strip=True) for p in comments])
#     # print(comments)

#     # Find the div containing the contact info
#     contact_div = soup.find('div', id='sppb-addon-1549917617869').find('div', class_='sppb-addon-content')

#     # Extract and clean the contact info text
#     contact_info = contact_div.get_text(separator=' ', strip=True)

#     # Print the contact info
#     # print(contact_info)

#     # Split the contact info into parts
#     # Remove the unwanted string "This email address is being protected from spambots. You need JavaScript enabled to view it."
#     contact_info_cleaned = contact_info.replace("This email address is being protected from spambots. You need JavaScript enabled to view it.", "").strip()
#     charity_details['contact'] = contact_info_cleaned 



#     return charity_details


In [7]:
# # Scrape details for the first charity link
# first_charity_url = all_links[0]
# charity_data = scrape_charity_details(first_charity_url)

# # Create DataFrame
# df = pd.DataFrame([charity_data])

# # Save DataFrame to a CSV file
# df.to_csv('./data/charity_intelligence_details.csv', index=False)

# # Print DataFrame
# df

In [8]:
# df.columns

Este es mi codigo final sólo me falto el SVG

In [9]:
def scrape_charity_details(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve charity page {url}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')
    charity_details = {}

    # Extract the required fields
    charity_name_tag = soup.find('h1', class_='sppb-addon-title')
    if charity_name_tag:
        charity_details['charity'] = charity_name_tag.get_text(strip=True)

    addon_div = soup.find('div', id='sppb-addon-1551725103589')
    if addon_div:
        links = addon_div.find_all('a')
        if len(links) > 0:
            charity_details['type'] = links[0].get_text(strip=True)
        if len(links) > 1:
            charity_details['status'] = links[1].get_text(strip=True)

    address_section = soup.find('div', id="column-wrap-id-1548969998232")
    if address_section:
        address_content = address_section.find('div', class_='sppb-addon-content').get_text(separator='\n').split('\n')
        if len(address_content) >= 3:
            charity_details['address'] = ' '.join(address_content[:3]).strip()
        if len(address_content) > 0:
            charity_details['Charitable Registration'] = address_content[-1].strip().split(':')[-1].strip()

    rating_tag = soup.find('span', class_='rating_stars')
    if rating_tag:
        charity_details['stars'] = rating_tag.get('title').split(':')[-1].strip()

    financial_transparency_1 = soup.find('p', style="font-weight: bolder;font-size: 35px;", align="center")
    if financial_transparency_1:
        charity_details['FINANCIAL TRANSPARENCY_1'] = financial_transparency_1.get_text(strip=True)

    financial_transparency_2 = soup.find('p', style="font-size: 12px;")
    if financial_transparency_2:
        charity_details['FINANCIAL TRANSPARENCY_2'] = financial_transparency_2.get_text(strip=True)

    results_reporting_1 = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;;font-weight: bolder;align='center';font-size: 35px; color:rgb(192,0,0)")
    if results_reporting_1:
        charity_details['RESULTS REPORTING_1'] = results_reporting_1.get_text(strip=True)

    results_reporting_2 = soup.find('p', style="font-size: 12px;")
    if results_reporting_2:
        charity_details['RESULTS REPORTING_2'] = results_reporting_2.get_text(strip=True)

    demonstrated_impact_1 = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;font-weight: bolder;align='center';font-size: 35px;")
    if demonstrated_impact_1:
        charity_details['DEMONSTRATED IMPACT_1'] = demonstrated_impact_1.get_text(strip=True)

    demonstrated_impact_2 = soup.find('p', style="font-size: 12px;")
    if demonstrated_impact_2:
        charity_details['DEMONSTRATED IMPACT_2'] = demonstrated_impact_2.get_text(strip=True)

    need_for_funding = soup.find('p', style="font-size: 12px;")
    if need_for_funding:
        charity_details['NEED FOR FUNDING'] = need_for_funding.get_text(strip=True)

    cents_to_cause_1 = soup.find('p', style="font-family: 'Source Sans Pro', sans-serif;font-weight: bolder;align='center';font-size: 35px;")
    if cents_to_cause_1:
        charity_details['CENTS TO THE CAUSE_1'] = cents_to_cause_1.get_text(strip=True)

    cents_to_cause_2 = soup.find('p', style="font-size: 12px;")
    if cents_to_cause_2:
        charity_details['CENTS TO THE CAUSE_2'] = cents_to_cause_2.get_text(strip=True)

    overview = soup.find('div', id="sppb-addon-1549915418046").find_all('p')
    if overview:
        charity_details['OVERVIEW'] = ' '.join([p.get_text(strip=True) for p in overview])

    results_impact = soup.find('div', id='sppb-addon-1549916615776').find_all('p')
    if results_impact:
        charity_details['RESULTS AND IMPACT'] = ' '.join([p.get_text(strip=True) for p in results_impact])

    finances = soup.find('div', id='column-wrap-id-1551726899554').find_all('p')
    if finances:
        charity_details['FINANCES'] = ' '.join([p.get_text(strip=True) for p in finances])

    salary = soup.find('div', id='sppb-addon-1549916615773').find_all('p')
    if salary:
        charity_details['Full-time staff'] = salary[0].find('span').get_text(strip=True) if len(salary) > 0 else ''
        charity_details['Avg. compensation'] = salary[1].find('span').get_text(strip=True) if len(salary) > 1 else ''

    stats_div = soup.find('div', id='stats')
    if stats_div:
        table1 = stats_div.find('table', class_='stats_table ratios')
        if table1:
            headers1 = [th.get_text(strip=True) for th in table1.find_all('th')]
            rows1 = table1.find_all('tr')[1:]
            data1 = []
            for row in rows1:
                cols = row.find_all('td')
                data_row = [col.get_text(strip=True) for col in cols]
                data1.append(data_row)
            for j in range(len(data1)):
                for i in range(1, len(headers1)):
                    if i < len(data1[j]):
                        column_name = data1[j][0] + " (" + headers1[i] +")"
                        charity_details[column_name] = data1[j][i]

        table2 = stats_div.find('table', class_='stats_table summary')
        if table2:
            headers2 = [th.get_text(strip=True) for th in table2.find_all('th')]
            rows2 = table2.find_all('tr')[1:]
            data2 = []
            for row in rows2:
                cols = row.find_all('td')
                data_row = [col.get_text(strip=True) for col in cols]
                data2.append(data_row)
            for j in range(len(data2)):
                for i in range(1, len(headers2)):
                    if i < len(data2[j]):
                        column_name = data2[j][0] + " (" + headers2[i] +")"
                        charity_details[column_name] = data2[j][i]

    table3 = soup.find('div', id='sppb-addon-1549916615773').find('table')
    if table3:
        salary_ranges = []
        counts = []
        for row in table3.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:
                salary_range = cells[0].get_text(strip=True)
                count = cells[1].get_text(strip=True)
                charity_details[salary_range] = count
                salary_ranges.append(salary_range)
                counts.append(count)

    comments = soup.find('div', id='sppb-addon-1549917617861').find_all('p')
    if comments:
        charity_details['COMMENTS'] = ' '.join([p.get_text(strip=True) for p in comments])

    contact_div = soup.find('div', id='sppb-addon-1549917617869').find('div', class_='sppb-addon-content')
    if contact_div:
        contact_info = contact_div.get_text(separator=' ', strip=True)
        contact_info_cleaned = contact_info.replace("This email address is being protected from spambots. You need JavaScript enabled to view it.", "").strip()
        charity_details['contact'] = contact_info_cleaned

    return charity_details


## Paso 6: Crear un for

In [252]:
# Initialize a list to hold the data for all charities
all_charity_data = []

# Loop through all links and scrape details for each charity
for link in all_links:
    charity_data = scrape_charity_details(link)
    all_charity_data.append(charity_data)

# Create DataFrame
df = pd.DataFrame(all_charity_data)

Failed to retrieve charity page https://www.charityintelligence.ca/charity-details/257-uforchange


In [253]:
df

Unnamed: 0,charity,type,status,address,Charitable Registration,stars,FINANCIAL TRANSPARENCY_1,FINANCIAL TRANSPARENCY_2,RESULTS REPORTING_1,RESULTS REPORTING_2,...,Business activities (net) (2015),Business activities (net) (2014),Special events (2014),Other costs (2015),Investment income (2011),Other income (2011),Program costs - International (2011),Program costs - Canada (2011),Fundraising costs (2011),Donor-designated donations (2023)
0,360Kids Support Services,Social Services - At-Risk Youth,Operating Charity,"80F Centurian Drive, Suite 200 Markham, ON L3R...",89703 6620 RR0001,5/5,✔+,Audited financial statementsfor current and pr...,A-,Audited financial statementsfor current and pr...,...,,,,,,,,,,
1,A Better World Canada,International Aid,Operating Charity,"#206-5033 52nd Street Lacombe, AB T4L 2A6",80099 4006 RR0001,2/5,✖,Audited financial statementsavailable only thr...,B+,Audited financial statementsavailable only thr...,...,,,,,,,,,,
2,Acadia University,Education - University,Operating Charity,"15 University Avenue Wolfville, NS B4P 2R6",10668 1893 RR0001,3/5,✔+,Audited financial statementsfor current and pr...,B-,Audited financial statementsfor current and pr...,...,,,,,,,,,,
3,Acclaim Health,Health,Operating Charity,"2370 Speers Road Oakville, ON L6L 5M2",11928 4602 RR0001,2/5,✔+,Audited financial statementsfor current and pr...,C,Audited financial statementsfor current and pr...,...,,,,,,,,,,
4,Action Canada for Sexual Health & Rights,Health,Operating Charity,"240 Bank Street Ottawa, ON K2P 1X4",10784 8319 RR0001,3/5,✖,Audited financial statementsavailable only upo...,B+,Audited financial statementsavailable only upo...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,YWCA Calgary,Social Services - Women,Operating Charity,"1715 – 17 Avenue SE Calgary, AB T2G 5J1",10822 7927 RR0001,4/5,✔+,Audited financial statementsfor current and pr...,B,Audited financial statementsfor current and pr...,...,,,,,,,,,,
844,YWCA Metro Vancouver,Social Services - Women,Operating Charity,"535 Hornby Street Vancouver, BC V6C 2E8",10822 7943 RR0001,2/5,✔+,Audited financial statementsfor current and pr...,B,Audited financial statementsfor current and pr...,...,,,,,,,,,,
845,YWCA NWT,Social Services - Women,Operating Charity,"4904-54 Ave. Suite 104 Yellowknife, NT X1A 1H7",11930 7411 RR0001,2/5,✖,Audited financial statementsavailable only thr...,B-,Audited financial statementsavailable only thr...,...,,,,,,,,,,
846,YWCA Toronto,Social Services - Women,Operating Charity,"87 Elm Street Toronto, ON M5G 0A8",10822 9865 RR0001,3/5,✔+,Audited financial statementsfor current and pr...,B-,Audited financial statementsfor current and pr...,...,,,,,,,,,,


## Paso 7: Guardar a un CSV

In [254]:
# Save DataFrame to a CSV file
df.to_csv('./data/charity_intelligence_details.csv', index=False)