In [5]:
import re
import os

%pip install pandas
import pandas as pd

%pip install --upgrade pip

%pip install requests
import requests

%pip install bs4
from bs4 import BeautifulSoup

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
def urls_in_file(file_name):
    """ Takes an .eml file name as a string and returns a list of all unique urls linking to news sources """

    #reads the file and removes newlines and equal signs (which are placed at the end of each line)
    with open(file_name, 'r') as file:
        textfile = file.read().replace('\n', '').replace('=', '')
    
    #any strings without white spaces starting with 3Dhttps:// and ending with &amp, non-greedy matching
    valid_urls = re.findall('3D(https://\S*?)?&amp', textfile)
    
    #creates a list of each unique url (removes duplicates)
    unique_urls = list(set(valid_urls))
    
    return unique_urls

#demonstrates the function with the first file in the folder
urls_in_file('eml_files/1. El Salvador Mineria 1-94 eml/Google Alert - El Salvador minería_1.eml')

['https://www.laprensagrafica.com/opinion/Las-locuras-del-emperador-II-20250328-0081.html',
 'https://diario.elmundo.sv/el-mundo/un-pueblo-del-sur-de-honduras-podria-perder-su-historica-iglesia-por-la-explotacion-minera',
 'https://www.elsalvador.com/noticias/nacional/mercados-mercado-cuscatlan-alcaldia-de-san-salvador-centro-/1209302/2025/',
 'https://diario.elmundo.sv/nacionales/organizaciones-dicen-que-el-sistema-judicial-esta-a-prueba-tras-la-entrega-de-mas-de-59000-firmas-contra-de-la-mineria',
 'https://www.laprensagrafica.com/tendencias/Un-Pikachu-revolucionario-un-manifestante-disfrazado-de-Pokemon-se-unio-a-las-protestas-en-Turquia-20250329-0020.html',
 'https://www.elsalvador.com/opinion/editoriales/mineria-/1209390/2025/',
 'https://www.revistafactum.com/slm-grok/']

In [7]:
def urls_in_folder(folder_name):
    """ Takes a folder of .eml files as a string and returns a list of all unique urls contained in each of the 
    files in the folder 
"""
    
    #creates a list of all file names in folder_name
    filenames = os.listdir(folder_name)
    filenames = [os.path.join(folder_name, file) for file in filenames]
    
    #creates a list to store the unique urls found in each file using urls_in_file function
    all_urls = []
    for file in filenames:
        urls = urls_in_file(file)
        all_urls.extend(urls)
    
    return all_urls

#demonstrates the function with folder '1. El Salvador Mineria 1-94 eml'
urls_in_folder('eml_files/1. El Salvador Mineria 1-94 eml')[:7]

['https://www.elsalvador.com/noticias/nacional/gobierno-abusa-de-sus-facultades-dice-secretario-fmln/1192326/2025/',
 'https://diario1.com/zona-deportiva/2025/01/el-real-madrid-se-regala-una-goleada-en-cartagena-antes-de-la-supercopa/',
 'https://www.contrapunto.com.sv/muro-colapso-tras-sismo-en-san-salvador-y-dejo-a-una-persona-herida/',
 'https://www.laprensagrafica.com/elsalvador/El-Chino-Flores-secretario-del-FMLN-dice-que-reactivara-al-partido-20250107-0097.html',
 'https://lapagina.com.sv/deportes/ancelotti-se-lleva-a-arabia-a-cuatro-canteranos-y-a-alaba-para-buscar-el-tercer-titulo-de-la-temporada/',
 'https://www.elsalvador.com/opinion/editoriales/mineria-/1192178/2025/',
 'https://infodemia.com.sv/el-gobierno-declara-confidenciales-los-estudios-sobre-mineria-en-el-salvador']

In [15]:
def extract_website_details(url, cutoff=20):
    """
    Takes in a url as a string and returns the text content of the webpage, removing paragraphs shorter than cutoff words.
    Returns a list containing the title, date, and body text of the webpage.
    If the webpage cannot be accessed, returns a list of None values.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser') #creates a BeautifulSoup object from the webpage content

        #Extracts text from webpage and attempts to remove headers and footers
        extracted_text = soup.get_text(separator='\n', strip=True)
        split = extracted_text.split('\n') #split the text into a list divided every time there is a new line \n
        body = [paragraph for paragraph in split if len(paragraph.split()) > cutoff] #remove paragraphs with less than CUTOFF words
        body_text = ' '.join(body) #combine remaining text into a single string

        #Extracts title from webpage
        title = soup.title.string if soup.title else 'Title not found'

        #Extracts date from webpage, if available
        #This part of the function isn't working yet, so it always returns 'Date not found'
        date = soup.find("span", class_="post-date").get_text(strip=True) if soup.find("span", class_="post-date") else 'Date not found'
        
        #Creating a BeautifulSoup object takes significant time, so this function extracts all neccessary information in one go
        #which is more efficient than creating separate functions for title, date, and body text.

        #Returns a list containing the title, date, and body text
        return [title, date, body_text]
    
    # If the webpage cannot be accessed, return a list of None values to fill in the table
    # It may be helpful to later fill in the error values instead of None to troubleshoot different problems
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return [None, None, None]
    

#this used to be a separate function, but it is now integrated into extract_website_details
#def clean_extracted_text(text, cutoff=20):
    """Cleans the text extracted using extract_website_text and removes short paragraphs (headers, footers, etc...) based on word count cutoff"""
    if text is None or text == [None, None, None]:
        return [None, None, None]
    else: 
        split = text[2].split('\n') #split the text into a list divided every time there is a new line \n
        body = [paragraph for paragraph in split if len(paragraph.split()) > cutoff] #remove paragraphs with less than CUTOFF words.
        body_text = ' '.join(body) #combine remaining text into a single string
        return [text[0], text[1], body_text]

#demonstrates the extract_website_details function with an example url
example_text = extract_website_details('https://www.contrapunto.com.sv/muro-colapso-tras-sismo-en-san-salvador-y-dejo-a-una-persona-herida/')
example_text

['Muro colapsó tras sismo en San Salvador y dejó a una persona herida',
 'Date not found',
 'Una persona quedó atrapada tras el colapso de un muro en San Salvador, provocado por un sismo. Fue rescatada y atendida por Cruz Verde Salvadoreña. Una persona quedó atrapada bajo los escombros de un muro que colapsó en la colonia Las Flores, calle Agua Caliente, San Salvador Este, tras el sismo registrado la tarde del lunes 5 de enero a las 5:37 p. m.; La víctima, cuya identidad no ha sido revelada, fue localizada por un residente del lugar y posteriormente rescatada por elementos de Cruz Verde Salvadoreña, quienes le brindaron primeros auxilios antes de trasladarla a un centro asistencial. Este sismo fue una de las réplicas del movimiento telúrico de magnitud 6.3 que ocurrió frente a la costa de La Paz a las 11:18 a.m. del domingo 5 de enero. Según el Ministerio de Medio Ambiente, desde entonces se han registrado 147 réplicas, de las cuales 21 han sido sentidas por la población. Las autoridad

In [16]:
def urls_to_table(urls):
    """Extracts text list of urls returns a table filled with urls, article titles, and body text."""
    
    url_title_date_text = [extract_website_details(url) for url in urls]

    cleaned_texts = [text[2] for text in url_title_date_text]
    dates = [text[1] for text in url_title_date_text]
    titles = [text[0] for text in url_title_date_text]

    #[clean_extracted_text(text, section='header') for text in raw_url_text]
    #cleaned_texts = [clean_extracted_text(text, section='body') for text in raw_url_text]

    extracted_info = pd.DataFrame({
        'url': urls,
        'title': titles,
        'date': dates,
        'body': cleaned_texts
    })
    
    return extracted_info

#demonstrates the function with the first file in the folder
urls_1 = urls_in_file('eml_files/Google Alert - _Bitcoin_ _El Salvador__1.eml')
mineria_1 = urls_to_table(urls_1)
mineria_1

Error fetching https://tradersunion.com/news/editors-picks/show/177448-at-the-starting-line/: 403 Client Error: Forbidden for url: https://tradersunion.com/news/editors-picks/show/177448-at-the-starting-line/
Error fetching https://cointelegraph.com/learn/articles/bitcoin-adoption-by-country: 403 Client Error: Forbidden for url: https://cointelegraph.com/learn/articles/bitcoin-adoption-by-country


Unnamed: 0,url,title,date,body
0,https://news.bitcoin.com/first-tokenized-wareh...,First Tokenized Warehouse Complex Built in El ...,Date not found,
1,https://www.cryptoninjas.net/news/new-legislat...,New Legislation Introduced in Panama to Turn t...,Date not found,Panama is set to introduce a comprehensive dra...
2,https://www.novinite.com/articles/231545/The%2...,The Role of Cryptocurrency in Developing Econo...,Date not found,"Money, for most people, is something they don’..."
3,https://blockchain.news/flashnews/trump-to-mee...,Trump to Meet El Salvador Leader at White Hous...,Date not found,"According to @rovercrc, former President Trump..."
4,https://crypto.news/trump-plans-white-house-vi...,Trump plans White House visit for El Salvador’...,Date not found,"by Bloomberg, follows Bukele’s agreement to de..."
5,https://coincentral.com/trump-to-host-nayib-bu...,Trump to Host Nayib Bukele Following Deportati...,Date not found,Trump plans to host El Salvador's President Bu...
6,https://tradersunion.com/news/editors-picks/sh...,,,
7,https://www.crypto-reporter.com/press-releases...,"Bukele Rejects IMF, Keeps Buying BTC, and FXGu...",Date not found,El Salvador’s President Nayib Bukele made news...
8,https://cointelegraph.com/learn/articles/bitco...,,,
9,https://cryptobriefing.com/bitcoin-meeting-whi...,Trump plans to meet with Bitcoin bull Nayib Bu...,Date not found,The two pro-Bitcoin leaders have maintained co...


In [17]:
def alert_folder_to_table(file_path):
    """Extracts text from all .eml files in a folder and returns a table filled with urls, article titles, and body text."""
    
    urls = [url for url in urls_in_folder(file_path)]

    return urls_to_table(urls)

#demonstrates the function with the full mineria google alert folder
example_table = alert_folder_to_table('eml_files/1. El Salvador Mineria 1-94 eml')
example_table

Error fetching https://diario1.com/zona-deportiva/2025/01/el-real-madrid-se-regala-una-goleada-en-cartagena-antes-de-la-supercopa/: 403 Client Error: Forbidden for url: https://diario1.com/zona-deportiva/2025/01/el-real-madrid-se-regala-una-goleada-en-cartagena-antes-de-la-supercopa/
Error fetching https://www.laprensagrafica.com/elsalvador/El-Chino-Flores-secretario-del-FMLN-dice-que-reactivara-al-partido-20250107-0097.html: 403 Client Error: Forbidden for url: https://www.laprensagrafica.com/elsalvador/El-Chino-Flores-secretario-del-FMLN-dice-que-reactivara-al-partido-20250107-0097.html
Error fetching https://dev.elmundo.sv/politica/salvadorenos-califican-con-773-el-trabajo-de-nayib-bukele-segun-encuesta-ufg: 404 Client Error: Not Found for url: https://dev.elmundo.sv/politica/salvadorenos-califican-con-773-el-trabajo-de-nayib-bukele-segun-encuesta-ufg
Error fetching https://dev.elmundo.sv/politica/encuesta-ufg-un-4833-cree-que-el-pais-no-tiene-las-condiciones-para-la-mineria-metalic

Unnamed: 0,url,title,date,body
0,https://www.elsalvador.com/noticias/nacional/g...,"Gobierno “abusa” de sus facultades, dice secre...",Date not found,"El periodismo que hacemos requiere tiempo, esf..."
1,https://diario1.com/zona-deportiva/2025/01/el-...,,,
2,https://www.contrapunto.com.sv/muro-colapso-tr...,Muro colapsó tras sismo en San Salvador y dejó...,Date not found,Una persona quedó atrapada tras el colapso de ...
3,https://www.laprensagrafica.com/elsalvador/El-...,,,
4,https://lapagina.com.sv/deportes/ancelotti-se-...,Ancelotti se lleva a Arabia a cuatro canterano...,Date not found,Ancelotti se lleva a Arabia a cuatro canterano...
...,...,...,...,...
709,https://diario.elmundo.sv/nacionales/creemos-q...,"""Creemos que la dignidad del gremio se ha veni...",Date not found,El candidato a presidente del Colegio Médico d...
710,https://lapagina.com.sv/nacionales/cnn-destaca...,CNN destaca a Bukele como modelo político; pre...,Date not found,"El presidente de El Salvador, Nayib Bukele, re..."
711,https://www.elsalvador.com/entretenimiento/cul...,El Salvador y el Imperio del Manchukuo - Notic...,Date not found,"El periodismo que hacemos requiere tiempo, esf..."
712,https://www.elsalvador.com/fotogalerias/mineri...,"""Lo peor que podemos hacer es quedarnos callad...",Date not found,"""Lo peor que podemos hacer es quedarnos callad..."


In [19]:
#exports the table to a csv file
example_table.to_csv('export_csv_files/example_table', index=False)