In [2]:
import re
import os

#%pip install pandas
import pandas as pd

#%pip install --upgrade pip

#%pip install requests
import requests

#%pip install bs4
from bs4 import BeautifulSoup


#This is how I imported the library I've been using to extract keywords (rake-nltk).
#I have not been able to get it to work yet (see the extract_keywords_from_text function).
#If anyone has suggestions or other libraries that might work better, please let me know!

#The SSL workaround further down was copied from https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed
#It corrects a certificate error that sometimes occurs when downloading NLTK data.

#%pip install rake-nltk
from rake_nltk import Rake
import nltk
import ssl

# Work around "SSL: CERTIFICATE_VERIFY_FAILED" errors raised by nltk.download().
# NOTE(review): this replaces ssl's default HTTPS context factory for the whole
# kernel, so certificate verification is disabled for any later code that relies
# on that default (not only NLTK). Prefer fixing the local certificate store.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; leave ssl untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download only the NLTK resources rake-nltk relies on (stop-word lists and the
# sentence tokenizer). A bare nltk.download() opens the interactive downloader,
# which blocks "Restart Kernel -> Run All" until it is closed by hand.
nltk.download('stopwords')
nltk.download('punkt_tab')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/erindominguez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
def urls_in_file(file_name):
    """Return the unique news-article URLs found in a Google Alert .eml file.

    Parameters
    ----------
    file_name : str
        Path to the .eml file.

    Returns
    -------
    list of str
        Each distinct URL once, in the order it first appears in the file.
        The order is deterministic, so indexing the result is reproducible.
    """
    # Read as UTF-8 regardless of the platform default; undecodable bytes are
    # replaced rather than raising, since only the ASCII URLs matter here.
    with open(file_name, 'r', encoding='utf-8', errors='replace') as file:
        # Dropping newlines and '=' re-joins lines that were wrapped with a trailing
        # '=' and turns the '=3D' escape into a bare '3D' marker in front of each URL.
        # NOTE: any literal '=' inside a target URL is removed as well.
        textfile = file.read().replace('\n', '').replace('=', '')

    # Non-greedy match of a whitespace-free https URL that follows '3D' and is
    # terminated by '&amp'. The group is mandatory, so a bare '3D&amp' no longer
    # contributes an empty-string "URL".
    valid_urls = re.findall(r'3D(https://\S*?)&amp', textfile)

    # dict.fromkeys removes duplicates while keeping first-seen order
    # (a set would return the URLs in an arbitrary, run-dependent order).
    return list(dict.fromkeys(valid_urls))

#demonstrates the function with the first file in the folder
#urls_in_file('eml_files/1. El Salvador Mineria 1-94 eml/Google Alert - El Salvador minería_1.eml')

In [4]:
def urls_in_folder(folder_name):
    """Return the unique URLs found across all .eml files in a folder.

    Parameters
    ----------
    folder_name : str
        Path to a folder containing Google Alert .eml files.

    Returns
    -------
    list of str
        Every distinct URL once, in first-seen order. Files are visited in
        sorted name order so the result is the same on every run.
    """
    # Only regular files ending in .eml are read: os.listdir also returns
    # sub-folders and unrelated files (for example .DS_Store), which
    # urls_in_file cannot open or decode.
    eml_paths = []
    for name in sorted(os.listdir(folder_name)):
        path = os.path.join(folder_name, name)
        if name.lower().endswith('.eml') and os.path.isfile(path):
            eml_paths.append(path)

    # Collect the URLs of every file, then drop URLs repeated in several alerts
    # so each article is listed (and later downloaded) only once.
    all_urls = []
    for path in eml_paths:
        all_urls.extend(urls_in_file(path))

    return list(dict.fromkeys(all_urls))

#demonstrates the function with folder '1. El Salvador Mineria 1-94 eml'
#urls_in_folder('eml_files/1. El Salvador Mineria 1-94 eml')[:7]

In [5]:
def extract_keywords_from_eml(file_name):
    """
    Return the keywords of the Google Alert that generated an .eml file.

    The keywords are read from the message's Subject header rather than from the
    file name, so the function does not depend on how the files are named.

    Both subject styles are supported:
        Subject: Google Alert - "keyword1" "keyword2" "keyword3"
        Subject: =?UTF-8?Q?Google_Alert_=2D_El_Salvador_miner=C3=ADa?=
    The second (encoded) form is decoded by the standard-library email parser;
    an unquoted subject such as the Spanish example yields a single keyword.

    Parameters
    ----------
    file_name : str
        Path to the .eml file.

    Returns
    -------
    list of str
        The keywords with surrounding quotes removed. An empty list is returned
        when the file has no "Google Alert - ..." subject.
    """
    # Function-scope imports keep this definition usable on its own.
    from email import policy
    from email.parser import BytesParser

    # Parse only the headers; policy.default decodes encoded words and unfolds
    # subjects that were wrapped over several lines.
    with open(file_name, 'rb') as file:
        message = BytesParser(policy=policy.default).parse(file, headersonly=True)

    subject = message['subject']
    if subject is None:
        return []

    match = re.search(r'Google Alert - (.*)', str(subject))
    if match is None:
        return []

    # Quoted keywords are separated by '" "'; strip the remaining quote characters.
    keyword_block = match.group(1).strip()
    keywords = [
        keyword.replace('"', '').replace("'", '').strip()
        for keyword in keyword_block.split('" "')
    ]

    return [keyword for keyword in keywords if keyword]

#Demonstrates extract_keywords_from_eml with 'Google Alert - _Bitcoin_ _El Salvador__1.eml'.
#The path is relative to the notebook's working directory, which must contain 'eml_files'.
extract_keywords_from_eml('eml_files/Google Alert - _Bitcoin_ _El Salvador__1.eml')

['Bitcoin', 'El Salvador']

In [6]:
def _find_publication_date(soup):
    """Return the publication date found in a parsed page, or None if there is none."""
    # Site-specific marker that the notebook looked for originally.
    post_date = soup.find("span", class_="post-date")
    if post_date is not None:
        text = post_date.get_text(strip=True)
        if text:
            return text

    # Open Graph metadata: <meta property="article:published_time" content="...">
    meta = soup.find("meta", attrs={"property": "article:published_time"})
    if meta is not None and meta.get("content"):
        return meta["content"].strip()

    return None


def extract_website_details(url, cutoff=20, timeout=15):
    """
    Download a webpage and return its title, publication date and body text.

    Parameters
    ----------
    url : str
        Address of the page to download.
    cutoff : int, optional
        Lines of page text with `cutoff` words or fewer are dropped; this is a
        rough way of removing headers, footers and menus.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single unresponsive site can stall the whole run.

    Returns
    -------
    list
        [title, date, body_text]. The title falls back to 'Title not found' and
        the date to 'Date not found' when the page does not provide them.
        [None, None, None] is returned when the page cannot be fetched
        (connection error, timeout or HTTP error status).
    """
    # Only the download itself can raise requests.RequestException.
    # It may be helpful to later return the error instead of None values
    # to troubleshoot different problems.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return [None, None, None]

    # Building the BeautifulSoup object is the expensive step, so the title, date
    # and body are all extracted from the same object in one pass.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Body: one entry per line of visible text, keeping only the long ones.
    extracted_text = soup.get_text(separator='\n', strip=True)
    body = [
        paragraph
        for paragraph in extracted_text.split('\n')
        if len(paragraph.split()) > cutoff
    ]
    body_text = ' '.join(body)

    # Title: get_text also covers an empty <title> or one with nested markup,
    # for which .string would be None.
    title = soup.title.get_text(strip=True) if soup.title else ''
    if not title:
        title = 'Title not found'

    # Date: best effort; many sites expose it in other ways not covered here.
    date = _find_publication_date(soup) or 'Date not found'

    return [title, date, body_text]
    

#this used to be a separate function, but it is now integrated into extract_website_details
#def clean_extracted_text(text, cutoff=20):
    #"""Cleans the text extracted using extract_website_text and removes short paragraphs (headers, footers, etc...) based on word count cutoff"""
    #if text is None or text == [None, None, None]:
    #    return [None, None, None]
    #else: 
    #    split = text[2].split('\n') #split the text into a list divided every time there is a new line \n
    #    body = [paragraph for paragraph in split if len(paragraph.split()) > cutoff] #remove paragraphs with less than CUTOFF words.
    #    body_text = ' '.join(body) #combine remaining text into a single string
    #    return [text[0], text[1], body_text]

#Demonstrates the extract_website_details function with an example url.
#This performs a live HTTP request, so the output depends on the site being reachable.
example_text = extract_website_details('https://www.contrapunto.com.sv/muro-colapso-tras-sismo-en-san-salvador-y-dejo-a-una-persona-herida/')
example_text

['Muro colapsó tras sismo en San Salvador y dejó a una persona herida',
 'Date not found',
 'Una persona quedó atrapada tras el colapso de un muro en San Salvador, provocado por un sismo. Fue rescatada y atendida por Cruz Verde Salvadoreña. Una persona quedó atrapada bajo los escombros de un muro que colapsó en la colonia Las Flores, calle Agua Caliente, San Salvador Este, tras el sismo registrado la tarde del lunes 5 de enero a las 5:37 p. m.; La víctima, cuya identidad no ha sido revelada, fue localizada por un residente del lugar y posteriormente rescatada por elementos de Cruz Verde Salvadoreña, quienes le brindaron primeros auxilios antes de trasladarla a un centro asistencial. Este sismo fue una de las réplicas del movimiento telúrico de magnitud 6.3 que ocurrió frente a la costa de La Paz a las 11:18 a.m. del domingo 5 de enero. Según el Ministerio de Medio Ambiente, desde entonces se han registrado 147 réplicas, de las cuales 21 han sido sentidas por la población. Las autoridad

In [7]:
def urls_to_table(urls):
    """Download every url and gather the results in a table.

    Returns a DataFrame with one row per url and the columns
    'url', 'title', 'date' and 'body', filled from extract_website_details.
    """
    # Fetch all pages first; each entry is a [title, date, body] list.
    page_details = []
    for link in urls:
        page_details.append(extract_website_details(link))

    # Split the per-page lists into one list per column.
    titles, dates, bodies = [], [], []
    for details in page_details:
        bodies.append(details[2])
        dates.append(details[1])
        titles.append(details[0])

    columns = {
        'url': urls,
        'title': titles,
        'date': dates,
        'body': bodies,
    }
    return pd.DataFrame(columns)

#Demonstrates urls_to_table with the urls found in the Bitcoin / El Salvador alert file.
#Every url is downloaded, so this cell needs network access.
#NOTE(review): the table is named mineria_1 although its source is the Bitcoin alert file - consider renaming.
urls_1 = urls_in_file('eml_files/Google Alert - _Bitcoin_ _El Salvador__1.eml')
mineria_1 = urls_to_table(urls_1)
mineria_1

Unnamed: 0,url,title,date,body
0,https://blockchain.news/flashnews/trump-to-mee...,Trump to Meet El Salvador Leader at White Hous...,Date not found,"According to @rovercrc, former President Trump..."
1,https://tradersunion.com/news/editors-picks/sh...,,,
2,https://cointelegraph.com/learn/articles/bitco...,,,
3,https://www.novinite.com/articles/231545/The%2...,The Role of Cryptocurrency in Developing Econo...,Date not found,"Money, for most people, is something they don’..."
4,https://www.crypto-reporter.com/press-releases...,"Bukele Rejects IMF, Keeps Buying BTC, and FXGu...",Date not found,El Salvador’s President Nayib Bukele made news...
5,https://crypto.news/trump-plans-white-house-vi...,Trump plans White House visit for El Salvador’...,Date not found,"by Bloomberg, follows Bukele’s agreement to de..."
6,https://news.bitcoin.com/first-tokenized-wareh...,First Tokenized Warehouse Complex Built in El ...,Date not found,
7,https://cryptobriefing.com/bitcoin-meeting-whi...,Trump plans to meet with Bitcoin bull Nayib Bu...,Date not found,The two pro-Bitcoin leaders have maintained co...
8,https://coincentral.com/trump-to-host-nayib-bu...,Trump to Host Nayib Bukele Following Deportati...,Date not found,Trump plans to host El Salvador's President Bu...
9,https://www.cryptoninjas.net/news/new-legislat...,New Legislation Introduced in Panama to Turn t...,Date not found,Panama is set to introduce a comprehensive dra...


In [None]:
def alert_folder_to_table(file_path):
    """Build the url / title / date / body table for a whole folder of .eml files.

    file_path is the folder to scan; the urls come from urls_in_folder and the
    table is produced by urls_to_table.
    """
    # Copy the urls into a fresh list before handing them to urls_to_table.
    folder_urls = list(urls_in_folder(file_path))
    return urls_to_table(folder_urls)

#Demonstrates alert_folder_to_table with the full mineria Google Alert folder.

#Takes 12-18 minutes to run (every url is downloaded), so it may be helpful to comment this out.
example_table = alert_folder_to_table('eml_files/1. El Salvador Mineria 1-94 eml')
example_table[:3]

Unnamed: 0,url,title,date,body
0,https://www.elsalvador.com/opinion/editoriales...,Un equipo técnico–científico - Noticias de El ...,Date not found,"El periodismo que hacemos requiere tiempo, esf..."
1,https://lapagina.com.sv/deportes/ancelotti-se-...,Ancelotti se lleva a Arabia a cuatro canterano...,Date not found,Ancelotti se lleva a Arabia a cuatro canterano...
2,https://diario1.com/zona-deportiva/2025/01/el-...,,,


In [9]:
#exports example_table to a csv file
#example_table.to_csv('export_csv_files/example_table.csv', index=False)

In [10]:
def extract_keywords_from_text(text, language='english', ignore=None):
    """
    Extract key phrases from a text with RAKE, leaving out the phrases in `ignore`.

    Parameters
    ----------
    text : str
        Text to analyse. None, NaN, empty or whitespace-only input (for example
        the body of a page that could not be downloaded) gives an empty list.
    language : str, optional
        Language passed to Rake; it should match the language of `text`.
    ignore : str or iterable of str, optional
        Keywords to leave out of the result, e.g. the search terms of the alert.
        A phrase is dropped when it equals an ignored keyword, compared
        case-insensitively.

    Returns
    -------
    list of str
        All phrases returned by Rake.get_ranked_phrases() that are not ignored.
        NOTE(review): this appears to be every candidate phrase, best first, so
        callers probably want only the first few entries - confirm against the
        rake-nltk documentation.
    """
    # Nothing to analyse; also avoids handing a non-string to the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return []

    # Normalise `ignore` into a set of lower-case phrases (a lone string counts
    # as one keyword). None replaces the former mutable default argument.
    if ignore is None:
        ignore = []
    elif isinstance(ignore, str):
        ignore = [ignore]
    ignored = {keyword.strip().lower() for keyword in ignore}

    rake = Rake(language=language)
    rake.extract_keywords_from_text(text)
    top_phrases = rake.get_ranked_phrases()

    if ignored:
        top_phrases = [
            phrase for phrase in top_phrases
            if phrase.strip().lower() not in ignored
        ]
    return top_phrases

#Demonstrates extract_keywords_from_text on one article from the Bitcoin / El Salvador alert file.
#Index 4 assumes the file yields at least five urls; the page is downloaded live.
trial_url = urls_in_file('eml_files/Google Alert - _Bitcoin_ _El Salvador__1.eml')[4]
trial_text = extract_website_details(trial_url)[2]
trial_keywords = extract_keywords_from_text(trial_text)
trial_keywords

['fxguys lets users trade without checking identity',
 'national asset built trust among',
 '1 listing price ozak ai',
 'president nayib bukele made news',
 'framework helps investors earn regularly',
 'yield real financial gains rather',
 'high growth potential besides bitcoin',
 'one main trait distinguishing fxguys',
 'fxguys users get immediate benefits',
 'investors seek options beyond long',
 'profits plus broker trading revenue',
 'lets users earn rewards',
 'fxguys gives traders direct chances',
 'top prop trading companies',
 'fxg tokens gives users',
 'fxguys platform gives access',
 'investors direct financial rewards',
 'price rises like bitcoin',
 'prop trading funding program',
 'offers clear financial growth',
 'rewards users actively',
 'every trade made',
 'fxguys gives traders',
 'several trade terminals',
 'international monetary fund',
 'fxg token costs',
 'el salvador sticks',
 'direct money gain',
 'demand plus liquidity',
 'trade2earn method gives',
 'el salvador