In [1]:
path = 'data/emails.txt'
# Read the dump as raw bytes first, then decode leniently: with
# errors='replace' any byte sequence that is not valid UTF-8 becomes U+FFFD
# instead of raising UnicodeDecodeError.
with open(path, 'rb') as f:
    raw_bytes = f.read()
text = raw_bytes.decode('utf-8', errors='replace')

In [2]:
import re

def remove_special_characters(dirty_text):
    """Delete every character that is not on a fixed whitelist.

    The whitelist is the negated character class below: ASCII letters and
    digits, whitespace, and a set of common ASCII punctuation marks
    (including quotes, brackets, the backtick and the hyphen). Anything else,
    such as accented letters, typographic dashes or ``~``, is removed
    outright rather than replaced.
    """
    # Negated class: a character is removed when it matches none of the entries.
    disallowed = r'[^a-zA-Z0-9\s.,?!:;@#$%^&*()_+={}[\]|\\<>/"\'`-]'
    return re.sub(disallowed, '', dirty_text, flags=re.UNICODE)

In [3]:
def remove_minute_read(dirty_text):
    """Remove reading-time tags such as "(5 minute read)" from the text.

    Matching is case-insensitive. The surrounding parentheses, the leading
    number and the whitespace are all optional in the pattern, so a bare
    "minute read" (or "minuteread") is removed as well.
    """
    tag_regex = r'\(?\d*\s*minute\s*read\)?'
    return re.compile(tag_regex, re.IGNORECASE).sub('', dirty_text)

In [4]:
import quopri

def decode_quoted_printable(coded_text):
    """Decode quoted-printable escapes in a string and return a string.

    The text is encoded to UTF-8 bytes, passed through
    ``quopri.decodestring`` and decoded back from UTF-8. Both conversions use
    ``errors='replace'``, so characters or bytes that cannot be converted are
    substituted instead of raising.
    """
    raw = coded_text.encode('utf-8', errors='replace')
    unquoted = quopri.decodestring(raw)
    return unquoted.decode('utf-8', errors='replace')

In [5]:
def remove_square_brackets_urls(dirty_text):
    """Remove every ``[...]`` group, brackets included.

    The match is non-greedy and ``.`` does not cross newlines here, so only
    bracket pairs that open and close on the same line are removed.
    """
    bracketed = re.compile(r'\[.*?\]')
    return bracketed.sub('', dirty_text)

In [6]:
def remove_blank_lines(dirty_text):
    """Drop empty and whitespace-only lines and trim the remaining ones.

    The text is split with ``str.splitlines``; each surviving line has its
    leading and trailing whitespace stripped, and the lines are re-joined
    with a single ``\\n``.
    """
    kept = []
    for raw_line in dirty_text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return '\n'.join(kept)

In [7]:
def cleanup_text(dirty_text):
    """Run the cleaning steps over the text, in a fixed order.

    Order: quoted-printable decoding, special-character removal, removal of
    ``[...]`` groups, removal of "minute read" tags, and finally blank-line
    removal. Each step receives the output of the previous one.
    """
    pipeline = (
        decode_quoted_printable,
        remove_special_characters,
        remove_square_brackets_urls,
        remove_minute_read,
        remove_blank_lines,
    )
    clean_text = dirty_text
    for step in pipeline:
        clean_text = step(clean_text)
    return clean_text

# Run the full cleaning pipeline on the loaded text. `text` is rebound to the
# cleaned result, so re-running this cell feeds already-cleaned text back in.
text = cleanup_text(text)

In [8]:
def extract_sections(text, start_section, end_section):
    """Return every span of text found between two marker strings.

    Both markers are matched literally (they are passed through
    ``re.escape``) and case-insensitively. The captured span is the shortest
    run of characters, newlines included, that follows ``start_section`` and
    is immediately followed by ``end_section``. The end marker is only looked
    ahead at, so it is not part of the result and is not consumed.

    Returns a list of the captured strings, empty when nothing matches.
    """
    template = r'(?i){}(.*?)(?={})'
    start_re = re.escape(start_section)
    end_re = re.escape(end_section)
    section_re = re.compile(template.format(start_re, end_re), re.MULTILINE | re.DOTALL)
    return section_re.findall(text)

In [9]:
# Section headings used as delimiters. Each heading is paired with the one
# that follows it in this list.
arguments = [
    "Big Tech & Startups",
    "Science & Futuristic Technology",
    "Programming, Design & Data Science",
    "Miscellaneous"
]

text_by_argument = {}

# The last heading only serves as the end delimiter of the one before it,
# so it gets no entry of its own in the dictionary.
for i, argument in enumerate(arguments[:-1]):
    next_argument = arguments[i + 1]
    text_by_argument[argument] = extract_sections(text, argument, next_argument)

# Report how many spans were captured for each heading.
for section, content in text_by_argument.items():
    print(f"{section}: {len(content)} sections extracted")

Big Tech & Startups: 240 sections extracted
Science & Futuristic Technology: 251 sections extracted
Programming, Design & Data Science: 266 sections extracted


In [10]:
def print_first_10_sections(sections):
    """Print at most the first ten items of ``sections``, one per ``print`` call."""
    head = sections[:10]
    for entry in head:
        print(entry)

# Print the first 10 extracted sections for the first heading in `arguments`
# ("Big Tech & Startups").
print_first_10_sections(text_by_argument[arguments[0]])


BIG NEWS FROM MICROSOFT CTO ANDREAS BRAUN
Microsoft CTO Andreas Braun announced that GPT-4 will be released as
early as next week. The new model will be multimodal and will have
video capabilities. The announcement was made during the 'AI in Focus
- Digital Kickoff' hybrid event held on March 9 where Microsoft
Germany employees presented LLMs like the GPT series.
GOOGLE DUSTS OFF THE FAILED GOOGLE+ PLAYBOOK TO FIGHT CHATGPT
Google was threatened by Facebook's rise to success in 2011. This
prompted the company to issue a decree to its employees to build
social features into everything, with their bonuses tied to Google's
social success. The decree resulted in ham-fisted social integrations
across Google that users despised, such as YouTube comments being tied
to Google+ and the real name policy. Just like with the company's
social panic, its current ChatGPT panic may see employees' ratings and
reviews for promotions influenced by their ability to integrate
artificial intelligence into 