In [29]:
import newspaper
import nltk
from textblob import TextBlob
from datasets import load_dataset
from pathlib import Path
import os
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import polars as pl
from multiprocessing import Pool, cpu_count
import time
import operator
from functools import partial

In [3]:
# Root folders holding the scraped HTML pages, one entry per news source.
# Each entry is followed by a comma: adjacent string literals without one are
# concatenated by Python, which silently merges two paths into a single bogus one.
folders = [
    r'C:\Users\Joshh\Projects\Stocks\Data\Webscraping\News Scaper\ICInsider',
    r'C:\Users\Joshh\Projects\Stocks\Data\Webscraping\News Scaper\JamaicaGleaner',
    r'C:\Users\Joshh\Projects\Stocks\Data\Webscraping\News Scaper\JIS',
    r'C:\Users\Joshh\Projects\Stocks\Data\Webscraping\News Scaper\JamaicaObserver\business',
]

In [12]:
# Two equally sized operand lists, long enough that the timing difference
# between the approaches is not lost in measurement noise.
a = [*range(1, 100001)]
b = a.copy()

# Every benchmark is repeated this many times; each run's duration is
# appended to the matching list below.
num_iterations = 50
mapt, loopt = [], []

In [13]:
# Measure the performance of the map function.
# perf_counter() is the clock intended for timing short intervals: it is
# monotonic and uses the highest resolution available, whereas time() is a
# wall clock that can be coarse and can be adjusted while the benchmark runs.
for _ in range(num_iterations):
    t1 = time.perf_counter()
    result = list(map(operator.mul, a, b))
    t2 = time.perf_counter()
    mapt.append(t2 - t1)

In [14]:
# Measure the performance of the for loop.
# The indexed comprehension is kept as-is because it is the thing being
# measured; only the clock changes. perf_counter() is monotonic and
# high-resolution, which time() does not guarantee for intervals this short.
for _ in range(num_iterations):
    t1 = time.perf_counter()
    result = [a[i] * b[i] for i in range(len(a))]
    t2 = time.perf_counter()
    loopt.append(t2 - t1)

In [30]:
# Map Function Implementation
def process_file_with_article(file, shared_article):
    """Load one saved HTML page into ``shared_article`` and return its text.

    ``shared_article`` is mutated in place: its ``html`` and
    ``download_state`` attributes are overwritten, then ``parse()`` and
    ``nlp()`` are called on it, in that order.

    Args:
        file: Path of the HTML file; read as UTF-8 with undecodable bytes
            ignored.
        shared_article: Article-like object that is reused across calls.

    Returns:
        The value of ``shared_article.text`` after ``parse()`` and ``nlp()``.
    """
    # Feed the page source straight from disk instead of downloading it.
    with open(file, 'r', encoding='utf-8', errors='ignore') as handle:
        shared_article.html = handle.read()

    # Presumably 2 marks the article as successfully downloaded so that
    # parse() accepts it -- TODO confirm against newspaper's download states.
    shared_article.download_state = 2

    shared_article.parse()
    shared_article.nlp()
    return shared_article.text

def get_sentiment_map(files, limit=100):
    """Extract article text from saved HTML pages using ``map``.

    A single ``newspaper.Article`` is created and passed to
    ``process_file_with_article`` for every file.

    Args:
        files: Sequence of HTML file paths.
        limit: Maximum number of files to process, counted from the start of
            ``files``. ``None`` processes all of them. Defaults to 100.

    Returns:
        A list with one extracted text per processed file, in input order.
    """
    # Initialize newspaper. No download is triggered here; the page content
    # is supplied from the files by process_file_with_article.
    testUrl = 'https://www.jamaicaobserver.com/2025/04/27/fa-cup-glory-wont-salvage-man-citys-troubled-season-guardiola'
    article = newspaper.Article(testUrl)

    # Bind the shared article so map() can call the worker with one argument.
    process_with_shared_article = partial(process_file_with_article, shared_article=article)

    # Return the collected texts; without this the caller receives None.
    return list(map(process_with_shared_article, files[:limit]))

In [32]:
# Run the map-based variant over the collected HTML files and keep whatever
# it returns under its own name, docs4.
docs4 = get_sentiment_map(files)

In [16]:
import time
from itertools import cycle

# Relies on a, b and num_iterations from the benchmark setup cell.
iter_times = []

# Create a cycling iterator for range(num_iterations)
cycle_iter = cycle(range(num_iterations))

# Take only the required number of iterations
for _ in range(num_iterations):
    # NOTE(review): this value is advanced outside the timed region and is
    # never read -- the comprehension below uses its own index -- so the figure
    # printed at the end times the plain indexed loop, not itertools.cycle.
    i = next(cycle_iter)

    # perf_counter() is the monotonic, high-resolution clock meant for timing
    # short intervals; time() gives no such guarantee.
    t1 = time.perf_counter()
    result = [a[idx] * b[idx] for idx in range(len(a))]
    t2 = time.perf_counter()
    iter_times.append(t2 - t1)

avg_iter = sum(iter_times) / num_iterations

print(f"Average time of cycle: {avg_iter:.6f} seconds")

Average time of cycle: 0.005043 seconds


In [15]:
# Calculate average times.
# Divide by the number of samples actually collected rather than by
# num_iterations: mapt and loopt are only appended to by their benchmark
# cells, so re-running one of those cells without resetting the lists leaves
# more than num_iterations entries and would inflate the average.
avg_mapt = sum(mapt) / len(mapt) if mapt else 0.0
avg_loopt = sum(loopt) / len(loopt) if loopt else 0.0

print(f"Average time of map: {avg_mapt:.6f} seconds")
print(f"Average time of for loop: {avg_loopt:.6f} seconds")

Average time of map: 0.002621 seconds
Average time of for loop: 0.004583 seconds


In [4]:
def get_files(folderLoc):
    '''Return the HTML files found under the given folders.

    Every folder is searched recursively for ``*.html`` entries and the
    matches from all folders are combined into one list.

    Args:
        folderLoc: A list of folder paths (``str`` or ``Path``). A single
            path is also accepted and treated as a one-element list.

    Returns:
        A list of ``Path`` objects, one per HTML file. The list is empty when
        no folders are given or nothing matches.
    '''
    # A lone path would otherwise be iterated character by character.
    if isinstance(folderLoc, (str, Path)):
        folderLoc = [folderLoc]

    allFiles = []

    for folder in folderLoc:
        folderFiles = Path(folder).glob('**/*.html')

        # The pattern also matches directories whose name ends in .html;
        # keep regular files only, and keep them for every folder rather
        # than just the last one visited.
        allFiles.extend(f for f in folderFiles if f.is_file())

    return allFiles

In [21]:
# Sequential baseline: extract article text from saved HTML pages
def get_sentiment(files: list, limit=100) -> list:
    """Return the extracted text of the first ``limit`` HTML files.

    Despite the name, no sentiment score is computed here: every file is
    parsed with ``newspaper`` and only the article text is collected.

    Args:
        files: Sequence of HTML file paths.
        limit: Maximum number of files to process, counted from the start of
            ``files``. ``None`` processes all of them. Defaults to 100.

    Returns:
        A list with one text per processed file, in input order.

    Raises:
        Any exception raised while reading or parsing a file is propagated.
    """
    # Initialize newspaper. No download is triggered; the HTML is supplied
    # from the files below.
    testUrl = 'https://www.jamaicaobserver.com/2025/04/27/fa-cup-glory-wont-salvage-man-citys-troubled-season-guardiola'

    articles = []

    for file in files[:limit]:
        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()

        # Use a fresh Article per file, as process_single_file does. A reused
        # instance keeps whatever the previous page populated.
        # NOTE(review): newspaper3k's setters appear to skip empty values, so
        # a page that parses to empty text would otherwise report the previous
        # page's text -- confirm against the installed version.
        article = newspaper.Article(testUrl)

        # Set the HTML content directly.
        article.html = html_content
        # Presumably 2 means "download succeeded" -- TODO confirm against
        # newspaper's download states.
        article.download_state = 2
        article.parse()
        article.nlp()

        articles.append(article.text)

    return articles

In [5]:
# Collect the HTML file paths for the configured source folders.
files = get_files(folders)

In [33]:
# Extract the article texts with the sequential implementation.
docs = get_sentiment(files)

In [34]:
# Run the extraction again with the thread-pool variant. This rebinds `docs`,
# replacing the result of any earlier assignment to that name.
docs = get_sentiment2(files)

In [25]:
# Number of entries currently held in docs.
len(docs)

100

In [22]:
import newspaper
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

def process_single_file(file_path):
    """Parse one saved HTML page and return its article text.

    A dedicated ``newspaper.Article`` is built on every call, so concurrent
    callers never share an article object. Failures while reading or parsing
    are not raised; they are reported through the return value instead.

    Args:
        file_path: Location of the HTML file; read as UTF-8 with undecodable
            bytes ignored.

    Returns:
        The parsed article text, or a string of the form
        ``"Error processing <file_path>: <message>"`` if an exception occurred.
    """
    page = newspaper.Article('')

    try:
        # Supply the page source from disk instead of downloading it.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            page.html = handle.read()

        # Presumably 2 marks the download as successful -- TODO confirm
        # against newspaper's download states.
        page.download_state = 2
        page.parse()
        page.nlp()

        return page.text
    except Exception as e:
        return f"Error processing {file_path}: {str(e)}"

def get_sentiment2(files: list, limit=100) -> list:
    """Extract article text from HTML files using a thread pool.

    Each file is handed to ``process_single_file`` on a worker thread. No
    sentiment score is computed; only the extracted texts are returned.

    Args:
        files: Sequence of HTML file paths.
        limit: Maximum number of files to process, counted from the start of
            ``files``. ``None`` processes all of them. Defaults to 100.

    Returns:
        A list of the values returned by ``process_single_file``, in the same
        order as the processed files. A task whose future raises is reported
        with ``print`` and contributes no entry.
    """
    # At least 4 workers, more when the machine reports more CPU cores.
    num_workers = max(4, multiprocessing.cpu_count())

    files_to_process = files[:limit]
    results = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all file processing tasks
        futures = [executor.submit(process_single_file, file) for file in files_to_process]

        # Wait on the futures in submission order (not completion order) so
        # the results line up with files_to_process.
        for future in futures:
            try:
                results.append(future.result())
            except Exception as exc:
                print(f'File processing generated an exception: {exc}')

    return results

In [26]:
import newspaper
from concurrent.futures import ThreadPoolExecutor
import time

def process_file_batch(file_batch):
    """Extract article text from every file in ``file_batch``.

    Files are processed one after another. A failure while reading or parsing
    a file does not stop the batch; it is recorded as an error string in that
    file's position.

    Args:
        file_batch: Iterable of HTML file paths.

    Returns:
        A list with one entry per file, in input order: the article text, or
        ``"Error processing <file>: <message>"`` when that file failed.
    """
    results = []

    for file in file_batch:
        # Use a fresh Article per file, as process_single_file does, instead
        # of resetting flags on a shared instance.
        # NOTE(review): newspaper3k's setters appear to skip empty values, so
        # a reused instance could report the previous page's text for a page
        # that parses to nothing -- confirm against the installed version.
        article = newspaper.Article('')

        try:
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                html_content = f.read()

            # Supply the HTML directly and flag the article as downloaded.
            # A new Article has not been parsed yet, so no parse-state reset
            # is needed.
            article.html = html_content
            article.download_state = 2
            article.is_downloaded = True

            # Process
            article.parse()
            article.nlp()

            results.append(article.text)
        except Exception as e:
            results.append(f"Error processing {file}: {str(e)}")

    return results

def get_sentiment3(files: list, limit=100) -> list:
    """Extract article text from HTML files using a few batched threads.

    The files are split into at most three contiguous batches and each batch
    is processed by ``process_file_batch`` on its own thread. Three files or
    fewer are processed directly on the calling thread. No sentiment score is
    computed; only the extracted texts are returned.

    Args:
        files: Sequence of HTML file paths.
        limit: Maximum number of files to process, counted from the start of
            ``files``. ``None`` processes all of them. Defaults to 100.

    Returns:
        A list with one entry per processed file, in input order, as produced
        by ``process_file_batch`` (article text, or an error string for a file
        that failed). This applies to inputs of every size, including the
        small-input path.
    """
    max_batches = 3
    files_to_process = files[:limit]

    # For small numbers of files, threading is not worth its overhead. Reuse
    # the batch worker so small and large inputs report failures the same way.
    if len(files_to_process) <= 3:
        return process_file_batch(files_to_process)

    # Ceiling division keeps the batch count at max_batches or fewer. Floor
    # division left a remainder batch, e.g. 100 files in batches of 33 gave
    # 4 batches and therefore 4 threads.
    num_batches = min(max_batches, len(files_to_process))
    batch_size = -(-len(files_to_process) // num_batches)
    batches = [
        files_to_process[start:start + batch_size]
        for start in range(0, len(files_to_process), batch_size)
    ]

    results = []
    with ThreadPoolExecutor(max_workers=len(batches)) as executor:
        # executor.map yields batch results in batch order, which preserves
        # the overall file order.
        for batch_result in executor.map(process_file_batch, batches):
            results.extend(batch_result)

    return results

In [28]:
# Run the batched thread-pool variant and keep its result under its own name, docs3.
docs3 = get_sentiment3(files)