## Copy of the scripts used to count the terms in the Wikipedia file and to extract all the lists

## Code I initially used to extract the terms

In [10]:
import csv

# Function to count the occurrences of a term in a text file
def count_term_occurrences(term, text_file):
    """
    Return how many times a term occurs in a text file, ignoring case.

    Both the file content and the term are lower-cased and the
    non-overlapping occurrences are counted with ``str.count``. The match is
    substring-based, so a term is also counted when it appears inside a
    longer word.

    Args:
        term (str): The term to search for.
        text_file (str): Path to the text file (read as UTF-8).

    Returns:
        int: The number of occurrences of the term in the text file.
    """
    with open(text_file, encoding='utf-8') as handle:
        haystack = handle.read().lower()
    needle = term.lower()
    return haystack.count(needle)

# Function to extract unique terms from the first column of a CSV file
def extract_unique_terms(csv_file):
    """
    Extracts unique, non-empty terms from the first column of a CSV file.

    The first row is treated as a header and skipped. Each remaining
    first-column value is stripped of surrounding whitespace; values that are
    empty after stripping are ignored so that a blank cell never becomes a
    search term.

    Args:
        csv_file (str): Path to the CSV file (read as UTF-8).

    Returns:
        set: A set of unique terms. Empty if the file has no data rows
        (including a completely empty file).
    """
    unique_terms = set()  # A set de-duplicates repeated terms
    # newline='' lets the csv module do its own line-ending handling,
    # as the csv documentation requires for files passed to csv.reader.
    with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            if not row:  # Blank line in the CSV
                continue
            term = row[0].strip()
            if term:  # Skip blank first-column cells
                unique_terms.add(term)
    return unique_terms

# Main function to count occurrences of terms and save results
def search_and_count_terms(csv_file, text_file, output_file):
    """
    Reads terms from a CSV file, counts their occurrences in a text file,
    and writes the results to an output file.

    Counting is case-insensitive and substring-based (``str.count`` on the
    lower-cased text). One line per term is written, in sorted term order so
    that the report is identical from run to run.

    Args:
        csv_file (str): Path to the CSV file containing terms.
        text_file (str): Path to the text file to search in.
        output_file (str): Path to the output file to save results.
    """
    unique_terms = extract_unique_terms(csv_file)

    # Read and lower-case the corpus once instead of once per term; this is
    # loop-invariant work and the corpus can be large.
    with open(text_file, 'r', encoding='utf-8') as corpus:
        content_lower = corpus.read().lower()

    with open(output_file, 'w', encoding='utf-8') as output:
        # Set iteration order is arbitrary; sorting keeps the report stable
        for term in sorted(unique_terms):
            occurrences = content_lower.count(term.lower())
            output.write(f"{term}: appears {occurrences} time(s).\n")

# Input and output locations for the electromagnetism term count
csv_file_path = "spanish/[Physics ~ Electro-magnetism]_and_[Technological Sciences ~ Electrical technology and engineering].csv"  # Terms are read from the first column
text_file_path = "spanish/depth_1/specific_wikipedia_files/eswiki_electromagnetism_depth1.txt"  # Corpus to search; replace with the path to your text file
output_file_path = "spanish/depth_1/terms_counter/terms_number_electromagnetism.txt"  # Report with one line per term

# Run the count and write the report
search_and_count_terms(
    csv_file=csv_file_path,
    text_file=text_file_path,
    output_file=output_file_path,
)

## Second version (V2) of the code, with better handling of compound terms

In [11]:
import csv
import re

# Function to count exact occurrences of a term in a text file
def count_exact_occurrences_v2(term, text_file):
    """
    Counts the exact (whole-term, case-insensitive) occurrences of a term
    in a text file.

    A match must not be directly preceded or followed by a word character.
    Lookarounds are used instead of ``\\b`` because ``\\b`` requires a
    word/non-word transition and therefore never matches a term that itself
    starts or ends with a non-word character (e.g. "C++") when it is next to
    whitespace or punctuation. For terms that start and end with word
    characters the two forms are equivalent.

    The file is scanned line by line, so a multi-word term split across a
    line break is not counted.

    Args:
        term (str): The term to search for.
        text_file (str): Path to the text file (read as UTF-8).

    Returns:
        int: The number of occurrences of the term in the text file.
        An empty term yields 0.
    """
    if not term:
        # An empty pattern would match at every boundary position
        return 0

    # Compile once rather than rebuilding the pattern for every line
    pattern = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)", re.IGNORECASE)

    with open(text_file, 'r', encoding='utf-8') as file:
        # Stream the file so the whole corpus is never held in memory
        return sum(len(pattern.findall(line)) for line in file)

# Main function to count occurrences of terms and display results
def search_terms_and_count_v2(csv_file, text_file, output_file):
    """
    Reads terms from a CSV file, counts their exact occurrences in a text
    file, writes the results to an output file and prints a status line
    per term.

    Terms are processed in sorted order so that both the report file and the
    console output are identical from run to run.

    Args:
        csv_file (str): Path to the CSV file containing terms.
        text_file (str): Path to the text file to search in.
        output_file (str): Path to the output file to save results.
    """
    unique_terms = extract_unique_terms(csv_file)

    with open(output_file, 'w', encoding='utf-8') as output:
        # Set iteration order is arbitrary; sorting keeps the output stable
        for term in sorted(unique_terms):
            occurrences = count_exact_occurrences_v2(term, text_file)
            output.write(f"{term}: appears {occurrences} time(s).\n")

            # Console feedback: found vs. not found
            if occurrences > 0:
                print(f"✅ {term}: {occurrences} time(s) in the corpus")
            else:
                print(f"❌ {term}: Not found in the corpus")

# 📌 Input and output locations for the law term count (V2)
csv_file_path_law = "spanish/[Juridical Sciences & Law ~ Constitutional law]_and_[Technological Sciences ~ Environmental technology and engineering].csv"  # Terms are read from the first column
text_file_path_law = "spanish/depth_1/specific_wikipedia_files/eswiki_law_depth1.txt"  # Corpus to search
output_file_path_law = "spanish/depth_1/terms_counter/terms_number_law_v2.txt"  # Report with one line per term

# Run the V2 count, write the report and print a status line per term
search_terms_and_count_v2(
    csv_file=csv_file_path_law,
    text_file=text_file_path_law,
    output_file=output_file_path_law,
)


✅ sección: 302 time(s) in the corpus
✅ rango: 316 time(s) in the corpus
✅ proceso: 2077 time(s) in the corpus
✅ movilidad: 111 time(s) in the corpus
✅ efecto: 434 time(s) in the corpus
✅ evolución: 153 time(s) in the corpus
✅ sucesión: 103 time(s) in the corpus
✅ medio: 1219 time(s) in the corpus
✅ muestra: 245 time(s) in the corpus
✅ plataforma continental: 1 time(s) in the corpus
✅ plan: 774 time(s) in the corpus
✅ base: 1050 time(s) in the corpus
✅ suspensión: 188 time(s) in the corpus
✅ acción: 1032 time(s) in the corpus
✅ afinidad: 12 time(s) in the corpus
✅ limitación: 99 time(s) in the corpus
✅ medida preventiva: 5 time(s) in the corpus
✅ viabilidad: 23 time(s) in the corpus
✅ programa: 986 time(s) in the corpus
✅ medida: 831 time(s) in the corpus
✅ procesar: 41 time(s) in the corpus
✅ proyecto: 1606 time(s) in the corpus
✅ disolver: 116 time(s) in the corpus
✅ prueba: 280 time(s) in the corpus
✅ laguna: 11 time(s) in the corpus


## Proposal for faster context extraction

In [12]:
import csv
import re

# Function to find exact matches and extract their context with the term highlighted
def extract_context(term, text_file, context_chars=200):
    """
    Extracts the context of exact matches of a term in a text file,
    highlighting the term.

    Matching is case-insensitive and whole-term: a match must not be directly
    preceded or followed by a word character. Lookarounds are used instead of
    ``\\b`` so that terms starting or ending with a non-word character
    (e.g. "C++") are still found next to whitespace or punctuation; for terms
    made of word characters at both ends the result is the same as ``\\b``.

    Each context is the matched text wrapped in ``**`` plus up to
    ``context_chars`` characters on either side (clipped at the file
    boundaries). Line breaks in the surrounding text are replaced by spaces
    and the result is stripped of leading/trailing whitespace.

    Args:
        term (str): The term to search for.
        text_file (str): Path to the text file (read as UTF-8).
        context_chars (int): Number of characters to include before and after the match.

    Returns:
        list: A list of contexts where the term appears, with the term
        highlighted, in order of appearance. Empty for an empty term.
    """
    if not term:
        # An empty pattern would match at every boundary position
        return []

    pattern = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)", re.IGNORECASE)

    with open(text_file, 'r', encoding='utf-8') as file:
        content = file.read()  # Load the entire text

    contexts = []
    for match in pattern.finditer(content):
        start = max(0, match.start() - context_chars)
        end = min(len(content), match.end() + context_chars)

        # Flatten line breaks in the surrounding text only; the matched text
        # is emitted exactly as it appears in the corpus.
        before = content[start:match.start()].replace("\n", " ")
        after = content[match.end():end].replace("\n", " ")

        # Highlight the term with **asterisks**
        contexts.append(f"{before}**{match.group(0)}**{after}".strip())

    return contexts  # List of found contexts

# Main function to search for terms and save their context
def search_terms_and_save(csv_file, text_file, output_txt, output_csv):
    """
    Reads terms from a CSV file, searches for their occurrences in a text file,
    and saves the context of each occurrence to a TXT and CSV file.

    Terms are processed in sorted order so that both output files and the
    console output are identical from run to run.

    The TXT file gets one section per term (including terms that were not
    found). The CSV file gets one row per context with the columns
    ``Term, Context, Occurrences``; a term with no occurrences therefore
    produces no CSV rows. A status line per term is printed to the console.

    Args:
        csv_file (str): Path to the CSV file containing terms.
        text_file (str): Path to the text file to search in.
        output_txt (str): Path to the output TXT file to save contexts.
        output_csv (str): Path to the output CSV file to save contexts.
    """
    unique_terms = extract_unique_terms(csv_file)

    # newline='' on the CSV output so csv.writer controls the line endings
    with open(output_txt, 'w', encoding='utf-8') as txt_out, \
            open(output_csv, 'w', encoding='utf-8', newline='') as csv_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerow(["Term", "Context", "Occurrences"])  # CSV headers

        # Set iteration order is arbitrary; sorting keeps the output stable
        for term in sorted(unique_terms):
            contexts = extract_context(term, text_file)
            occurrences = len(contexts)

            # Save to TXT
            txt_out.write(f"\n=== {term.upper()} ({occurrences} occurrences) ===\n")
            if occurrences > 0:
                for i, context in enumerate(contexts, 1):
                    txt_out.write(f"{i}. ...{context}...\n")
            else:
                txt_out.write("Not found in the corpus.\n")

            # Save to CSV: one row per context, each carrying the term's total
            for context in contexts:
                csv_writer.writerow([term, context, occurrences])

            # Print message to console
            if occurrences > 0:
                print(f"✅ {term}: {occurrences} occurrences in the corpus")
            else:
                print(f"❌ {term}: Not found in the corpus")


# 📌 Input and output locations for the environmental context extraction
csv_file_path_environmental = "spanish/[Medical Sciences ~ Pharmaceutical Technology]_and_[Technological Sciences ~ Environmental technology and engineering].csv"  # Terms are read from the first column
text_file_path_environmental = "spanish/depth_4/specific_wikipedia_files/eswiki_environmental.txt"  # Corpus to search
output_txt_path_environmental = "spanish/depth_4/contexts/context_terms_environmental.txt"  # Contexts grouped by term
output_csv_path_environmental = "spanish/depth_4/contexts/context_terms_environmental.csv"  # One row per context

# Run the extraction and write both the TXT and the CSV output
search_terms_and_save(
    csv_file=csv_file_path_environmental,
    text_file=text_file_path_environmental,
    output_txt=output_txt_path_environmental,
    output_csv=output_csv_path_environmental,
)

✅ indicador biológico: 1 occurrences in the corpus
✅ cierre: 519 occurrences in the corpus
✅ cloruro: 601 occurrences in the corpus
✅ flujo: 1529 occurrences in the corpus
✅ dosificación: 56 occurrences in the corpus
✅ brillo: 139 occurrences in the corpus
✅ enlace: 483 occurrences in the corpus
✅ medio: 7890 occurrences in the corpus
✅ producción: 8005 occurrences in the corpus
✅ sonda: 237 occurrences in the corpus
✅ material filtrante: 5 occurrences in the corpus
✅ componente: 754 occurrences in the corpus
✅ rayo: 175 occurrences in the corpus
✅ eficacia: 777 occurrences in the corpus
✅ polvo: 818 occurrences in the corpus
✅ substrato: 27 occurrences in the corpus
✅ determinación: 303 occurrences in the corpus
✅ temperatura ambiente: 175 occurrences in the corpus
✅ productor: 467 occurrences in the corpus
✅ valoración: 231 occurrences in the corpus
✅ ensayo: 513 occurrences in the corpus
✅ colorante: 70 occurrences in the corpus
✅ normativa: 407 occurrences in the corpus
✅ inspecció