## Proposal for faster context extraction

In [1]:
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [7]:
import csv
import re
import os
import pandas as pd
import openpyxl

def get_unique_terms(file_path):
    """
    Extracts unique terms from the first column of an Excel file or CSV file.

    Args:
        file_path (str): Path to the Excel file or CSV file.

    Returns:
        set: A set of unique terms.
    """
    if file_path.endswith(".xlsx"):
        df = pd.read_excel(file_path, engine='openpyxl')
    elif file_path.endswith(".csv"):
        df = pd.read_csv(file_path, engine='python')
    else:
        raise ValueError("Unsupported file format")

    first_column = df.columns[0]
    terms = set(df[first_column].dropna().astype(str).str.strip())
    return terms

# Get the context of each occurrence of a term in all text files within a folder
def get_term_context(term, text_folder, context_chars=200):
    """
    Finds the context of a term in text files within a folder.

    Args:
        term (str): The term to search for.
        text_folder (str): Path to the folder containing text files.
        context_chars (int): Number of characters to include before and after the term.

    Returns:
        tuple: A list of contexts and the number of files where the term was found.
    """
    pattern = rf"\b{re.escape(term)}\b"
    contexts = []
    found_files = set()
    for file_name in os.listdir(text_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(text_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            found = False
            for match in re.finditer(pattern, content, re.IGNORECASE):
                if not found:
                    found_files.add(file_name)
                    found = True
                start = max(0, match.start() - context_chars)
                end = min(len(content), match.end() + context_chars)
                context = content[start:end].replace("\n", " ")
                highlighted_term = f"**{content[match.start():match.end()]}**"
                highlighted_context = (
                    context[: match.start() - start]
                    + highlighted_term
                    + context[match.end() - start :]
                )
                contexts.append(highlighted_context.strip())
    return contexts, len(found_files)

# Main function to search terms and save results
def search_terms_and_save(input_excel, text_folder, output_txt_path, output_csv_path, output_log_path):
    """
    Searches for terms in text files and saves the results in TXT, CSV, and log files.

    Args:
        input_excel (str): Path to the Excel file containing terms.
        text_folder (str): Path to the folder containing text files.
        output_txt_path (str): Path to save the TXT file with contexts.
        output_csv_path (str): Path to save the CSV file with contexts.
        output_log_path (str): Path to save the log file with statistics.
    """
    unique_terms = get_unique_terms(input_excel)
    with open(output_txt_path, 'w', encoding='utf-8') as txt_out, \
            open(output_csv_path, 'w', encoding='utf-8', newline='') as csv_out, \
            open(output_log_path, 'w', encoding='utf-8') as log_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerow(["Term", "Context", "Occurrences"])
        for term in unique_terms:
            contexts, num_files = get_term_context(term, text_folder)
            occurrences = len(contexts)
            # Write header in TXT
            txt_out.write(f"\n=== {term.upper()} ({occurrences} occurrences in {num_files} file(s)) ===\n")
            if occurrences > 0:
                for i, context in enumerate(contexts, 1):
                    txt_out.write(f"{i}. ...{context}...\n")
            else:
                txt_out.write("Not found in the corpus.\n")

            # Write to CSV
            for context in contexts:
                csv_writer.writerow([term, context, occurrences])

            # Write message to console and log
            message = (
                f"✅ {term}: {occurrences} occurrences in {num_files} file(s)"
                if occurrences > 0
                else f"❌ {term}: Not found in any file"
            )
            print(message)
            log_out.write(message + "\n")


# File paths
input_excel = 'BA-ES-[Medical Sciences ~ Pharmaceutical Technology]_and_[Technological Sciences ~ Environmental technology and engineering].xlsx'
text_folder = 'garaterm_full_corpora/osasuna'
output_txt_path = 'contexts/context_terms_medicine_ba_es.txt'
output_csv_path = 'contexts/context_terms_medicine_ba_es.csv'
output_log_path = 'stats_medicine_ba_es.txt'

# Execute
search_terms_and_save(input_excel, text_folder, output_txt_path, output_csv_path, output_log_path)

✅ eragile: 426 occurrences in 186 file(s)
✅ garbiketa: 363 occurrences in 103 file(s)
✅ izozte: 4 occurrences in 2 file(s)
✅ ezaugarri organoleptiko: 3 occurrences in 3 file(s)
✅ hauskortasun: 16 occurrences in 8 file(s)
✅ elikagai: 696 occurrences in 203 file(s)
✅ balioztapen: 6 occurrences in 3 file(s)
✅ estalki: 34 occurrences in 21 file(s)
✅ trinkotasun: 2 occurrences in 2 file(s)
✅ berogailu: 4 occurrences in 4 file(s)
✅ urradura: 9 occurrences in 5 file(s)
✅ ioi: 184 occurrences in 61 file(s)
✅ talka: 32 occurrences in 25 file(s)
✅ jalkin: 2 occurrences in 2 file(s)
✅ betetze: 36 occurrences in 14 file(s)
✅ xafla: 171 occurrences in 46 file(s)
✅ beira: 59 occurrences in 19 file(s)
✅ indargetzaile: 28 occurrences in 12 file(s)
❌ oxidazio-prozesu: Not found in any file
✅ eragin: 3016 occurrences in 794 file(s)
✅ pikor: 105 occurrences in 16 file(s)
