# Extract keywords from SLAC experiment logs

This example notebook will demonstrate how to configure and run the ScienceSearch Python tools for keyword extraction.

For more information about ScienceSearch, see also:
- [sciencesearch GitHub repository](https://github.com/ScienceSearch/sciencesearch).
- AI-generated [documentation pages](https://deepwiki.com/ScienceSearch/sciencesearch/1-overview).

## Prerequisites
- A Python environment that includes the ScienceSearch Python package `sciencesearch` (see [../README.md](../README.md))
- A SLAC-generated SQLite database

## Setup
Python imports and some logging setup

In [None]:
# imports
from pathlib import Path
from sciencesearch.nlp.search import KeywordExplorer
from sciencesearch.nlp.slac_data_extractor import SLACDatabaseDataExtractor
from IPython.core.display import HTML

# logging setup
import logging

# Raise the root logger threshold to ERROR (silences pke warnings)
logging.getLogger().setLevel(logging.ERROR)
# The "sciencesearch" logger keeps its own threshold at WARNING
slog = logging.getLogger("sciencesearch")
slog.setLevel(logging.WARNING)
from sciencesearch.nlp.visualize_kws import JsonView
from pathlib import Path
import json

In [None]:
# Directory holding the JSON configuration files, relative to the working directory
conf_dir = Path(".", "config_files")

In [None]:
# Experiment descriptions -- no abbreviation replacement (replace_abbrv=False)
conf_file_descriptions = conf_dir.joinpath("slac_config_descriptions.json")
SLACDatabaseDataExtractor(
    conf_file_descriptions,
    replace_abbrv=False,
).process_experiment_descriptions()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/description_kws1.csv, presumably
# written by the save call below -- confirm before relying on a fresh run.
# kwe_des = KeywordExplorer.from_config(conf_file_descriptions)
# kwe_des.save_keywords_to_file('description_kws1')

In [None]:
# Experiment descriptions -- with abbreviation replacement (replace_abbrv=True)
conf_file_descriptions = conf_dir.joinpath("slac_config_descriptions.json")
SLACDatabaseDataExtractor(
    conf_file_descriptions,
    replace_abbrv=True,
).process_experiment_descriptions()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/acronym_description_kws1.csv,
# presumably written by the save call below -- confirm before relying on a fresh run.
# kwe_des = KeywordExplorer.from_config(conf_file_descriptions)
# kwe_des.save_keywords_to_file('acronym_description_kws1')

In [None]:
# Elogs and experiment parameters -- no abbreviation replacement (replace_abbrv=False)
conf_file_params = conf_dir.joinpath("slac_config_params.json")
SLACDatabaseDataExtractor(
    conf_file_params,
    replace_abbrv=False,
).process_experiment_elog_parameters()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/param_kws1.csv, presumably
# written by the save call below -- confirm before relying on a fresh run.
# kwe_param = KeywordExplorer.from_config(conf_file_params)
# kwe_param.save_keywords_to_file('param_kws1')

In [None]:
# Elogs and experiment parameters -- with abbreviation replacement (replace_abbrv=True)
conf_file_params = conf_dir.joinpath("slac_config_params.json")
SLACDatabaseDataExtractor(
    conf_file_params,
    replace_abbrv=True,
).process_experiment_elog_parameters()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/acronym_param_kws1.csv, presumably
# written by the save call below -- confirm before relying on a fresh run.
# kwe_param = KeywordExplorer.from_config(conf_file_params)
# kwe_param.save_keywords_to_file('acronym_param_kws1')

In [None]:
# Only elogs that are misc. commentary -- no abbreviation replacement (replace_abbrv=False)
conf_file_commentary = conf_dir.joinpath("slac_config_commentary.json")
SLACDatabaseDataExtractor(
    conf_file_commentary,
    replace_abbrv=False,
).process_experiment_elog_commentary()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/commentary_kws1.csv, presumably
# written by the save call below -- confirm before relying on a fresh run.
# kwe_comment = KeywordExplorer.from_config(conf_file_commentary)
# kwe_comment.save_keywords_to_file('commentary_kws1')

In [None]:
# Only elogs that are misc. commentary -- with abbreviation replacement (replace_abbrv=True)
conf_file_commentary = conf_dir.joinpath("slac_config_commentary.json")
SLACDatabaseDataExtractor(
    conf_file_commentary,
    replace_abbrv=True,
).process_experiment_elog_commentary()
# Disabled follow-up step: keyword extraction and export for this configuration.
# NOTE(review): a later diff(...) cell reads results/acronym_commentary_kws1.csv,
# presumably written by the save call below -- confirm before relying on a fresh run.
# kwe_comment = KeywordExplorer.from_config(conf_file_commentary)
# kwe_comment.save_keywords_to_file('acronym_commentary_kws1')

## Get keyword (KW) diffs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2


def diff(csv_acronym, csv_default, csv_acryonyms_found):
    """Compare keywords predicted with and without acronym replacement.

    The three CSV inputs are inner-joined on ``experiment_name``, so only
    experiments present in all three inputs appear in the result.

    Args:
        csv_acronym: Path (or file-like object) readable by ``pandas.read_csv``
            with ``experiment_name`` and ``predicted`` columns; keywords
            obtained with acronym replacement.
        csv_default: Same layout as ``csv_acronym``; keywords obtained
            without acronym replacement.
        csv_acryonyms_found: CSV with ``experiment_name`` and ``acronyms``
            columns. (The parameter name's spelling is kept so existing
            keyword-argument callers keep working.)

    Returns:
        pandas.DataFrame: one row per experiment with the columns
        ``experiment_name``, ``all keywords without acronym expansion``,
        ``all keywords with acronyms expansion``, ``common keywords``,
        ``acronym repacement unique keywords``,
        ``without acronym replacement unique keywords`` and
        ``acronyms replaced``. Keyword columns are ``", "``-joined strings in
        sorted order, so the output is identical from run to run.

    Raises:
        KeyError: if a required column is missing from one of the inputs.
    """
    # Function-scope import so the definition does not depend on another cell.
    import ast

    replaced_df = pd.read_csv(csv_acronym)
    default_df = pd.read_csv(csv_default)
    acronyms_df = pd.read_csv(csv_acryonyms_found)

    merged = pd.merge(
        replaced_df[["experiment_name", "predicted"]],
        default_df[["experiment_name", "predicted"]],
        on="experiment_name",
        suffixes=("_replacement", "_default"),
    )
    merged = pd.merge(
        merged,
        acronyms_df[["experiment_name", "acronyms"]],
        on="experiment_name",
    )

    def to_set(value):
        """Parse one ``predicted`` cell into a set of non-empty keywords."""
        if pd.isna(value):
            return set()
        text = str(value).strip()
        # Cells that are a Python-literal list of strings are parsed as such, so
        # commas and apostrophes inside a keyword survive intact.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple, set, frozenset)) and all(
            isinstance(item, str) for item in parsed
        ):
            tokens = parsed
        else:
            # Fallback for anything else: plain comma-separated text, with
            # brackets and surrounding quotes removed.
            bare = text.replace("[", "").replace("]", "")
            tokens = (piece.strip().strip("'\"") for piece in bare.split(","))
        cleaned = (token.strip() for token in tokens)
        # Drop empty tokens so "[]" or a trailing comma never yields "".
        return {token for token in cleaned if token}

    def joined(keywords):
        """Render a keyword set as a deterministic, sorted string."""
        return ", ".join(sorted(keywords))

    # Column names are reproduced exactly (including the "repacement" typo)
    # because files written from this frame may already be consumed elsewhere.
    columns = [
        "experiment_name",
        "all keywords without acronym expansion",
        "all keywords with acronyms expansion",
        "common keywords",
        "acronym repacement unique keywords",
        "without acronym replacement unique keywords",
        "acronyms replaced",
    ]

    results = []
    for _, row in merged.iterrows():
        with_replacement = to_set(row["predicted_replacement"])
        without_replacement = to_set(row["predicted_default"])

        results.append(
            {
                "experiment_name": row["experiment_name"],
                "all keywords without acronym expansion": joined(without_replacement),
                "all keywords with acronyms expansion": joined(with_replacement),
                "common keywords": joined(with_replacement & without_replacement),
                "acronym repacement unique keywords": joined(
                    with_replacement - without_replacement
                ),
                "without acronym replacement unique keywords": joined(
                    without_replacement - with_replacement
                ),
                "acronyms replaced": row["acronyms"],
            }
        )

    # Passing the columns explicitly keeps the header even when nothing matched.
    return pd.DataFrame(results, columns=columns)

In [None]:
# Compare the commentary keywords obtained with vs. without acronym replacement.
# Arguments follow diff(csv_acronym, csv_default, csv_acryonyms_found).
# NOTE(review): the two results/*_kws1.csv inputs are presumably produced by the
# currently commented-out save_keywords_to_file calls -- confirm they exist.
comment_diff = diff(
    "../private_data/results/acronym_commentary_kws1.csv",
    "../private_data/results/commentary_kws1.csv",
    "../private_data/commentary/replaced_abbrv2.csv",
)
# Write the comparison table to disk without the DataFrame index column.
comment_diff.to_csv("../private_data/results/comparison_commentary.csv", index=False)

In [None]:
# Compare the description keywords obtained with vs. without acronym replacement.
# Arguments follow diff(csv_acronym, csv_default, csv_acryonyms_found).
# NOTE(review): the result is bound to `comment_diff` even though it holds the
# description comparison; this rebinds the name used by the commentary cell.
comment_diff = diff(
    "../private_data/results/acronym_description_kws1.csv",
    "../private_data/results/description_kws1.csv",
    "../private_data/descriptions/replaced_abbrv2.csv",
)
# Write the comparison table to disk without the DataFrame index column.
comment_diff.to_csv("../private_data/results/comparison_description.csv", index=False)

In [None]:
# Compare the parameter keywords obtained with vs. without acronym replacement.
# Arguments follow diff(csv_acronym, csv_default, csv_acryonyms_found).
# NOTE(review): the result is bound to `comment_diff` even though it holds the
# parameter comparison; this rebinds the name used by the earlier comparison cells.
comment_diff = diff(
    "../private_data/results/acronym_param_kws1.csv",
    "../private_data/results/param_kws1.csv",
    "../private_data/params/replaced_abbrv2.csv",
)
# Write the comparison table to disk without the DataFrame index column.
comment_diff.to_csv("../private_data/results/comparison_params.csv", index=False)