# Extract keywords from SLAC experiment logs

This example notebook will demonstrate how to configure and run the ScienceSearch Python tools for keyword extraction.

For more information about ScienceSearch, see also:
- [ScienceSearch GitHub repository](https://github.com/ScienceSearch/sciencesearch).
- AI-generated [documentation pages](https://deepwiki.com/ScienceSearch/sciencesearch/1-overview).

## Prerequisites
- A Python environment that includes the ScienceSearch Python package `sciencesearch` (see [../README.md](../README.md))
- A SLAC-generated SQLite database

## Setup
Python imports and some logging setup

In [None]:
# imports
from pathlib import Path
from sciencesearch.nlp.search import KeywordExplorer
from sciencesearch.nlp.slac_data_extractor import SLACDatabaseDataExtractor
from IPython.core.display import HTML

# logging setup
import logging

# Raise the root logger's threshold to ERROR so that lower-severity records
# logged through it are dropped.
logging.root.setLevel(logging.ERROR)  # silence pke warnings
# Give the "sciencesearch" logger its own explicit level, so its effective
# level is WARNING rather than the ERROR level inherited from the root logger.
slog = logging.getLogger("sciencesearch")
slog.setLevel(logging.WARNING)
from sciencesearch.nlp.visualize_kws import JsonView
from pathlib import Path
import json

In [None]:
# Directory holding the configuration files, relative to the current working directory
conf_dir = Path("config_files")

In [None]:
## Experiment descriptions, WITHOUT abbreviation replacement (replace_abbrv=False)
conf_file_descriptions =  conf_dir / "slac_config_descriptions.json"
# Run the extractor's description step first, then build the explorer from the same config file.
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_descriptions, replace_abbrv = False).process_experiment_descriptions()
kwe_des = KeywordExplorer.from_config(conf_file_descriptions)
# Save this run's keywords under the name 'description_kws1'.
kwe_des.save_keywords_to_file('description_kws1')



In [None]:
## Experiment descriptions, WITH abbreviation replacement (replace_abbrv=True)
# Uses the same config file as the non-replacement run; conf_file_descriptions and kwe_des are rebound here.
conf_file_descriptions =  conf_dir / "slac_config_descriptions.json"
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_descriptions, replace_abbrv = True).process_experiment_descriptions()
kwe_des = KeywordExplorer.from_config(conf_file_descriptions)
# Save this run's keywords under the name 'acronym_description_kws1'.
kwe_des.save_keywords_to_file('acronym_description_kws1')



In [None]:

## Elogs and experiment parameters, WITHOUT abbreviation replacement (replace_abbrv=False)
conf_file_params = conf_dir / "slac_config_params.json"
# Run the extractor's elog-parameter step first, then build the explorer from the same config file.
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_params, replace_abbrv = False).process_experiment_elog_parameters()
kwe_param = KeywordExplorer.from_config(conf_file_params)
# Save this run's keywords under the name 'param_kws1'.
kwe_param.save_keywords_to_file('param_kws1')


In [None]:
## Elogs and experiment parameters, WITH abbreviation replacement (replace_abbrv=True)
# Uses the same config file as the non-replacement run; conf_file_params and kwe_param are rebound here.
conf_file_params = conf_dir / "slac_config_params.json"
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_params, replace_abbrv = True).process_experiment_elog_parameters()
kwe_param = KeywordExplorer.from_config(conf_file_params)
# Save this run's keywords under the name 'acronym_param_kws1'.
kwe_param.save_keywords_to_file('acronym_param_kws1')


In [None]:
## Only elogs that are misc. commentary, WITHOUT abbreviation replacement (replace_abbrv=False)
conf_file_commentary =  conf_dir / "slac_config_commentary.json"
# Run the extractor's elog-commentary step first, then build the explorer from the same config file.
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_commentary, replace_abbrv = False).process_experiment_elog_commentary()
kwe_comment = KeywordExplorer.from_config(conf_file_commentary)
# Save this run's keywords under the name 'commentary_kws1'.
kwe_comment.save_keywords_to_file('commentary_kws1')


In [None]:
## Only elogs that are misc. commentary, WITH abbreviation replacement (replace_abbrv=True)
# Uses the same config file as the non-replacement run; conf_file_commentary and kwe_comment are rebound here.
conf_file_commentary =  conf_dir / "slac_config_commentary.json"
# NOTE(review): presumably the extractor produces the text that KeywordExplorer.from_config reads — confirm.
SLACDatabaseDataExtractor(conf_file_commentary, replace_abbrv = True).process_experiment_elog_commentary()
kwe_comment = KeywordExplorer.from_config(conf_file_commentary)
# Save this run's keywords under the name 'acronym_commentary_kws1'.
kwe_comment.save_keywords_to_file('acronym_commentary_kws1')


## Get keyword diffs (with vs. without acronym replacement)

In [None]:
import pandas as pd

def diff(csv_acronym, csv_default):
    """Compare the predicted keywords of two keyword CSV files, experiment by experiment.

    Both files must contain the columns 'experiment name' and 'predicted'.
    Rows are matched on 'experiment name' with an inner join, so an experiment
    that appears in only one of the two files is absent from the result.

    Each 'predicted' cell is treated as a comma-separated list of keywords
    (assumption — TODO confirm against the format of the saved keyword files).
    Whitespace around each keyword is ignored and empty entries are dropped,
    so "a, b" and "b,a" describe the same keyword set.

    Args:
        csv_acronym: Path (or anything `pd.read_csv` accepts) of the CSV
            produced with acronym replacement.
        csv_default: Path of the CSV produced without acronym replacement.

    Returns:
        A DataFrame with one row per matched experiment and four columns:
        the experiment name, the keywords found in both runs, the keywords
        found only with replacement, and the keywords found only without it.
        Keywords in each cell are sorted and joined with ', '. The columns
        are present even when no experiment matches.
    """
    # Output column names are kept exactly as originally spelled (including
    # "acyronym") because callers may select columns by these strings.
    both_col = 'keywords with both acyronym replacement and without acyronym replacement'
    only_replacement_col = 'keywords only with acyronym replacement'
    only_default_col = 'keywords only without acyronym replacement'

    def to_set(value):
        """Turn one 'predicted' cell into a set of keywords (missing -> empty set)."""
        if pd.isna(value):
            return set()
        tokens = (token.strip() for token in str(value).split(','))
        return {token for token in tokens if token}

    def joined(keywords):
        """Render a keyword set as text; sorted so repeated runs give identical output."""
        return ', '.join(sorted(keywords))

    df_a = pd.read_csv(csv_acronym)
    df_b = pd.read_csv(csv_default)

    # Align the two runs on experiment name; the suffixes tell the two
    # 'predicted' columns apart after the merge.
    merged = pd.merge(
        df_a[['experiment name', 'predicted']],
        df_b[['experiment name', 'predicted']],
        on='experiment name',
        suffixes=('_replacement', '_default'),
    )

    results = []
    for name, predicted_a, predicted_b in zip(
        merged['experiment name'],
        merged['predicted_replacement'],
        merged['predicted_default'],
    ):
        set_a = to_set(predicted_a)
        set_b = to_set(predicted_b)
        results.append({
            'experiment name': name,
            both_col: joined(set_a & set_b),
            only_replacement_col: joined(set_a - set_b),
            only_default_col: joined(set_b - set_a),
        })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(
        results,
        columns=['experiment name', both_col, only_replacement_col, only_default_col],
    )

# Split a comma-separated value into a set (note: splits on ',', not on spaces)
def to_set(value):
    """Return the set of comma-separated pieces of `value`.

    A missing value (as judged by `pd.isna`) gives an empty set. Any other
    value is converted with `str()` and split on ','; the pieces are kept
    exactly as they appear, including any surrounding whitespace.
    """
    return set() if pd.isna(value) else set(str(value).split(','))