Goal: Create a terminology dataset from the NIST glossary.

Glossary location: https://csrc.nist.gov/csrc/media/glossary/glossary-export.zip

In [226]:
import requests
from io import BytesIO
import os
import zipfile

def download_and_extract_zip_from_url(url: str, output_directory: str):
    """
    Downloads a ZIP file from a given URL and extracts it to a specified output directory.

    The download only happens when `output_directory` does not exist yet. When the
    directory is already present it is treated as a previous extraction and the
    files found in it are reported instead, so re-running the notebook still
    yields a list rather than None.

    Parameters:
    - url: The URL to download the ZIP file from.
    - output_directory: The directory where the ZIP file will be extracted to.

    Returns:
    - A list of filenames that were extracted. If the directory already existed,
      the sorted relative paths (with '/' separators) of the files it contains.
    - None if the download or the extraction failed (a message is printed).
    """
    if os.path.exists(output_directory):
        # Nothing to download: report what is already on disk.
        existing_files = []
        for root, _dirs, filenames in os.walk(output_directory):
            for filename in filenames:
                relative_path = os.path.relpath(os.path.join(root, filename), output_directory)
                # ZIP member names use forward slashes; mirror that on every platform.
                existing_files.append(relative_path.replace(os.sep, '/'))
        return sorted(existing_files)

    try:
        # Download the ZIP file; the timeout keeps a stalled server from hanging the notebook.
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            print(f"Failed to download the ZIP file, status code: {response.status_code}")
            return None

        # Use BytesIO to open the in-memory ZIP file
        with zipfile.ZipFile(BytesIO(response.content), 'r') as zip_ref:
            # Extract all the contents into the output directory
            zip_ref.extractall(output_directory)
            return zip_ref.namelist()

    except Exception as e:
        # Best effort by design: report the problem and let the caller see None.
        print(f"An error occurred: {e}")
        return None


# Where the NIST glossary export is published, and where it is unpacked locally.
url = 'https://csrc.nist.gov/csrc/media/glossary/glossary-export.zip'
output_directory = './glossary_extracted/'  # Change to your desired output directory

# Fetch and unpack the archive, then show what the helper reported.
extracted_files = download_and_extract_zip_from_url(url=url, output_directory=output_directory)
extracted_files

Load the JSON data from the downloaded glossary file

In [227]:
# Import the necessary modules
import json
import pandas as pd
from pandas import json_normalize

# Parse the extracted glossary export. The "utf-8-sig" codec strips a leading
# byte-order mark, if the file has one, before the text reaches the JSON parser.
with open('./glossary_extracted/glossary-export.json', 'r', encoding="utf-8-sig") as glossary_file:
    json_data = json.loads(glossary_file.read())


Preprocess Data

In [228]:
# Build one row per entry of 'parentTerms', then discard the 'note' and
# 'seeAlso' columns, which are not used by the rest of the notebook.
flattened_parent_terms_df = (
    json_normalize(json_data, record_path='parentTerms', sep='_')
    .drop(columns=['note', 'seeAlso'])
)

# Preview the parent-term rows
flattened_parent_terms_df.head()

Unnamed: 0,term,link,abbrSyn,definitions
0,(EC)DH,https://csrc.nist.gov/glossary/term/_ec_dh,[{'text': '(Elliptic Curve) Diffie-Hellman'}],
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,,"[{'text': 'For a given set of n variables, (p,..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,,[{'text': 'For a given test set that provides ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,"[{'text': 'Comma-Separated Value', 'link': 'ht...",
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,,"[{'text': 'In this Recommendation, the curly b..."


In [229]:
# Explode the nested 'abbrSyn' and 'definitions' lists into one row per entry.
# Every output row carries the term and its link plus exactly one of:
#   - an abbreviation/synonym text ('definitions' left as None), or
#   - a definition text ('abbrSyn' left as None).
# A term that yields no entry at all still gets a single placeholder row so
# that it is never dropped from the dataset.
new_rows = []

for term, link, abbr_list, def_list in flattened_parent_terms_df[
    ['term', 'link', 'abbrSyn', 'definitions']
].itertuples(index=False, name=None):
    rows_before = len(new_rows)

    # Flatten 'abbrSyn'; anything that is not a list (e.g. a missing value) is skipped
    if isinstance(abbr_list, list):
        for abbr in abbr_list:
            new_rows.append({
                'term': term,
                'link': link,
                'abbrSyn': abbr.get('text', None),
                'definitions': None,
            })

    # Flatten 'definitions' the same way
    if isinstance(def_list, list):
        for definition in def_list:
            new_rows.append({
                'term': term,
                'link': link,
                'abbrSyn': None,
                'definitions': definition.get('text', None),
            })

    # Nothing was emitted for this term: both values were missing/non-lists, or
    # the lists were empty. Keep the term with empty 'abbrSyn' and 'definitions'.
    if len(new_rows) == rows_before:
        new_rows.append({
            'term': term,
            'link': link,
            'abbrSyn': None,
            'definitions': None,
        })

# Create a new DataFrame from the list of new rows; the explicit column list
# keeps the schema stable even if no rows were produced.
fully_flattened_df_refactored = pd.DataFrame(
    new_rows, columns=['term', 'link', 'abbrSyn', 'definitions']
)

# Show fully flattened DataFrame
fully_flattened_df_refactored


Unnamed: 0,term,link,abbrSyn,definitions
0,(EC)DH,https://csrc.nist.gov/glossary/term/_ec_dh,(Elliptic Curve) Diffie-Hellman,
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,,"For a given set of n variables, (p, t)-complet..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,,For a given test set that provides 100% t-way ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,Comma-Separated Value,
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,,"In this Recommendation, the curly braces { } a..."
...,...,...,...,...
15317,ZT,https://csrc.nist.gov/glossary/term/zt,,A collection of concepts and ideas designed to...
15318,ZTA,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,
15319,ZTA,https://csrc.nist.gov/glossary/term/zta,Zero Trust Architecture,
15320,ZTA,https://csrc.nist.gov/glossary/term/zta,,"A security model, a set of system design princ..."


In [230]:
def _lower_and_strip(value):
    """Lower-case and trim a string; return any non-string value unchanged."""
    return value.lower().strip() if isinstance(value, str) else value


# Normalize every string cell to lowercase without surrounding whitespace.
# The function is mapped column by column with Series.map rather than through
# DataFrame.applymap, which is deprecated in recent pandas releases; the
# element-wise result is the same.
fully_flattened_df_refactored = fully_flattened_df_refactored.apply(
    lambda column: column.map(_lower_and_strip)
)

fully_flattened_df_refactored.head()

Unnamed: 0,term,link,abbrSyn,definitions
0,(ec)dh,https://csrc.nist.gov/glossary/term/_ec_dh,(elliptic curve) diffie-hellman,
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,,"for a given set of n variables, (p, t)-complet..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,,for a given test set that provides 100% t-way ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,comma-separated value,
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,,"in this recommendation, the curly braces { } a..."


Analyze the data for null values, then fill them

In [231]:
# Count the missing values in each column
fully_flattened_df_refactored.isna().sum()

term              0
link              0
abbrSyn        7634
definitions    7689
dtype: int64

In [232]:
# Within each term, propagate known values into the gaps: forward-fill first,
# then backward-fill whatever is still missing at the start of the group.
# The built-in group-wise ffill/bfill replace a per-group Python lambda passed
# to groupby.apply; they keep the original row order and never touch the
# grouping column itself.
terms = fully_flattened_df_refactored['term']
value_columns = fully_flattened_df_refactored.columns.drop('term')

filled_values = (
    fully_flattened_df_refactored[value_columns]
    .groupby(terms).ffill()
    .groupby(terms).bfill()
)

# Re-attach 'term', restore the original column order and renumber the rows.
grouped_df = (
    filled_values
    .assign(term=terms)
    .loc[:, fully_flattened_df_refactored.columns]
    .reset_index(drop=True)
)

grouped_df


Unnamed: 0,term,link,abbrSyn,definitions
0,(ec)dh,https://csrc.nist.gov/glossary/term/_ec_dh,(elliptic curve) diffie-hellman,
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,,"for a given set of n variables, (p, t)-complet..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,,for a given test set that provides 100% t-way ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,comma-separated value,
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,,"in this recommendation, the curly braces { } a..."
...,...,...,...,...
15317,zt,https://csrc.nist.gov/glossary/term/zt,zero trust,a collection of concepts and ideas designed to...
15318,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15319,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15320,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."


In [233]:
# Count the missing values that remain after the group-wise fill
grouped_df.isna().sum()

term              0
link              0
abbrSyn        4287
definitions    5886
dtype: int64

In [234]:
# Wherever 'abbrSyn' is still missing, fall back to the final path segment of
# the term's 'link'; existing values are kept as they are.
grouped_df['abbrSyn'] = [
    link.split('/')[-1] if pd.isna(abbr) else abbr
    for link, abbr in zip(grouped_df['link'], grouped_df['abbrSyn'])
]

# Display the DataFrame after filling 'abbrSyn'
grouped_df

Unnamed: 0,term,link,abbrSyn,definitions
0,(ec)dh,https://csrc.nist.gov/glossary/term/_ec_dh,(elliptic curve) diffie-hellman,
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,p_t_completeness,"for a given set of n variables, (p, t)-complet..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,t_k_way_combination_coverage,for a given test set that provides 100% t-way ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,comma-separated value,
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,curly_brace,"in this recommendation, the curly braces { } a..."
...,...,...,...,...
15317,zt,https://csrc.nist.gov/glossary/term/zt,zero trust,a collection of concepts and ideas designed to...
15318,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15319,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15320,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."


In [235]:
# Count the missing values left once 'abbrSyn' has been filled
grouped_df.isna().sum()

term              0
link              0
abbrSyn           0
definitions    5886
dtype: int64

Fill missing 'definitions' values with a placeholder built from 'abbrSyn' and the last segment of 'link'

In [236]:
# Rows without a definition receive a placeholder of the form
# "<abbrSyn>: <last segment of link, underscores replaced by spaces>";
# rows that already have a definition are left unchanged.
grouped_df['definitions'] = [
    f"{abbr}: {link.split('/')[-1].replace('_', ' ')}" if pd.isna(definition) else definition
    for abbr, link, definition in zip(
        grouped_df['abbrSyn'], grouped_df['link'], grouped_df['definitions']
    )
]

# Display the DataFrame after updating 'definitions'
grouped_df

Unnamed: 0,term,link,abbrSyn,definitions
0,(ec)dh,https://csrc.nist.gov/glossary/term/_ec_dh,(elliptic curve) diffie-hellman,(elliptic curve) diffie-hellman: ec dh
1,"(p, t)-completeness",https://csrc.nist.gov/glossary/term/p_t_comple...,p_t_completeness,"for a given set of n variables, (p, t)-complet..."
2,(t + k)-way combination coverage,https://csrc.nist.gov/glossary/term/t_k_way_co...,t_k_way_combination_coverage,for a given test set that provides 100% t-way ...
3,.csv,https://csrc.nist.gov/glossary/term/_csv,comma-separated value,comma-separated value: csv
4,{ },https://csrc.nist.gov/glossary/term/curly_brace,curly_brace,"in this recommendation, the curly braces { } a..."
...,...,...,...,...
15317,zt,https://csrc.nist.gov/glossary/term/zt,zero trust,a collection of concepts and ideas designed to...
15318,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15319,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."
15320,zta,https://csrc.nist.gov/glossary/term/zta,zero trust architecture,"a security model, a set of system design princ..."


In [237]:
# Count the missing values left once 'definitions' has been filled
grouped_df.isna().sum()

term           0
link           0
abbrSyn        0
definitions    0
dtype: int64

In [238]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15322 entries, 0 to 15321
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   term         15322 non-null  object
 1   link         15322 non-null  object
 2   abbrSyn      15322 non-null  object
 3   definitions  15322 non-null  object
dtypes: object(4)
memory usage: 478.9+ KB


In [242]:
# Remove rows that repeat an earlier row in all four columns, keeping the
# first occurrence of each.
dedup_columns = ['term', 'link', 'abbrSyn', 'definitions']
grouped_df = grouped_df.drop_duplicates(subset=dedup_columns, keep='first')

# Summarize the DataFrame after dropping duplicates
grouped_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 13873 entries, 0 to 15321
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   term         13873 non-null  object
 1   link         13873 non-null  object
 2   abbrSyn      13873 non-null  object
 3   definitions  13873 non-null  object
dtypes: object(4)
memory usage: 541.9+ KB


In [243]:
# Write the processed glossary to CSV. The row index is left out on purpose:
# after dropping duplicates it is a gapped leftover of the intermediate frame
# and would otherwise be written as an extra, unnamed first column.
grouped_df.to_csv('./glossary_extracted/glossary-export-processed.csv', index=False)