## Clustering Issue Report Notebook

Note: This notebook must be run outside the dev container. pyodbc throws an error when it is run inside the container.

To run it outside the container, you need this notebook and its dependent libraries (alcs_global.py, alcs_llm.py),
and you must fix the relative path below so that it references them correctly.

In [None]:
%pip install -r requirements.txt

In [None]:
# Environment variable keys
# These are the NAMES of the environment variables, not their values; the
# values are looked up with os.getenv() in later cells.
env_var_openai_key = 'NUREG_AZURE_OPENAI_SERVICE_KEY'
env_var_openai_uri = 'NUREG_AZURE_OPENAI_SERVICE_URI'
env_var_openai_model = 'NUREG_AZURE_OPENAI_CHATGPT_MODEL'
env_var_openai_embedding_model = 'SEARCH_EVAL_OPENAI_EMBEDDING_MODEL'

# Path of the pre-trained GloVe vectors file handed to load_glove_model().
# NOTE(review): relative path, so it is resolved against the working directory.
glove_model_location = "pre-trained/glove.6B.200d.txt"

# Add custom library path
# Presumably the folder holding alcs_llm.py and alcs_global.py (imported in
# the next cell); adjust this relative path when running from elsewhere.
import sys
sys.path.append('../../src/evaluation/search/helper')


In [None]:
# Import Libraries
import pyodbc, os, re, html, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from dotenv import load_dotenv
from alcs_llm import AzureOpenAIModel, AzureOpenAIService
from alcs_global import Common
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Download files necessary for nltk

# nltk.download("wordnet")
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab') 

In [None]:
# Load environment variables
load_dotenv()

# Set parameters for Azure OpenAI: service key, endpoint URI and chat model id,
# each read from the environment variable named in the configuration cell
_azure_openai_settings = {
    'open_api_key': os.getenv(env_var_openai_key),
    'open_api_uri': os.getenv(env_var_openai_uri),
    'chatgpt_model_id': os.getenv(env_var_openai_model),
}
azure_openai_model = AzureOpenAIModel(**_azure_openai_settings)

# Initialize Azure OpenAI (the service object used later to generate embeddings)
azure_openai = AzureOpenAIService(azure_openai_model=azure_openai_model)

In [None]:
def dimension_reduction(embedding, method):
    """Project ``embedding`` onto its first two principal components.

    Args:
        embedding: Feature matrix passed straight to ``PCA.fit_transform``.
        method: Name of the vectorization method. Accepted for the caller's
            convenience; it does not influence the projection.

    Returns:
        tuple: ``(x0, x1)`` -- the first and second column of the 2-D
        projection, one value per input row.
    """
    # Fixed seed keeps the projection reproducible between runs
    projected = PCA(n_components=2, random_state=42).fit_transform(embedding)

    first_component, second_component = projected[:, 0], projected[:, 1]
    return first_component, second_component


In [None]:
# -------------------- TIME RANGE --------------------
 
def get_time_period(months):
    """Return the date window that ends now and starts ``months`` months ago.

    Args:
        months: Number of calendar months to look back from the current
            local time.

    Returns:
        tuple: ``(start_date, end_date)`` as ``'YYYYMMDD'`` strings.
    """
    date_format = '%Y%m%d'
    now = pd.Timestamp.now()
    window_start = now - pd.DateOffset(months=months)
    return window_start.strftime(date_format), now.strftime(date_format)
 
# -------------------- FETCH DATASET --------------------
 
def fetch_dataset(start_date, end_date):
    """Fetch a random sample of AR records with their aggregated notes.

    Runs a single query over ``TIDARMST`` left-joined to ``TIDARCOM``. For
    every AR the ``DESCRIPTION_NOTES`` rows are stripped of non-printable
    characters and concatenated in ``GEN_ARG`` order into one ``CONTENT``
    value. Rows are returned in random order, capped at 10000.

    Args:
        start_date: Inclusive lower bound for ``ORIGINATION_DATE``, in the
            ``'YYYYMMDD'`` form produced by ``get_time_period``.
        end_date: Inclusive upper bound for ``ORIGINATION_DATE``, same form.

    Returns:
        pandas.DataFrame: Columns ``AR_NUMBER``, ``ORIGINATION_DATE``,
        ``AR_SUBJECT``, ``REPORT_TO``, ``AR_SEVERITY`` and ``CONTENT``;
        rows whose ``CONTENT`` is missing or blank are dropped.

    Raises:
        Whatever ``pyodbc.connect`` or ``pandas.read_sql`` raise; the
        connection is closed in either case.
    """
    # NOTE(review): UID/PWD are blank here -- supply credentials before running.
    conn_str = (
        f"DRIVER={{Oracle in OraClient19Home1}};"
        f"DBQ=AS9NUCRP;"
        f"UID=;"
        f"PWD=;"
    )

    # NOTE: the dates are interpolated into the SQL text, so only pass trusted,
    # internally generated values (see get_time_period).
    query = f"""
        SELECT
            A.AR_NUMBER,
            A.ORIGINATION_DATE,
            A.AR_SUBJECT,
            A.REPORT_TO,
            A.AR_SEVERITY,
            RTRIM(
                REPLACE(
                    REPLACE(
                        XMLAGG(
                            XMLELEMENT("x", REGEXP_REPLACE(T.DESCRIPTION_NOTES, '[^[:print:]]', '') || ' ')
                            ORDER BY T.GEN_ARG
                        ).GetClobVal(),
                    '<x>', ''),
                '</x>', ''), ' ') AS CONTENT
        FROM TIDARMST A
        LEFT JOIN TIDARCOM T ON T.AR_NUMBER = A.AR_NUMBER
        WHERE A.ORIGINATION_DATE BETWEEN '{start_date}' AND '{end_date}'
        GROUP BY A.AR_NUMBER, A.ORIGINATION_DATE, A.AR_SUBJECT, A.REPORT_TO, A.AR_SEVERITY
        ORDER BY DBMS_RANDOM.VALUE
        FETCH FIRST 10000 ROWS ONLY
    """

    conn = pyodbc.connect(conn_str)
    try:
        df = pd.read_sql(query, conn)
    finally:
        # Release the connection even when the query fails
        conn.close()

    # The LEFT JOIN can yield ARs without any notes. Treat a missing CONTENT
    # like an empty one so the filter does not depend on how nulls are
    # represented (None vs NaN) after .str.strip().
    has_content = df['CONTENT'].fillna('').str.strip().astype(bool)

    return df[has_content]
 

In [None]:
# Time window of the dataset: the last 12 months
start_date, end_date = get_time_period(12)

# Pull the sample for that window
df = fetch_dataset(start_date, end_date)
print(f"Fetched {len(df)} records")

# Regex handed to Common.preprocess_text: captures the text that follows
# "Description:" up to "Recommended Actions:" / "Immediate actions taken:"
# or the end of the text
pattern = r"Description:\s*(.*?)(?=\s*(Recommended Actions|Immediate actions taken):|$)"


def _clean_content(text):
    """Run the shared preprocessing on a single CONTENT value."""
    return Common.preprocess_text(text, pattern, clean_text=True)


# Clean the dataset, then discard rows whose cleaned text came out empty
df['text_cleaned'] = df['CONTENT'].apply(_clean_content)
df = df.loc[df['text_cleaned'].ne('')]


In [None]:
# Default vectorization (ada-002)

# Embedding model id comes from the environment
embedding_model = os.getenv(env_var_openai_embedding_model)

# Request embeddings for the cleaned documents
X_default = azure_openai.generate_embeddings(
    entries=df['text_cleaned'],
    embedding_model_id=embedding_model,
)

# Nothing came back at all: report it and stop
if len(X_default) == 0:
    print('\t\tError generating default embeddings')
    sys.exit("Errors!")

# Drop entries equal to the -99999.99999 marker
# (presumably what generate_embeddings returns for an entry that failed -- confirm)
X_default = list(filter(lambda vector: vector != -99999.99999, X_default))


In [None]:
# TF-IDF Vectorization

# Sub-linear term frequency; terms present in fewer than 5 documents or in
# more than 95% of the documents are ignored
vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, sublinear_tf=True)
tfidf_sparse = vectorizer.fit_transform(df['text_cleaned'])

# Convert the sparse result into a dense array
X_TFIDF = tfidf_sparse.toarray()

In [None]:
# Glove Vectorization

def load_glove_model(File):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector. Blank or whitespace-only lines (for example a
    trailing empty line at the end of the file) are skipped.

    Args:
        File: Path of the GloVe text file; it is opened as UTF-8.

    Returns:
        dict: Maps each word to a 1-D ``numpy.ndarray`` of ``float64``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    glove_model = {}
    with open(File, 'r', encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Nothing to parse on an empty line
                continue
            word, *components = split_line
            glove_model[word] = np.array(components, dtype=np.float64)
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Load glove embeddings
# (reads the whole file at glove_model_location into an in-memory dict)
glove_embeddings = load_glove_model(glove_model_location)

# Set the maximum sentence length and embedding dimension
# max_length: number of leading words of a document that sentence_embedding() encodes.
# embedding_dim: length of a single word vector; it has to match the vectors
# stored in glove_embeddings (presumably 200 for the "200d" file -- confirm).
max_length = 200 
embedding_dim = 200

# Convert a sentence into a fixed-size vector built from GloVe word vectors
def sentence_embedding(sentence):
    """Encode ``sentence`` as a flat vector of ``max_length * embedding_dim`` values.

    The sentence is split on whitespace and only the first ``max_length``
    words are considered. Word ``i`` fills row ``i`` of a
    ``(max_length, embedding_dim)`` grid with its GloVe vector; words missing
    from ``glove_embeddings`` and unused trailing rows stay all-zero.

    Args:
        sentence: Text to encode.

    Returns:
        numpy.ndarray: The grid flattened to one dimension.
    """
    grid = np.zeros((max_length, embedding_dim))

    for position, token in enumerate(sentence.split()[:max_length]):
        vector = glove_embeddings.get(token)
        if vector is not None:
            grid[position] = vector

    return grid.flatten()

# Encode every cleaned document, then stack the per-document vectors into a
# single 2-D matrix (one row per document)
X_encode_glove = df['text_cleaned'].apply(sentence_embedding)
X_glove = np.vstack(list(X_encode_glove))

In [None]:
# Sweep the number of clusters for every vectorization and record, per run,
# the labels, a 2-D PCA projection for plotting, the inertia (SSE) and the
# silhouette score.
cluster_metric = []
kmax = 10
skip = 1

embeddings_by_method = [(X_default, 'Default'), (X_TFIDF, 'tfidf'), (X_glove, 'glove')]

# The PCA projection depends only on the embedding (fixed random_state), not
# on k, so compute it once per method instead of once per (k, method) pair.
projections = {
    method: dimension_reduction(embedding, method)
    for embedding, method in embeddings_by_method
}

# NOTE: range() excludes kmax itself, so k runs from 2 to kmax - 1
for k in range(2, kmax, skip):
    for embedding, method in embeddings_by_method:

        # Initialize kmeans with k centroids
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)

        # fit the model
        kmeans.fit(embedding)

        # store cluster labels in a variable
        clusters = kmeans.labels_

        pca0, pca1 = projections[method]

        cluster_metric.append({'method': method,
                               'k': k,
                               'clusters': clusters,
                               'pca0': pca0,
                               'pca1': pca1,
                               'sse': kmeans.inertia_,
                               'sil': silhouette_score(embedding, clusters, metric='euclidean')})


In [None]:
# One row per (k, method) run collected by the clustering sweep
df_metric = pd.DataFrame(cluster_metric)

# Grid dimensions for the plots: one row per distinct k, one column per method
row_cnt = df_metric['k'].nunique()
col_cnt = df_metric['method'].nunique()

### Plot cluster graph with each K

In [None]:
# Create a grid of scatter plots: one row per k, one column per method.
# figsize is (width, height), so the width scales with the number of columns
# and the height with the number of rows. squeeze=False keeps `axes` a 2-D
# array even for a 1x1 grid, so flatten() always works.
fig, axes = plt.subplots(row_cnt, col_cnt, figsize=(col_cnt * 5, row_cnt * 5), squeeze=False)

# Flatten row-major so subplot i lines up with row i of df_metric
# (df_metric is ordered by k first, then by method)
axes = axes.flatten()

# Plot scatterplots in each subplot
for i, ax in enumerate(axes):
    data = pd.DataFrame({
        'x': df_metric['pca0'][i],
        'y1': df_metric['pca1'][i],
        'cluster': df_metric['clusters'][i]
    })

    # Create scatter plot, coloured by cluster label
    sns.scatterplot(data=data, x='x', y='y1', hue='cluster', palette="viridis", ax=ax, legend=False)

    ax.set_title(f"K={df_metric['k'][i]} clustering with {df_metric['method'][i]}", fontdict={"fontsize": 10})
    ax.set_xlabel("x0")
    # Y-axis label only on the first subplot of each row
    ax.set_ylabel("x1" if i % col_cnt == 0 else '')

# plt.tight_layout()
plt.show()


### Plot cluster metric

In [None]:
# One row of panels per vectorization method:
# left = within-cluster sum of squares (inertia), right = silhouette score.

categories = df_metric['method'].unique()
len_c = len(categories)

# squeeze=False keeps `axes` two-dimensional, so axes[i, j] also works when
# there is only a single method
fig, axes = plt.subplots(len_c, 2, figsize=(len_c * 5, 10), sharex=True, squeeze=False)

# (df_metric column, y-axis label) for the left and the right panel
panels = (('sse', 'WCSS'), ('sil', 'SIL'))

for i, category in enumerate(categories):
    subset = df_metric[df_metric['method'] == category]

    for j, (metric, ylabel) in enumerate(panels):
        ax = axes[i, j]
        ax.plot(subset['k'], subset[metric], marker='o', label=f'Method {category}')
        ax.set_title(f'Method -> {category}')
        if i + 1 == len_c:
            # The x axis is shared, so only the bottom row is labelled
            ax.set_xlabel('Number of Clusters')
        ax.set_xticks(subset['k'])  # One tick per evaluated k
        ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()
