
# AIC Chat Topic Modelling

Purpose: To identify topics based on AIC chat transcripts, and then classify each conversation by topic.

Compute Cluster: fs-cluster-m

Created: 1/5/2024

Current status: In progress (16/5/2025)


### s001 Libraries

In [0]:
# Install libraries
%pip install bertopic==0.16.0 \
  transformers==4.34.0 \
  sentence-transformers==3.0.0 \
  datasets==2.14.5 \
  huggingface-hub==0.17.3 \
  einops

In [0]:
# Import libraries
import os
import re
import umap.umap_ as umap
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from collections import Counter
from hdbscan import HDBSCAN
from datasets import Dataset
from bertopic import BERTopic
from pyspark.sql.functions import lit
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Set environment variables (all three are written in a single update call).
# NOTE(review): these are set after the libraries above have been imported; any
# library that reads such a flag at import time would not see it - TODO confirm
# they take effect as intended.
os.environ.update({
    'SENTENCE_TRANSFORMERS_OFFLINE': '1',
    'DATASETS_OFFLINE': '1',
    'TOKENIZERS_PARALLELISM': 'false',
})

### s002 Snowflake Connector

In [0]:
# Fetch the service user's private key from the Databricks secret scope
# (the scope is backed by Azure Key Vault).
password = dbutils.secrets.get(scope="auea-kv-sbx-dxdtlprdct01", key="sfdbrsdskey")

# Snowflake connection options; the same dict is reused by the write-back cells.
# The secret stores line breaks as the two characters "\n", so they are turned
# back into real newlines for the PEM key.
sf_options = dict(
    sfURL="vodafonenz_prod.australia-east.azure.snowflakecomputing.com",
    sfUser="SVC_LAB_DS_DATABRICKS",
    pem_private_key=password.replace('\\n', '\n'),
    sfDatabase="LAB_ML_STORE",
    sfSchema="SANDBOX",
    sfWarehouse="LAB_DS_WH_SCALE",
)

# Read the chat-content table into a Spark DataFrame
raw_df = (
    spark.read
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "AIC_CHAT_CONTENT_20250611")
    .load()
)

# Pull the whole table onto the driver as a pandas DataFrame
raw_pd_df = raw_df.toPandas()

In [0]:
# Preview the first 100 rows of the Spark DataFrame
display(raw_df.limit(100))

### s003 Data Preprocessing

In [0]:
# Keep relevant columns
df2 = raw_pd_df[['CONTACTID', 'CONTENT', 'INPUTID', 'ROLE', 'AGT_AFTERCONTACTWORKENDTIMESTAMP']]

# Remove rows starting with '(concierge)'.
# na=False treats null CONTENT as "does not start with", so the mask stays
# purely boolean; without it the mask holds missing values and the `~`
# inversion can fail. Null rows are still dropped further down.
# .copy() makes df2 an independent frame so the column assignments below do not
# write into a slice of raw_pd_df.
df2 = df2[~df2['CONTENT'].str.startswith('(concierge)', na=False)].copy()

# Remove a leading "(...)" prefix from content
df2['CONTENT'] = df2['CONTENT'].str.replace(r'^\([^\)]+\)\s*', '', regex=True)

# Add formatted content column with '[a]', '[u]' prefixes
# ('[u]' for ROLE == 'user', '[a]' for every other role)
df2['FORMATTED_CONTENT'] = df2.apply(lambda row: f"[u]{row['CONTENT']}" if row['ROLE'] == 'user' else f"[a]{row['CONTENT']}", axis=1)

# Clean 'FORMATTED_CONTENT' column to remove prefixes like (rag)
# NOTE(review): this pattern is not anchored, so it removes every parenthesised
# span in the text, not only a leading one - confirm that is intended.
df2['FORMATTED_CONTENT'] = df2['FORMATTED_CONTENT'].str.replace(r'\([^\)]+\)\s*', '', regex=True)

# Split into user and non-user dataframes and drop duplicate inputids
# (only user rows are de-duplicated; the first row per INPUTID is kept)
user_df = df2[df2['ROLE'] == 'user'].drop_duplicates(subset='INPUTID', keep='first')
non_user_df = df2[df2['ROLE'] != 'user']

# Recombine and restore the original row order
df2 = pd.concat([user_df, non_user_df]).sort_index()

# Drop rows from dataframe where 'CONTENT' is null or NA
df2 = df2.dropna(subset=['CONTENT'])
df2 = df2[~df2['CONTENT'].isin(['NA'])]

# Sort so that user text is before assistant text
# (ROLE is sorted descending, which puts 'user' ahead of 'assistant')
df2 = df2.sort_values(by=['CONTACTID', 'AGT_AFTERCONTACTWORKENDTIMESTAMP', 'ROLE'],
                      ascending=[True, True, False])

# Drop unnecessary columns
df2 = df2.drop(columns=['INPUTID', 'AGT_AFTERCONTACTWORKENDTIMESTAMP'])

# Group full and user content by contactid (messages joined with a single space)
content_by_contactid = df2.groupby('CONTACTID')['CONTENT'].apply(lambda x: ' '.join(x)).reset_index(name='ALL_CONTENT')
user_content_by_contactid = df2[df2['ROLE'] == 'user'].groupby('CONTACTID')['CONTENT'].apply(lambda x: ' '.join(x)).reset_index(name='USER_CONTENT')

# Group formatted content by contactid
formatted_content_by_contactid = df2.groupby('CONTACTID')['FORMATTED_CONTENT'].apply(lambda x: ' '.join(x)).reset_index(name='FORMATTED_CONTENT')

# Merge outputs: one row per CONTACTID. The left joins keep every contact that
# has any content, so USER_CONTENT is null for contacts without user messages.
merged_df = pd.merge(content_by_contactid, user_content_by_contactid, on='CONTACTID', how='left')
merged_df = pd.merge(merged_df, formatted_content_by_contactid, on='CONTACTID', how='left')

# Show dataframe
display(merged_df)

### s004 Embeddings

In [0]:
# Load the sentence-embedding model from its saved copy on DBFS
checkpoint = '/dbfs/FileStore/tables/ms/saved_mpnet_model/'
embedding_model = SentenceTransformer(model_name_or_path=checkpoint, trust_remote_code=True)

In [0]:
# Convert user-only text and all text conversations to lists.
# USER_CONTENT comes from a left merge, so contacts without any user message
# hold a null there; replace it with an empty string so every document handed
# to the encoder / topic model is a str. Row order still matches merged_df.
user_documents_cleaned = merged_df['USER_CONTENT'].fillna('').tolist()
all_documents_cleaned = merged_df['ALL_CONTENT'].tolist()

In [0]:
# Embed the user-only text of every conversation (one vector per document)
user_embeddings = embedding_model.encode(sentences=user_documents_cleaned, show_progress_bar=True)

### s005 Topic Modelling

In [0]:
# Level 1 topics

# Dimensionality reduction: project the embeddings to 3 components.
# NOTE(review): no `metric` is passed here, whereas the UMAP built inside
# run_subtopic_model passes metric='cosine' - confirm the difference is intended.
low_umap_model = umap.UMAP(n_components=3, n_neighbors=10, min_dist=0.5, random_state=42)

# Clustering of the reduced embeddings; prediction_data=True is requested so the
# fitted clusterer keeps the data needed for later predictions.
low_hdbscan_model = HDBSCAN(
    min_cluster_size=8,
    min_samples=2,
    cluster_selection_method='eom',
    prediction_data=True,
)

# Vectoriser used for the topic keywords; English stop words are excluded
vectorizer_model = CountVectorizer(stop_words="english")

# Maximal Marginal Relevance reduces redundancy in the topic keywords
representation_model = MaximalMarginalRelevance(diversity=0.2)

# Assemble the Level 1 topic model from the components above
low_topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=low_umap_model,
    hdbscan_model=low_hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    verbose=True,
)

In [0]:
# Fit the level 1 model on the user text and print its topics

low_topics, low_probs = low_topic_model.fit_transform(user_documents_cleaned, user_embeddings)

# Show the topic overview table (a bare expression in the middle of a cell is
# not rendered, so it is displayed explicitly)
display(low_topic_model.get_topic_info())

# Iterate over the topic ids the model actually holds. Counting with
# range(len(...)) is off by one whenever the outlier topic -1 exists, because
# -1 is included in the count but is not a valid position.
for topic, keywords in low_topic_model.get_topics().items():
    if topic == -1:
        continue  # Skip other topic
    print(f"Topic {topic}: {keywords}\n")

In [0]:
def run_subtopic_model(docs, embeddings, level=2):
    """Fit a BERTopic model on one topic's documents and print its sub-topics.

    For every non-outlier topic found, the document count and keywords are
    printed; while ``level`` is below 3, topics with at least 5 documents are
    modelled again one level deeper.

    Args:
        docs: list of document strings belonging to the parent topic.
        embeddings: array with one embedding row per entry of ``docs``.
        level: hierarchy depth being modelled; only 2 and 3 have parameters
            defined (any other value raises KeyError on the lookups below).

    Returns:
        None. Results are written to stdout.
    """
    # Per-level hyperparameters, keyed by hierarchy level
    umap_dims = {2: 5, 3: 3}
    n_neighbors = {2: 10, 3: 7}
    min_cluster_size = {2: 3, 3: 2}
    diversity = {2: 0.2, 3: 0.3}

    # Setup model
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap.UMAP(
            n_components=umap_dims[level],
            n_neighbors=n_neighbors[level],
            min_dist=0.5,
            metric='cosine',
            random_state=42
        ),
        hdbscan_model=HDBSCAN(
            min_samples=2,
            min_cluster_size=min_cluster_size[level],
            cluster_selection_method='eom',
            prediction_data=True
        ),
        vectorizer_model=CountVectorizer(stop_words='english'),
        representation_model=MaximalMarginalRelevance(diversity=diversity[level]),
        verbose=False
    )

    topics, _ = topic_model.fit_transform(docs, embeddings)

    # Arrays allow selecting a topic's members with one vectorised comparison
    # instead of re-scanning the topic list in Python for every topic.
    topic_ids = np.asarray(topics)
    embedding_matrix = np.asarray(embeddings)

    # Output formatting depends only on the level, so build it once
    indent = "  " * (level - 1)
    heading = f"Sub{'-sub' * (level - 2)}topic"

    for topic_num in topic_model.get_topic_info().Topic:
        if topic_num == -1:
            continue  # Skip other topic

        member_idx = np.flatnonzero(topic_ids == topic_num)
        topic_docs = [docs[i] for i in member_idx]
        topic_embeddings = embedding_matrix[member_idx]

        print(f"{indent}{heading} {topic_num} ({len(topic_docs)} documents):")
        print(f"{indent}  Keywords: {topic_model.get_topic(topic_num)}\n")

        # Recursive subtopic modeling
        if level < 3 and len(topic_docs) >= 5:
            run_subtopic_model(topic_docs, topic_embeddings, level + 1)


# Walk every level 1 topic and model its sub-topics (levels 2 and 3)
level1_assignments = np.asarray(low_topics)

for topic_num in low_topic_model.get_topic_info().Topic:
    if topic_num == -1:
        continue  # Skip other topic

    # Documents and embeddings assigned to this level 1 topic
    docs_idx = np.flatnonzero(level1_assignments == topic_num)
    docs_subset = [user_documents_cleaned[i] for i in docs_idx]
    embeddings_subset = np.array([user_embeddings[i] for i in docs_idx])

    print(f"\n=== Main Topic {topic_num} ({len(docs_subset)} documents) ===")
    print(f"Keywords: {low_topic_model.get_topic(topic_num)}\n")

    if len(docs_subset) >= 5:
        # Start subtopic modeling from level 2
        run_subtopic_model(docs_subset, embeddings_subset, level=2)
    else:
        print(f"Skipping topic {topic_num}, too few docs.\n")


### s006 Topic Classification

In [0]:
# Define topics
# Candidate labels for the three levels of the topic taxonomy. Each list is
# passed as the zero-shot topic list of one BERTopic model and is also the pool
# the random-guess baseline samples from.

# Level 1: broadest categories
level_1_topics = [
    "Connectivity and Technical Support",
    "Roaming and International",
    "Billing and Payments",
    "Account Management and Balance",
    "Other",
    "Device Purchases and Inquiries",
    "Plans and Services"
]

# Level 2: sub-categories
level_2_topics = [
    "Network & Service Connectivity",
    "Roaming Details and Inquiries",
    "Payment Processing Issues",
    "Billing and Invoice Queries",
    "Account Changes",
    "Other",
    "Account Balance and Top-ups",
    "Device Buying & Upgrading",
    "Account Access",
    "Plan Inquiries and Information",
    "Plan Changes and Switching",
    "Roaming Technical Issues and Support",
    "Account & App Access Problems",
    "eSIM and Device setup",
    "Billing Disputes"
]


# Level 3: most specific categories
level_3_topics = [
    "Messaging Issues",
    "Roaming Charges and Add-ons",
    "Top-Up and Balance Issues",
    "Roaming Usage and Coverage",
    "Network Registration and Service Access Issues",
    "Bill Amount and Payment Issues",
    "Plan or Account Modifications",
    "Other",
    "Top-up Not Applied",
    "Device Purchase and Availability",
    "Unexpected Balance Change",
    "PIN & Security Code Recovery",
    "Bill Access and Documentation",
    "Plan Pricing and Details",
    "Prepaid/Monthly Plan Changes",
    "Roaming Technical Support and Troubleshooting",
    "Social Media App Access Problems",
    "eSIM Activation and Transfer",
    "Plan Usage and Eligibility",
    "Incorrect Charges and Billing Errors",
    "Duplicate or Missing Charges",
    "Address or Number Update",
    "eSIM Purchase and Setup",
    "Refunds and Credits Issues",
    "Broadband/WiFi and Other Add-ons",
    "Upgrades and Trade-ins"
]

In [0]:
# Create models

def create_bertopic_model(embedding_model, topic_list, zeroshot_min_similarity=0.1, top_n_words=10, verbose=True):
    """Build an unfitted zero-shot BERTopic model for one taxonomy level.

    Args:
        embedding_model: embedding model handed to BERTopic.
        topic_list: candidate topic labels used as the zero-shot topic list.
        zeroshot_min_similarity: value forwarded to BERTopic's
            ``zeroshot_min_similarity`` argument.
        top_n_words: number of keywords kept per topic.
        verbose: whether BERTopic logs its progress.

    Returns:
        The configured BERTopic instance.
    """
    model_kwargs = {
        'embedding_model': embedding_model,
        'zeroshot_topic_list': topic_list,
        'zeroshot_min_similarity': zeroshot_min_similarity,
        'top_n_words': top_n_words,
        'verbose': verbose,
    }
    return BERTopic(**model_kwargs)

# One zero-shot model per taxonomy level, all sharing the same embedding model
level_1_model, level_2_model, level_3_model = (
    create_bertopic_model(embedding_model, topic_list)
    for topic_list in (level_1_topics, level_2_topics, level_3_topics)
)

In [0]:
def encode_texts(embedding_model, texts, show_progress_bar=True):
    """Return the embeddings ``embedding_model.encode`` produces for ``texts``.

    Args:
        embedding_model: object exposing ``encode(texts, show_progress_bar=...)``.
        texts: documents to embed.
        show_progress_bar: forwarded unchanged to ``encode``.
    """
    encoded = embedding_model.encode(texts, show_progress_bar=show_progress_bar)
    return encoded

# Embed the full conversations (user and assistant text) once for reuse
all_documents_embeddings = encode_texts(embedding_model, texts=all_documents_cleaned)

In [0]:
# Apply topic classification

def fit_transform_model(model, documents, embeddings):
    """Fit ``model`` on the documents and return what its ``fit_transform`` returns.

    Args:
        model: object exposing ``fit_transform(documents, embeddings)``.
        documents: documents to fit on.
        embeddings: precomputed embeddings passed alongside the documents.
    """
    fitted_output = model.fit_transform(documents, embeddings)
    return fitted_output

# Fit each level's model on the full conversations; only the topic assignments
# are kept, the second return value is discarded.
level_1_topics_all, _ = level_1_model.fit_transform(all_documents_cleaned, all_documents_embeddings)
level_2_topics_all, _ = level_2_model.fit_transform(all_documents_cleaned, all_documents_embeddings)
level_3_topics_all, _ = level_3_model.fit_transform(all_documents_cleaned, all_documents_embeddings)


In [0]:
# Topic overview tables of the fitted models
level1_info_df = level_1_model.get_topic_info()
level2_info_df = level_2_model.get_topic_info()
level3_info_df = level_3_model.get_topic_info()

# Set-up topic name mappings (topic number -> topic name)
level1_topic_map = {topic: name for topic, name in zip(level1_info_df['Topic'], level1_info_df['Name'])}
level2_topic_map = {topic: name for topic, name in zip(level2_info_df['Topic'], level2_info_df['Name'])}
level3_topic_map = {topic: name for topic, name in zip(level3_info_df['Topic'], level3_info_df['Name'])}

In [0]:
# Get level 1,2,3 topics and confidence scores
# The embeddings computed earlier are passed in so the corpus is not re-encoded
# once per level.

level_1_topics1, level_1_scores = level_1_model.transform(all_documents_cleaned, embeddings=all_documents_embeddings)
level_2_topics2, level_2_scores = level_2_model.transform(all_documents_cleaned, embeddings=all_documents_embeddings)
level_3_topics3, level_3_scores = level_3_model.transform(all_documents_cleaned, embeddings=all_documents_embeddings)

### s007 Evaluate and export zero-shot results

In [0]:
# Export CSV

def map_topic_names(topic_numbers, topic_map):
    """Translate topic numbers into names via ``topic_map``.

    Numbers that are missing from the map become the string 'No Topic'.
    Returns a list in the same order as ``topic_numbers``.
    """
    names = []
    for number in topic_numbers:
        names.append(topic_map.get(number, 'No Topic'))
    return names

# One row per conversation: id, formatted transcript, then number / confidence /
# name of the assigned topic at each level.
df_final = pd.DataFrame({
    'CONTACTID': merged_df['CONTACTID'],
    'CONTENT': merged_df['FORMATTED_CONTENT'],
})

df_final['LEVEL1_TOPIC_NUMBER'] = level_1_topics1
df_final['LEVEL1_CONFIDENCE_SCORE'] = level_1_scores
df_final['LEVEL1_TOPIC_NAME'] = map_topic_names(level_1_topics1, level1_topic_map)

df_final['LEVEL2_TOPIC_NUMBER'] = level_2_topics2
df_final['LEVEL2_CONFIDENCE_SCORE'] = level_2_scores
df_final['LEVEL2_TOPIC_NAME'] = map_topic_names(level_2_topics2, level2_topic_map)

df_final['LEVEL3_TOPIC_NUMBER'] = level_3_topics3
df_final['LEVEL3_CONFIDENCE_SCORE'] = level_3_scores
df_final['LEVEL3_TOPIC_NAME'] = map_topic_names(level_3_topics3, level3_topic_map)

# Write the assignments to a CSV in the working directory
df_final.to_csv("topic_assignment_export8.csv", index=False)
print("Saved final CSV")

In [0]:
# Load the predicted and gold CSVs.
# The predictions are read from the file the export cell writes
# (topic_assignment_export8.csv) so the scores reflect the current run rather
# than an older export.
ml_df = pd.read_csv("topic_assignment_export8.csv")
human_df = pd.read_csv("/dbfs/FileStore/tables/ms/newnewbook3c.csv")

# Merge them on CONTACTID (inner join: only contacts present in both files).
# NOTE: this rebinds `merged_df`, replacing the preprocessing output of the
# same name for every cell that runs afterwards.
merged_df = pd.merge(ml_df, human_df, on="CONTACTID")

# Rename columns for better comparison
merged_df = merged_df.rename(columns={
    "LEVEL1_TOPIC_NAME": "ML_L1",
    "LEVEL2_TOPIC_NAME": "ML_L2",
    "LEVEL3_TOPIC_NAME": "ML_L3",
    "CLASSIFIED_L1": "HUMAN_L1",
    "CLASSIFIED_L2": "HUMAN_L2",
    "CLASSIFIED_L3": "HUMAN_L3"
})

# Calculate accuracy for each level (share of rows with an exact name match)
level_1_accuracy = (merged_df["ML_L1"] == merged_df["HUMAN_L1"]).mean()
level_2_accuracy = (merged_df["ML_L2"] == merged_df["HUMAN_L2"]).mean()
level_3_accuracy = (merged_df["ML_L3"] == merged_df["HUMAN_L3"]).mean()

# Print results
print(f"Level 1 Accuracy: {level_1_accuracy:.2%}")
print(f"Level 2 Accuracy: {level_2_accuracy:.2%}")
print(f"Level 3 Accuracy: {level_3_accuracy:.2%}")

accuracy_scores = [level_1_accuracy, level_2_accuracy, level_3_accuracy]
levels = ['Level 1', 'Level 2', 'Level 3']

# Bar chart with one bar per level
plt.figure(figsize=(8, 5))
bars = plt.bar(levels, accuracy_scores, color=['#4CAF50', '#2196F3', '#FFC107'])
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Accuracy at Each Classification Level')

# Write each bar's value just above it
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height + 0.02, f'{height:.1%}', ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [0]:
# Generate chart of accuracy at random for reference:
# for each trial, draw a label uniformly at random for every row and score it
# against the human label.

num_trials = 1000

random_acc_level_1, random_acc_level_2, random_acc_level_3 = [], [], []

# (candidate labels, human-label column, list collecting one accuracy per trial)
baseline_specs = (
    (level_1_topics, "HUMAN_L1", random_acc_level_1),
    (level_2_topics, "HUMAN_L2", random_acc_level_2),
    (level_3_topics, "HUMAN_L3", random_acc_level_3),
)

n_rows = len(merged_df)
for _ in range(num_trials):
    for candidates, truth_col, collected in baseline_specs:
        guesses = np.random.choice(candidates, size=n_rows)
        collected.append(np.mean(guesses == merged_df[truth_col]))

# Distribution of the random accuracies per level
plt.figure(figsize=(10,6))
plt.boxplot([random_acc_level_1, random_acc_level_2, random_acc_level_3], labels=['Level 1', 'Level 2', 'Level 3'])
plt.ylabel('Random Guess Accuracy')
plt.title('Random Baseline Accuracy Distribution by Level')
plt.grid(axis='y')
plt.show()


In [0]:
# Upload results to Snowflake table:
# convert the pandas results to Spark and stamp every row with the current time
df_spark = spark.createDataFrame(df_final).withColumn("created_datetime", lit(datetime.now()))

# Write to Snowflake; mode "overwrite" replaces the existing table contents
(
    df_spark.write
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "SANDBOX.AIC_ZEROSHOT_LABEL_LEVEL123")
    .mode("overwrite")
    .save()
)

### s008 Implementing the supervised model

In [0]:
# Hierarchy dictionary
# Maps each level 1 topic to its level 2 topics, and each level 2 topic to the
# level 3 topics allowed beneath it. The hierarchical classifier uses it to
# restrict level 2 / level 3 predictions to children of the predicted parent.
# Label spellings must match the labels used elsewhere exactly, because they
# are compared as plain strings.
hierarchy = {
    "Connectivity and Technical Support": {
        "Network & Service Connectivity": ["Messaging Issues", "Network Registration and Service Access Issues"],
        "Account & App Access Problems": ["Social Media App Access Problems"],
    },
    "Roaming and International": {
        "Roaming Details and Inquiries": ["Roaming Charges and Add-ons", "Roaming Usage and Coverage"],
        "Roaming Technical Issues and Support": ["Roaming Technical Support and Troubleshooting"],
    },
    "Billing and Payments": {
        "Billing and Invoice Queries": ["Bill Amount and Payment Issues", "Bill Access and Documentation"],
        # "Refunds and Credits Issues" is spelled as in level_3_topics
        "Billing Disputes": ["Incorrect Charges and Billing Errors", "Refunds and Credits Issues"],
        "Payment Processing Issues": ["Top-Up and Balance Issues", "Duplicate or Missing Charges"],
    },
    "Account Management and Balance": {
        "Account Access": ["PIN & Security Code Recovery"],
        "Account Balance and Top-ups": ["Top-up Not Applied", "Unexpected Balance Change"],
        "Account Changes": ["Plan or Account Modifications", "Address or Number Update"],
    },
    "Other": {
        "Other": ["Other"],
    },
    "Device Purchases and Inquiries": {
        "Device Buying & Upgrading": ["Device Purchase and Availability", "Upgrades and Trade-ins"],
        "eSIM and Device setup": ["eSIM Activation and Transfer", "eSIM Purchase and Setup"],
    },
    "Plans and Services": {
        "Plan Changes and Switching": ["Prepaid/Monthly Plan Changes", "Broadband/WiFi and Other Add-ons"],
        "Plan Inquiries and Information": ["Plan Pricing and Details", "Plan Usage and Eligibility"],
    }
}

In [0]:
import pandas as pd
import numpy as np

# NOTE(review): this cell reads `models` and `gold_data_l23`, both of which are
# assigned in a cell further down the notebook, so it has to be run after it.

# Level 1 pipeline output: (model, predictions, best params, label encoder)
level1_result = models['level1']
y1_test_pred = level1_result[1]  # Predicted labels for Level 1
le1 = level1_result[3]  # LabelEncoder for Level 1

# Test texts and contact IDs, re-indexed 0..n-1 so they line up row by row
X_test = models['X_test'].reset_index(drop=True)
contact_ids = models['contact_ids'].reset_index(drop=True)

# Verify lengths match
assert len(contact_ids) == len(X_test) == len(y1_test_pred), "Mismatch in test set lengths"

# Assemble the test-set predictions, attach the gold CLASSIFIED_L1 label by
# CONTACTID and call it True_Label
test_df = (
    pd.DataFrame({
        'CONTACTID': contact_ids,
        'USER_CONTENT': X_test,
        'Predicted_Label_Enc': y1_test_pred,
        'Predicted_Label': le1.inverse_transform(y1_test_pred),
        'Confidence': models['confidence']['level1']
    })
    .merge(gold_data_l23[['CONTACTID', 'CLASSIFIED_L1']], on='CONTACTID', how='left')
    .rename(columns={'CLASSIFIED_L1': 'True_Label'})
)

# Encode the true labels with the same encoder so they can be compared with the
# encoded predictions
test_df['True_Label_Enc'] = le1.transform(test_df['True_Label'])
misclassified_mask = test_df['Predicted_Label_Enc'].ne(test_df['True_Label_Enc'])

# Keep only the misclassified rows and the columns of interest
report_columns = ['CONTACTID', 'USER_CONTENT', 'True_Label', 'Predicted_Label', 'Confidence']
misclassified_df = test_df.loc[misclassified_mask, report_columns].reset_index(drop=True)

# Save to CSV
misclassified_df.to_csv('mismissclassified1.csv', index=False)

# Print the DataFrame to inspect
print("Misclassified Instances for Level 1:")
print(misclassified_df)

# Cross-check True_Label against the gold data for the same contacts
print("\nSample of True_Label values from misclassified_df:")
print(misclassified_df[['CONTACTID', 'True_Label']].head())
print("\nCorresponding CLASSIFIED_L1 values from gold_data_l23:")
gold_match = gold_data_l23['CONTACTID'].isin(misclassified_df['CONTACTID'])
print(gold_data_l23.loc[gold_match, ['CONTACTID', 'CLASSIFIED_L1']].head())

In [0]:
# Load the human-labelled gold dataset
gold_data_l23 = pd.read_csv("/dbfs/FileStore/tables/ms/newnewbook3c.csv")

# Load the saved sentence-embedding model; the pipeline below reads it through
# the module-level name `model`
checkpoint = '/dbfs/FileStore/tables/ms/saved_mpnet_model/'
model = SentenceTransformer(model_name_or_path=checkpoint, trust_remote_code=True)

# Train SVM classifier with hyperparameter tuning
def train_and_predict(X_train_feat, y_train, X_test_feat, random_state=42):
    """Grid-search an SVM on the training features and predict the test set.

    Args:
        X_train_feat: training feature matrix.
        y_train: training labels.
        X_test_feat: test feature matrix to predict.
        random_state: seed passed to the SVC. With ``probability=True`` the
            probability calibration shuffles the data, so an unseeded SVC gives
            slightly different probabilities on every run; a fixed seed keeps
            the downstream probability-based predictions reproducible.

    Returns:
        Tuple ``(best_model, y_pred, best_params)``: the refitted best
        estimator, its predictions for ``X_test_feat`` and the winning
        hyperparameters.
    """
    svm_model = SVC(probability=True, random_state=random_state)
    svm_params = {'C': [0.1, 1], 'kernel': ['linear', 'rbf']}
    # 5-fold cross-validated search over C and kernel, scored by accuracy
    grid_search = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_feat, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_feat)
    return best_model, y_pred, grid_search.best_params_

# Constrain predictions based on hierarchy
def constrained_predict(model, X, allowed_classes, label_encoder):
    """Predict the most probable class among ``allowed_classes`` for each row.

    Args:
        model: fitted classifier exposing ``predict_proba``; its probability
            columns are taken to be in the order of ``label_encoder.classes_``.
        X: feature matrix, one row per sample.
        allowed_classes: class names the prediction may come from. Names that
            are not in ``label_encoder.classes_`` are ignored.
        label_encoder: encoder whose ``classes_`` gives the column order.

    Returns:
        Array with one encoded class index per row of ``X``.
    """
    scores = model.predict_proba(X)
    allowed_mask = np.isin(label_encoder.classes_, list(allowed_classes))

    if not allowed_mask.any():
        # None of the allowed names is known to the encoder. Masking every
        # column would make argmax return index 0 for all rows, so fall back to
        # the unconstrained best class instead.
        return np.argmax(scores, axis=1)

    # Columns outside the allowed set are pushed to -inf so they can never win
    masked_scores = np.where(allowed_mask, scores, -np.inf)
    return np.argmax(masked_scores, axis=1)

# Main pipeline
def _predict_within_hierarchy(clf, X, parent_keys, allowed_for_parent, label_encoder):
    """Constrained prediction, batched by parent: one call per distinct parent key."""
    preds = np.empty(len(X), dtype=int)
    rows_by_parent = {}
    for row, key in enumerate(parent_keys):
        rows_by_parent.setdefault(key, []).append(row)
    for key, rows in rows_by_parent.items():
        preds[rows] = constrained_predict(clf, X[rows], allowed_for_parent(key), label_encoder)
    return preds


def _print_level_report(title, y_true_enc, y_pred, label_encoder):
    """Print accuracy and classification report for one level; return the accuracy."""
    accuracy = accuracy_score(y_true_enc, y_pred)
    print(f"\n{title} Accuracy:", accuracy)
    present = np.unique(y_true_enc)
    print(classification_report(
        y_true_enc, y_pred,
        labels=present,
        target_names=label_encoder.inverse_transform(present),
        zero_division=0
    ))
    return accuracy


def run_hierarchical_svm_pipeline(data, level1_col, level2_col, level3_col):
    """Train and evaluate one SVM per taxonomy level on an 80/20 split.

    Level 1 is predicted from the text embeddings alone. Level 2 and level 3
    are predicted from the embeddings plus the encoded parent label as an extra
    feature, and restricted to the children of the predicted parent according
    to the module-level ``hierarchy``. Texts are embedded with the module-level
    ``model``.

    Args:
        data: DataFrame with 'USER_CONTENT', 'CONTACTID' and the label columns.
        level1_col, level2_col, level3_col: names of the label columns.

    Returns:
        dict with:
          'level1' / 'level2' / 'level3': (model, test predictions (encoded),
              best params, LabelEncoder);
          'X_test': test texts, re-indexed 0..n-1;
          'contact_ids': CONTACTID of each test row, same order as 'X_test';
          'y_true': true labels per level, same order as 'X_test';
          'confidence': predicted-class probability per test row and level;
          'accuracy': test accuracy per level.
    """
    X = data['USER_CONTENT']
    y1 = data[level1_col]
    y2 = data[level2_col]
    y3 = data[level3_col]

    # 80/20 split, stratified on the level 1 label
    X_train, X_test, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(
        X, y1, y2, y3, test_size=0.2, random_state=42, stratify=y1
    )

    # Remove test rows whose level 2 or level 3 label never occurs in the
    # training set (the label encoders below could not transform them)
    valid_l2_labels = set(y2_train)
    valid_l3_labels = set(y3_train)
    valid_mask = y2_test.isin(valid_l2_labels) & y3_test.isin(valid_l3_labels)

    X_test = X_test[valid_mask]
    y1_test = y1_test[valid_mask]
    y2_test = y2_test[valid_mask]
    y3_test = y3_test[valid_mask]

    # Encode text
    X_train_embedded = model.encode(X_train.tolist(), show_progress_bar=True)
    X_test_embedded = model.encode(X_test.tolist(), show_progress_bar=True)

    # Encode labels (encoders are fitted on the training labels only)
    le1 = LabelEncoder()
    y1_train_enc = le1.fit_transform(y1_train)
    y1_test_enc = le1.transform(y1_test)

    le2 = LabelEncoder()
    y2_train_enc = le2.fit_transform(y2_train)
    y2_test_enc = le2.transform(y2_test)

    le3 = LabelEncoder()
    y3_train_enc = le3.fit_transform(y3_train)
    y3_test_enc = le3.transform(y3_test)

    # Level 1
    model1, y1_test_pred, params1 = train_and_predict(X_train_embedded, y1_train_enc, X_test_embedded)
    pred_l1_labels = list(le1.inverse_transform(y1_test_pred))

    # Level 2: trained with the true level 1 label as an extra feature; at test
    # time the predicted level 1 label takes its place
    X_train_level2 = np.hstack([X_train_embedded, y1_train_enc.reshape(-1,1)])
    X_test_level2 = np.hstack([X_test_embedded, y1_test_pred.reshape(-1,1)])
    model2, _, params2 = train_and_predict(X_train_level2, y2_train_enc, X_test_level2)

    # Predict Level 2 with hierarchy constraints: only children of the
    # predicted level 1 topic are allowed
    y2_test_pred = _predict_within_hierarchy(
        model2, X_test_level2, pred_l1_labels,
        lambda l1: list(hierarchy.get(l1, {}).keys()),
        le2,
    )

    # Level 3: same scheme, with the level 2 label as the extra feature
    print("Training Level 3 classifier with Level 2 predictions as feature...")
    X_train_level3 = np.hstack([X_train_embedded, y2_train_enc.reshape(-1,1)])
    X_test_level3 = np.hstack([X_test_embedded, y2_test_pred.reshape(-1,1)])
    model3, _, params3 = train_and_predict(X_train_level3, y3_train_enc, X_test_level3)

    # Predict Level 3 with hierarchy constraints: only children of the
    # predicted (level 1, level 2) pair are allowed
    pred_l2_labels = list(le2.inverse_transform(y2_test_pred))
    y3_test_pred = _predict_within_hierarchy(
        model3, X_test_level3, list(zip(pred_l1_labels, pred_l2_labels)),
        lambda pair: hierarchy.get(pair[0], {}).get(pair[1], []),
        le3,
    )

    # Print accuracies and per-class reports
    acc_l1 = _print_level_report("Level 1", y1_test_enc, y1_test_pred, le1)
    acc_l2 = _print_level_report("Level 2", y2_test_enc, y2_test_pred, le2)
    acc_l3 = _print_level_report("Level 3", y3_test_enc, y3_test_pred, le3)

    # Confidence = probability the model assigns to the class that was predicted
    level1_confidences = model1.predict_proba(X_test_embedded)[np.arange(len(y1_test_pred)), y1_test_pred]
    level2_confidences = model2.predict_proba(X_test_level2)[np.arange(len(y2_test_pred)), y2_test_pred]
    level3_confidences = model3.predict_proba(X_test_level3)[np.arange(len(y3_test_pred)), y3_test_pred]

    # Return everything in one dictionary
    return {
        'level1': (model1, y1_test_pred, params1, le1),
        'level2': (model2, y2_test_pred, params2, le2),
        'level3': (model3, y3_test_pred, params3, le3),
        'X_test': X_test.reset_index(drop=True),
        'contact_ids': data.loc[X_test.index, 'CONTACTID'].reset_index(drop=True),
        # True labels in test-row order; 'X_test' loses the original row index,
        # so callers cannot recover them by position from `data`
        'y_true': {
            'level1': y1_test.reset_index(drop=True),
            'level2': y2_test.reset_index(drop=True),
            'level3': y3_test.reset_index(drop=True)
        },
        'confidence': {
            'level1': level1_confidences,
            'level2': level2_confidences,
            'level3': level3_confidences
        },
        'accuracy': {
            'level1': acc_l1,
            'level2': acc_l2,
            'level3': acc_l3
        }
    }


# Run the pipeline on the gold data, using its three CLASSIFIED_* label columns
models = run_hierarchical_svm_pipeline(
    gold_data_l23,
    level1_col='CLASSIFIED_L1',
    level2_col='CLASSIFIED_L2',
    level3_col='CLASSIFIED_L3',
)

# Per-level test accuracies, used by the accuracy chart
accuracies = models['accuracy']



In [0]:
import pandas as pd
import numpy as np

# Extract necessary components from the pipeline results
y1_test_pred = models['level1'][1]  # Predicted labels for Level 1
le1 = models['level1'][3]  # LabelEncoder for Level 1
X_test = models['X_test']  # Test set texts (index reset to 0..n-1 by the pipeline)
contact_ids = models['contact_ids']  # Contact IDs for test set, same order as X_test

# True Level 1 labels, looked up by CONTACTID. X_test.index is only a position
# (0..n-1), so indexing gold_data_l23 with it would return the first n gold
# rows rather than the test rows.
# Assumes one gold label per CONTACTID; if an ID repeats, its first row is used.
gold_l1_by_contact = (
    gold_data_l23.drop_duplicates(subset='CONTACTID')
    .set_index('CONTACTID')['CLASSIFIED_L1']
)
y1_test = contact_ids.map(gold_l1_by_contact).reset_index(drop=True)  # True Level 1 labels

# Create a mask for misclassified instances
misclassified_mask = y1_test_pred != le1.transform(y1_test)

# Create DataFrame for misclassified instances
misclassified_df = pd.DataFrame({
    'CONTACTID': contact_ids[misclassified_mask],
    'USER_CONTENT': X_test[misclassified_mask],
    'True_Label': y1_test[misclassified_mask],
    'Predicted_Label': le1.inverse_transform(y1_test_pred[misclassified_mask]),
    'Confidence': models['confidence']['level1'][misclassified_mask]
})

# Reset index for clarity
misclassified_df = misclassified_df.reset_index(drop=True)

# Display the DataFrame
print("Misclassified Instances for Level 1:")
print(misclassified_df)

In [0]:
# Save the misclassified level 1 rows to a CSV in the working directory
misclassified_df.to_csv(path_or_buf='misclassified_level11.csv', index=False)

In [0]:
import pandas as pd
import numpy as np

# Extract necessary components from the pipeline results
y1_test_pred = models['level1'][1]  # Predicted labels for Level 1
le1 = models['level1'][3]  # LabelEncoder for Level 1
X_test = models['X_test']  # Test set texts (index reset to 0..n-1 by the pipeline)
contact_ids = models['contact_ids']  # Contact IDs for test set, same order as X_test

# The pipeline resets the index of X_test, so these are positions 0..n-1, not
# row labels of gold_data_l23
test_indices = X_test.index

# Look the true Level 1 labels up by CONTACTID instead of by position.
# Assumes one gold label per CONTACTID; if an ID repeats, its first row is used.
gold_l1_by_contact = (
    gold_data_l23.drop_duplicates(subset='CONTACTID')
    .set_index('CONTACTID')['CLASSIFIED_L1']
)
y1_test = contact_ids.map(gold_l1_by_contact)

# Verify alignment with the test set
assert len(y1_test) == len(X_test), "Mismatch in number of test samples"
assert y1_test.index.equals(X_test.index), "Index mismatch between y1_test and X_test"
if y1_test.isna().any():
    raise ValueError("Some test CONTACTIDs have no CLASSIFIED_L1 label in gold_data_l23")

# Create a mask for misclassified instances
# Encode y1_test using le1 for comparison with y1_test_pred
y1_test_enc = le1.transform(y1_test)
misclassified_mask = y1_test_pred != y1_test_enc

# Create DataFrame for misclassified instances
misclassified_df = pd.DataFrame({
    'CONTACTID': contact_ids[misclassified_mask],
    'USER_CONTENT': X_test[misclassified_mask],
    'True_Label': y1_test[misclassified_mask],  # Use original CLASSIFIED_L1 values
    'Predicted_Label': le1.inverse_transform(y1_test_pred[misclassified_mask]),
    'Confidence': models['confidence']['level1'][misclassified_mask]
})

# Reset index for clarity
misclassified_df = misclassified_df.reset_index(drop=True)

# Save to CSV
misclassified_df.to_csv('/dbfs/FileStore/tables/ms/misclassified_level1.csv', index=False)

# Print the DataFrame to inspect
print("Misclassified Instances for Level 1:")
print(misclassified_df)

# Verify True_Label matches gold_data_l23 for the same contacts
print("\nSample of True_Label values from misclassified_df:")
print(misclassified_df['True_Label'].head())
print("\nSample of CLASSIFIED_L1 values from gold_data_l23 for comparison:")
gold_match = gold_data_l23['CONTACTID'].isin(misclassified_df['CONTACTID'])
print(gold_data_l23.loc[gold_match, 'CLASSIFIED_L1'].head())

In [0]:
# Check for NaNs in your embedded test features
import pandas as pd
import numpy as np

# X_test_level3 exists only as a local variable inside
# run_hierarchical_svm_pipeline, so it is rebuilt here the way the pipeline
# builds it: test-text embeddings plus the encoded Level 2 predictions as one
# extra column.
X_test_embedded = model.encode(models['X_test'].tolist(), show_progress_bar=True)
X_test_level3 = np.hstack([X_test_embedded, models['level2'][1].reshape(-1, 1)])

# Example for level 3 inputs (the last model in the pipeline)
nan_rows = np.isnan(X_test_level3).any(axis=1)

# See which rows are affected
print("Rows with NaNs in X_test_level3:")
print(np.where(nan_rows)[0])

# Optionally view full data row info (e.g. content and contact ID)
print("Problematic rows with CONTACTID:")
print(models['contact_ids'][nan_rows])
print(models['X_test'][nan_rows])


### s009 Evaluate and export the supervised model results

In [0]:
# Decode predictions (encoded class index -> label name) for each level
pred_l1 = models['level1'][3].inverse_transform(models['level1'][1])
pred_l2 = models['level2'][3].inverse_transform(models['level2'][1])
pred_l3 = models['level3'][3].inverse_transform(models['level3'][1])

# Get confidence scores
conf_l1 = models['confidence']['level1']
conf_l2 = models['confidence']['level2']
conf_l3 = models['confidence']['level3']

# Align content with the test rows by CONTACTID. The index of
# models['contact_ids'] is just 0..n-1, so selecting merged_df rows with it
# would attach the first n transcripts regardless of which contacts are in the
# test set. Contacts missing from merged_df get a null CONTENT.
content_by_contact = (
    merged_df.drop_duplicates(subset='CONTACTID')
    .set_index('CONTACTID')['CONTENT']
)
content_test = models['contact_ids'].map(content_by_contact).reset_index(drop=True)

# Build result dataframe
results_df = pd.DataFrame({
    'CONTACTID': models['contact_ids'],
    'CONTENT': content_test,
    'PREDICTED_L1': pred_l1,
    'CONFIDENCE_L1': conf_l1,
    'PREDICTED_L2': pred_l2,
    'CONFIDENCE_L2': conf_l2,
    'PREDICTED_L3': pred_l3,
    'CONFIDENCE_L3': conf_l3
})

# Save to CSV
results_df.to_csv('hierarchical_predictions2.csv', index=False)

In [0]:
# Upload results to Snowflake table:
# convert the pandas results to Spark and stamp every row with the current time
df_spark = spark.createDataFrame(results_df).withColumn("created_datetime", lit(datetime.now()))

# Write to Snowflake; mode "overwrite" replaces the existing table contents
(
    df_spark.write
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "SANDBOX.AIC_LABEL_LEVEL123_20250612")
    .mode("overwrite")
    .save()
)

In [0]:
# Plot accuracy by topic level
level_names = ['Level 1', 'Level 2', 'Level 3']
acc_df = pd.DataFrame({
    'Level': level_names,
    'Accuracy': [accuracies[key] for key in ('level1', 'level2', 'level3')]
})

# One bar per level on a 0-1 accuracy axis
plt.figure(figsize=(8, 5))
barplot = sns.barplot(data=acc_df, x='Level', y='Accuracy', palette='Blues_d')
plt.ylim(0, 1)
plt.title('Hierarchical SVM Accuracy by Level')
plt.ylabel('Accuracy')
plt.xlabel('Classification Level')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# Add accuracy values on top of the bars
for x_pos, acc in enumerate(acc_df['Accuracy']):
    plt.text(x_pos, acc + 0.02, f"{acc:.2f}", ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()
