# Semantic Analysis - Phase 2: Unclassified Ticket Clustering

This notebook processes the 3,208 tickets that couldn't be classified by hardcoded rules and uses sentence-transformers + clustering to find semantic groups representing core problem statements.

## Process:
1. Load unclassified tickets from improved classification results
2. Encode ticket descriptions using sentence-transformers
3. Apply DBSCAN clustering to find semantic groups
4. Analyze clusters to identify core problem statements
5. Generate final semantic taxonomy

## Step 1: Import Libraries (Test Each One)

Let's import libraries one by one to identify any issues:

In [5]:
# Basic libraries first
import pandas as pd
import numpy as np
import json
import logging
from pathlib import Path
from collections import Counter, defaultdict

print("✅ Basic libraries imported successfully")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")

✅ Basic libraries imported successfully
Pandas: 2.3.1
NumPy: 2.3.2


In [6]:
# Try matplotlib first (usually works)
import matplotlib.pyplot as plt
import seaborn as sns

print("✅ Visualization libraries imported successfully")

✅ Visualization libraries imported successfully


In [7]:
# Advanced Alternative: Use HuggingFace transformers directly (no scipy needed)
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    print("✅ HuggingFace transformers available - can do advanced semantic analysis!")
    transformers_available = True
    
    # Custom cosine similarity using pure PyTorch/NumPy
    def compute_cosine_similarity(embeddings):
        """Compute the pairwise cosine-similarity matrix of row vectors using NumPy.

        Args:
            embeddings: Array-like of shape (n_samples, n_features), one
                vector per row.

        Returns:
            np.ndarray of shape (n_samples, n_samples) whose entry [i, j] is
            the cosine similarity between rows i and j. A row with zero norm
            has similarity 0 to every row (itself included) instead of NaN.
        """
        embeddings = np.asarray(embeddings)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        # A zero vector has no direction: dividing by its norm would raise a
        # RuntimeWarning and fill a whole row/column of the result with NaN.
        # Dividing by 1 instead keeps the row at zero, i.e. similarity 0.
        safe_norms = np.where(norms == 0, 1.0, norms)
        normalized_embeddings = embeddings / safe_norms
        # Dot products of unit vectors are cosine similarities.
        similarity_matrix = np.dot(normalized_embeddings, normalized_embeddings.T)
        return similarity_matrix
    
    # Custom DBSCAN implementation
    def custom_dbscan(distance_matrix, eps=0.3, min_samples=3):
        """Custom DBSCAN implementation without scipy.

        Args:
            distance_matrix: Square array-like of pairwise distances, shape
                (n_points, n_points).
            eps: Maximum distance at which two points count as neighbors.
            min_samples: Minimum neighborhood size (the point itself counts
                when its self-distance is <= eps) for a point to be a core
                point.

        Returns:
            np.ndarray of length n_points holding cluster ids 0, 1, 2, ...
            in order of discovery, with -1 for noise.
        """
        distance_matrix = np.asarray(distance_matrix)
        n_points = distance_matrix.shape[0]
        labels = np.full(n_points, -1)  # -1 means noise / not yet assigned
        cluster_id = 0

        for i in range(n_points):
            if labels[i] != -1:  # Already claimed by an earlier cluster
                continue

            # Find neighbors within eps distance
            neighbors = np.flatnonzero(distance_matrix[i] <= eps).tolist()

            if len(neighbors) < min_samples:
                continue  # Not a core point; stays noise unless a later cluster reaches it

            # Start new cluster, seeded with the core point's neighborhood
            labels[i] = cluster_id
            seed_set = neighbors
            # Mirror of seed_set for O(1) membership tests; scanning the
            # growing list for every candidate made expansion quadratic.
            queued = set(seed_set)

            j = 0
            while j < len(seed_set):
                point = seed_set[j]
                j += 1

                if labels[point] != -1:
                    # Already in a cluster (this one or an earlier one):
                    # never relabel and never expand it again.
                    continue

                labels[point] = cluster_id

                # Only core points propagate the cluster further; border
                # points are labeled but contribute no new seeds.
                new_neighbors = np.flatnonzero(distance_matrix[point] <= eps).tolist()
                if len(new_neighbors) >= min_samples:
                    for new_point in new_neighbors:
                        if new_point not in queued:
                            queued.add(new_point)
                            seed_set.append(new_point)

            cluster_id += 1

        return labels
    
    print("✅ Custom advanced clustering functions ready")
    
except ImportError as e:
    print(f"❌ HuggingFace transformers import failed: {e}")
    transformers_available = False

# Try basic sklearn (might work even if scipy fails)
try:
    import sklearn
    print("✅ Scikit-learn base imported")
    sklearn_base_available = True
except ImportError:
    sklearn_base_available = False
    print("❌ Scikit-learn not available")

# Expose the flag under the shorter name `sklearn_available` as well, so code
# that checks either spelling works instead of raising NameError.
sklearn_available = sklearn_base_available

❌ HuggingFace transformers import failed: Could not import module 'AutoTokenizer'. Are this object's requirements defined correctly?
❌ Scikit-learn not available


In [4]:
# Probe for sentence-transformers and record the outcome in a flag rather
# than letting a broken install abort the notebook.
try:
    from sentence_transformers import SentenceTransformer
except ImportError as import_error:
    print(f"❌ Sentence-transformers import failed: {import_error}")
    sentence_transformers_available = False
else:
    print("✅ Sentence-transformers imported successfully")
    sentence_transformers_available = True

❌ Sentence-transformers import failed: No module named 'scipy._cyutility'


  from .autonotebook import tqdm as notebook_tqdm


## Alternative: Simple Semantic Analysis (If Libraries Fail)

If the advanced libraries fail to import, we can approximate semantic similarity with plain word overlap (Jaccard similarity) using only the Python standard library:

In [None]:
# Fallback: Simple similarity using basic Python
def simple_jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Two empty sets have an empty union; the result is then 0 rather than a
    division-by-zero error.
    """
    combined = set1.union(set2)
    if len(combined) == 0:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

def simple_text_similarity(text1, text2):
    """Score two texts by the Jaccard overlap of their lower-cased, whitespace-split words."""
    vocab_a, vocab_b = (set(text.lower().split()) for text in (text1, text2))
    return simple_jaccard_similarity(vocab_a, vocab_b)

print("✅ Fallback similarity functions ready")

## Step 2: Load Unclassified Tickets

In [None]:
# Read the classification output and build the working DataFrame from its
# 'unclassified_tickets' entry. If the file is missing, only a hint is
# printed and `df_unclassified` is NOT created.
results_file = Path('../outputs/improved_classification_results.json')

if results_file.exists():
    print("✅ Found classification results file")

    with results_file.open('r') as handle:
        results = json.load(handle)

    unclassified_tickets = results['unclassified_tickets']
    print(f"📊 Loaded {len(unclassified_tickets)} unclassified tickets")

    # One row per unclassified ticket record
    df_unclassified = pd.DataFrame(unclassified_tickets)
    print(f"✅ Created DataFrame with shape: {df_unclassified.shape}")
else:
    print(f"❌ Error: Classification results file not found at {results_file}")
    print("   Please run improved_semantic_grouping_clean.py first.")

In [None]:
# Enrich the unclassified tickets with the columns of the consolidated export.
consolidated_file = Path('../data/processed/consolidated_tickets.csv')

if not consolidated_file.exists():
    print("❌ Original ticket data not found - using limited information")
else:
    df_original = pd.read_csv(consolidated_file)

    # Left join: every unclassified ticket is kept. `ticket_index` is matched
    # against df_original's row index (presumably the ticket's row position
    # in the CSV - verify against the classification script). Columns present
    # on both sides keep the left value; the right copy gets an '_orig' suffix.
    df_unclassified = pd.merge(
        df_unclassified,
        df_original,
        how='left',
        left_on='ticket_index',
        right_index=True,
        suffixes=('', '_orig'),
    )

    print("✅ Successfully merged with original ticket data")
    print(f"📊 Final DataFrame shape: {df_unclassified.shape}")
    print(f"📋 Columns: {list(df_unclassified.columns)}")

In [None]:
# Summarize which categories the unclassified tickets come from, then
# preview a few tickets.
print("📋 CATEGORY BREAKDOWN:")
category_counts = df_unclassified['category'].value_counts()
for category_name, n_tickets in category_counts.iloc[:10].items():
    print(f"   {category_name}: {n_tickets:,} tickets")

print("\n📝 SAMPLE TICKETS:")
sample_rows = df_unclassified.iloc[:5]
for row_label, ticket in sample_rows.iterrows():
    # Numbering is derived from the row's index label, not its position
    display_number = row_label + 1
    print(f"   {display_number}. [{ticket['category']}] '{ticket['short_description']}'")

## Step 3: Prepare Text for Analysis

In [None]:
def prepare_text_for_analysis(row):
    """Build the text that represents one ticket in the semantic analysis.

    The short description is repeated so that it outweighs the (usually
    longer, noisier) full description, which is appended for context.

    Args:
        row: Mapping-like object with a ``.get`` method (pandas Series or
            dict). Reads 'Short description' (falling back to
            'short_description') and 'Description'.

    Returns:
        "<short>. <short>. <description>" when both parts exist,
        "<short>. <short>." when only the short description exists,
        the bare description when only that exists, and '' when neither does.
    """
    def _clean(value):
        """Return value as stripped text, or '' when it is missing (None / NaN)."""
        if value is None:
            return ''
        text = str(value).strip()
        # str(float('nan')) == 'nan'; treat it as "no text" rather than content
        return '' if text.lower() == 'nan' else text

    # 'Short description' / 'Description' are presumably the column names of
    # the merged ticket export (TODO confirm); 'short_description' is the
    # spelling the rest of this notebook reads, so use it when the
    # capitalized column is absent or empty.
    short_desc = _clean(row.get('Short description')) or _clean(row.get('short_description'))
    description = _clean(row.get('Description'))

    if short_desc and description:
        # Combine with emphasis on short description
        combined = f"{short_desc}. {short_desc}. {description}"
    elif short_desc:
        # Only short description available
        combined = f"{short_desc}. {short_desc}."
    else:
        # No short description: avoid emitting a punctuation-only stub
        combined = description

    return combined.strip()

# Build one analysis text per ticket (row-wise) and keep it next to the ticket
prepared_texts = df_unclassified.apply(prepare_text_for_analysis, axis=1)
df_unclassified['prepared_text'] = prepared_texts

print("✅ Text preparation complete")
print("\n📝 SAMPLE PREPARED TEXTS:")
# Preview the first three texts, cut to 100 characters each
preview = df_unclassified['prepared_text'].iloc[:3].tolist()
for position in range(len(preview)):
    print(f"   {position + 1}. {preview[position][:100]}...")

## Step 4A: Advanced Semantic Analysis (If Libraries Work)

In [None]:
if transformers_available:
    print("🚀 Using advanced semantic analysis with HuggingFace transformers")
    
    # Load model and tokenizer
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    print(f"Loading model: {model_name}...")
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        
        def encode_texts(texts, batch_size=32):
            """Encode texts into sentence embeddings using HuggingFace transformers.

            Uses the enclosing scope's `tokenizer` and `model`. Each text's
            embedding is the mean of its token embeddings, taken over real
            tokens only.

            Args:
                texts: List of strings to encode.
                batch_size: Number of texts tokenized and run through the
                    model at once.

            Returns:
                np.ndarray with one embedding row per input text.
            """
            embeddings = []

            for i in range(0, len(texts), batch_size):
                batch = texts[i:i+batch_size]

                # Tokenize; shorter texts are padded to the longest in the batch
                inputs = tokenizer(batch, padding=True, truncation=True,
                                 return_tensors='pt', max_length=512)

                # Get embeddings
                with torch.no_grad():
                    outputs = model(**inputs)
                    token_embeddings = outputs.last_hidden_state

                    # Masked mean pooling: a plain .mean(dim=1) also averages
                    # the padding positions, so a text's embedding would
                    # depend on which other texts share its batch.
                    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
                    summed = (token_embeddings * mask).sum(dim=1)
                    # clamp guards against division by zero for an all-padding row
                    token_counts = mask.sum(dim=1).clamp(min=1e-9)
                    batch_embeddings = summed / token_counts
                    embeddings.extend(batch_embeddings.numpy())

                print(f"  Processed {min(i+batch_size, len(texts))}/{len(texts)} tickets")

            return np.array(embeddings)
        
        print("✅ Model loaded successfully")
        
        # Encode texts
        print(f"Encoding {len(df_unclassified)} ticket descriptions...")
        texts = df_unclassified['prepared_text'].tolist()
        
        embeddings = encode_texts(texts)
        print(f"✅ Generated embeddings with shape: {embeddings.shape}")
        
        # Normalize embeddings
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = embeddings / norms
        
    except Exception as e:
        print(f"❌ HuggingFace encoding failed: {e}")
        transformers_available = False
        
elif sentence_transformers_available:
    print("🚀 Using sentence-transformers (if working)")
    # Previous sentence-transformers code would go here
    print("⚠️ Sentence-transformers had scipy issues")
    
else:
    print("⚠️ Advanced transformers not available, will use simple analysis")

In [None]:
if transformers_available:
    print("🔍 Applying custom DBSCAN clustering...")

    # Calculate similarity matrix using our custom function; DBSCAN needs
    # distances, so convert similarity -> cosine distance.
    similarity_matrix = compute_cosine_similarity(embeddings)
    distance_matrix = 1 - similarity_matrix

    # Try different DBSCAN parameters
    eps_values = [0.2, 0.3, 0.4]
    min_samples_values = [3, 5]

    best_params = None
    best_score = -1

    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = custom_dbscan(distance_matrix, eps=eps, min_samples=min_samples)

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = list(labels).count(-1)

            if n_clusters > 0:
                # Heuristic score: number of clusters weighted by the share
                # of tickets that ended up in a cluster (i.e. not noise).
                score = n_clusters * (1 - n_noise / len(labels))
                print(f"  eps={eps}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({n_noise/len(labels):.1%})")

                if score > best_score:
                    best_score = score
                    best_params = (eps, min_samples, labels)

    if best_params:
        eps, min_samples, labels = best_params
        # "\n" (not "\\n"): start with a blank line instead of printing a
        # literal backslash-n in front of the message.
        print(f"\n✅ Best parameters: eps={eps}, min_samples={min_samples}")

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        print(f"📊 Results: {n_clusters} clusters, {n_noise} noise points ({n_noise/len(labels):.1%})")

        # Add cluster labels to dataframe
        df_unclassified['cluster_label'] = labels

    else:
        # No parameter combination produced a cluster; 'cluster_label' is
        # not written in this case.
        print("❌ No good clustering parameters found")

else:
    print("⚠️ Advanced clustering not available, will use simple grouping")

## Step 4B: Simple Semantic Analysis (Fallback)

In [None]:
# Run the keyword fallback exactly when the embedding-based path did not
# deliver cluster labels: either the transformers stack is unavailable or its
# encoding failed (`transformers_available` is False), or the parameter search
# found no clustering ('cluster_label' was never written).
needs_fallback = (not transformers_available) or ('cluster_label' not in df_unclassified.columns)

if needs_fallback:
    print("🔧 Using simple semantic analysis based on keyword similarity")

    # Words too common in tickets to distinguish one problem from another
    stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'user', 'system', 'issue', 'problem', 'help', 'support', 'ticket', 'request'}

    # Extract keywords from each ticket
    def extract_keywords(text):
        """Return the set of lower-cased words longer than 2 characters that are not stop words."""
        import re

        # Simple keyword extraction: lower-case, punctuation -> spaces
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        words = text.split()

        keywords = [word for word in words if len(word) > 2 and word not in stop_words]
        return set(keywords)

    df_unclassified['keywords'] = df_unclassified['prepared_text'].apply(extract_keywords)

    # Simple clustering based on keyword similarity
    print("🔍 Finding similar tickets using keyword matching...")

    similarity_threshold = 0.3

    # Work on a plain list addressed by row POSITION: this avoids the cost of
    # nested DataFrame.iterrows() and does not assume that the frame's index
    # labels are 0..n-1.
    keyword_sets = df_unclassified['keywords'].tolist()
    n_tickets = len(keyword_sets)

    labels = [-1] * n_tickets  # -1 means noise/unassigned
    clusters = []

    # Greedy single pass: each still-unassigned ticket seeds a candidate
    # cluster and collects every later unassigned ticket whose keywords are
    # similar enough to the seed (members are not compared with each other).
    for i in range(n_tickets):
        if labels[i] != -1:
            continue

        keywords_i = keyword_sets[i]
        cluster = [i]

        # Find similar tickets
        for j in range(i + 1, n_tickets):
            if labels[j] != -1:
                continue

            similarity = simple_jaccard_similarity(keywords_i, keyword_sets[j])

            if similarity >= similarity_threshold:
                cluster.append(j)

        if len(cluster) > 1:  # Only keep clusters with multiple tickets
            cluster_id = len(clusters)
            for idx in cluster:
                labels[idx] = cluster_id
            clusters.append(cluster)

    df_unclassified['cluster_label'] = labels

    n_clusters = len(clusters)
    n_noise = labels.count(-1)
    print(f"✅ Simple clustering complete: {n_clusters} clusters, {n_noise} unassigned tickets")

else:
    print("ℹ️ Using advanced clustering results")

## Step 5: Analyze Semantic Clusters

In [None]:
# Summarize every cluster (size, category mix, examples, frequent keywords)
# and the leftover noise tickets into `clusters_analysis`.
print("🔍 ANALYZING SEMANTIC CLUSTERS")
print("="*50)

# Real cluster ids only; -1 marks noise and is reported separately
unique_labels = set(df_unclassified['cluster_label'])
unique_labels.discard(-1)

clusters_analysis = {}

for cluster_id in sorted(unique_labels):
    is_member = df_unclassified['cluster_label'] == cluster_id
    cluster_tickets = df_unclassified[is_member]

    if cluster_tickets.empty:
        continue

    cluster_info = {
        'size': len(cluster_tickets),
        'categories': cluster_tickets['category'].value_counts().to_dict(),
        'sample_descriptions': cluster_tickets['short_description'].head(5).tolist(),
        'ticket_indices': cluster_tickets['ticket_index'].tolist()
    }

    # The 'keywords' column only exists when the keyword fallback ran
    if 'keywords' in cluster_tickets.columns:
        keyword_counter = Counter()
        for ticket_keywords in cluster_tickets['keywords']:
            if isinstance(ticket_keywords, (set, list)):
                keyword_counter.update(ticket_keywords)

        if keyword_counter:
            cluster_info['common_keywords'] = dict(keyword_counter.most_common(10))

    clusters_analysis[f"cluster_{cluster_id}"] = cluster_info

# Tickets that belong to no cluster get a single 'noise' entry
noise_tickets = df_unclassified[df_unclassified['cluster_label'] == -1]
if len(noise_tickets) > 0:
    clusters_analysis['noise'] = {
        'size': len(noise_tickets),
        'categories': noise_tickets['category'].value_counts().to_dict(),
        'sample_descriptions': noise_tickets['short_description'].head(10).tolist(),
        'note': 'These tickets could not be grouped semantically - may represent unique issues'
    }

clustered_tickets = df_unclassified[df_unclassified['cluster_label'] != -1]
clustered_count = len(clustered_tickets)

print("📊 CLUSTERING STATISTICS:")
print(f"   Total clusters: {len(unique_labels)}")
print(f"   Tickets in clusters: {clustered_count}")
print(f"   Noise/unique tickets: {len(noise_tickets)}")
print(f"   Clustering rate: {(clustered_count / len(df_unclassified)) * 100:.1f}%")

In [None]:
# Print the 15 largest clusters with their categories, three example
# descriptions and (when present) their five leading keywords.
print("\n🎯 TOP SEMANTIC CLUSTERS:")
print("="*50)

# Everything except the catch-all 'noise' entry is a real cluster
semantic_clusters = {
    name: info for name, info in clusters_analysis.items() if name != 'noise'
}
sorted_clusters = sorted(
    semantic_clusters.items(),
    key=lambda item: item[1]['size'],
    reverse=True,
)

for rank, (cluster_name, cluster_info) in enumerate(sorted_clusters[:15], start=1):
    print(f"\n{rank:2d}. {cluster_name.upper()}: {cluster_info['size']} tickets")
    print(f"     Categories: {list(cluster_info['categories'].keys())}")
    print("     Sample descriptions:")
    examples = cluster_info['sample_descriptions'][:3]
    for example_no, desc in enumerate(examples, start=1):
        print(f"       {example_no}. '{desc}'")

    keyword_counts = cluster_info.get('common_keywords')
    if keyword_counts:
        top_keywords = list(keyword_counts.keys())[:5]
        print(f"     Common keywords: {top_keywords}")

## Step 6: Create Visualization (If Possible)

In [None]:
# Two-panel figure: histogram of all cluster sizes (left) and the sizes of
# the ten largest clusters (right).
if len(semantic_clusters) == 0:
    print("⚠️ No clusters found for visualization")
else:
    cluster_sizes = [info['size'] for info in semantic_clusters.values()]

    fig, (ax_hist, ax_bar) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: cluster size distribution
    ax_hist.hist(cluster_sizes, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax_hist.set_xlabel('Cluster Size')
    ax_hist.set_ylabel('Number of Clusters')
    ax_hist.set_title('Distribution of Cluster Sizes')
    ax_hist.grid(True, alpha=0.3)

    # Right panel: top clusters, labeled by size rank (not by cluster id)
    top_10_clusters = sorted_clusters[:10]
    cluster_names = [f"Cluster {rank}" for rank in range(1, len(top_10_clusters) + 1)]
    sizes = [info['size'] for _, info in top_10_clusters]

    ax_bar.bar(cluster_names, sizes, color='lightcoral', alpha=0.7)
    ax_bar.set_xlabel('Clusters')
    ax_bar.set_ylabel('Number of Tickets')
    ax_bar.set_title('Top 10 Largest Clusters')
    ax_bar.tick_params(axis='x', rotation=45)
    ax_bar.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    print("✅ Visualization created")

## Step 7: Save Results

In [None]:
# Save results
output_dir = Path('../outputs')
output_dir.mkdir(exist_ok=True)

# Record the method that actually produced `cluster_label`. In this notebook
# the keyword fallback is the only step that adds a 'keywords' column, so its
# presence identifies the fallback path. (The library-availability flags are
# not reliable for this: the embedding path is driven by
# `transformers_available`, not by sentence-transformers or scikit-learn.)
used_keyword_fallback = 'keywords' in df_unclassified.columns

total_count = len(df_unclassified)
clustered_count = int((df_unclassified['cluster_label'] != -1).sum())

# Save cluster analysis
semantic_results = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'methodology': {
        'approach': 'Simple keyword-based' if used_keyword_fallback else 'Advanced',
        'model': 'Keyword similarity' if used_keyword_fallback else 'all-MiniLM-L6-v2',
        'clustering': 'Simple grouping' if used_keyword_fallback else 'DBSCAN'
    },
    'statistics': {
        'total_tickets': total_count,
        'total_clusters': len(unique_labels),
        'tickets_clustered': clustered_count,
        'noise_points': len(noise_tickets),
        # Percentage of the unclassified tickets that landed in a cluster;
        # 0.0 for an empty frame instead of a ZeroDivisionError.
        'clustering_rate': (clustered_count / total_count) * 100 if total_count else 0.0
    },
    'clusters': clusters_analysis
}

# Save JSON results (default=str stringifies any value json cannot encode)
results_file = output_dir / 'semantic_analysis_results.json'
with open(results_file, 'w') as f:
    json.dump(semantic_results, f, indent=2, default=str)

print(f"✅ Results saved to: {results_file}")

# Save cluster assignments CSV: one row per ticket with its cluster label
cluster_assignments = df_unclassified[['ticket_index', 'category', 'short_description', 'cluster_label']].copy()
assignments_file = output_dir / 'semantic_cluster_assignments.csv'
cluster_assignments.to_csv(assignments_file, index=False)

print(f"✅ Cluster assignments saved to: {assignments_file}")

## Step 8: Summary and Next Steps

In [None]:
print("\n" + "="*60)
print("SEMANTIC ANALYSIS COMPLETE!")
print("="*60)

# Figures carried over from the rule-based phase; every percentage below is
# derived from them so the printed numbers cannot drift apart.
hardcoded_classified = 639  # From previous results
total_tickets = 3847  # Total consolidated tickets

stats = semantic_results['statistics']
print(f"📊 FINAL RESULTS:")
print(f"   Total unclassified tickets processed: {stats['total_tickets']:,}")
print(f"   Semantic clusters found: {stats['total_clusters']}")
# This rate is relative to the unclassified tickets only
print(f"   Tickets successfully clustered: {stats['tickets_clustered']:,} ({stats['clustering_rate']:.1f}%)")
print(f"   Unique/noise tickets: {stats['noise_points']:,}")

print(f"\n🎯 METHODOLOGY USED:")
method = semantic_results['methodology']
print(f"   Approach: {method['approach']}")
print(f"   Model: {method['model']}")
print(f"   Clustering: {method['clustering']}")

print(f"\n🔄 NEXT STEPS:")
print(f"   1. Review semantic clusters to validate problem groupings")
print(f"   2. Create core problem statements for each cluster")
print(f"   3. Combine with hardcoded classifications ({hardcoded_classified} tickets) for final taxonomy")
print(f"   4. Calculate total automation potential across all classifications")

# Calculate combined classification rate. All three rates share the same
# denominator (all consolidated tickets), so hardcoded + semantic = total.
semantic_clustered = stats['tickets_clustered']
hardcoded_rate = (hardcoded_classified / total_tickets) * 100
semantic_rate = (semantic_clustered / total_tickets) * 100
combined_rate = ((hardcoded_classified + semantic_clustered) / total_tickets) * 100

print(f"\n🏆 COMBINED CLASSIFICATION SUCCESS:")
print(f"   Hardcoded rules: {hardcoded_classified:,} tickets ({hardcoded_rate:.1f}%)")
print(f"   Semantic clustering: {semantic_clustered:,} tickets ({semantic_rate:.1f}%)")
print(f"   Total classified: {hardcoded_classified + semantic_clustered:,} tickets ({combined_rate:.1f}%)")
print(f"   Remaining unique: {total_tickets - hardcoded_classified - semantic_clustered:,} tickets")