# CCGT Demo Notebook

This notebook demonstrates the CCGT pipeline step by step.

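## Steps:
1. Load sample text
2. Text preprocessing and sentence segmentation
3. Embedding generation
4. Graph construction
5. Coherence scoring (model inference)
6. Similarity matrix visualization
7. Graph visualization (requires NetworkX)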


In [None]:
import sys
from pathlib import Path

# Add backend to path
# The path is relative, so it resolves against the notebook's current working
# directory. It is inserted at position 0 so it takes precedence over other
# entries; the `app.*` imports below presumably rely on it, so this must run
# before them.
backend_path = Path('../backend')
sys.path.insert(0, str(backend_path))

import numpy as np
import matplotlib.pyplot as plt
# NetworkX is treated as optional: if the import fails, a notice is printed
# and `nx` is bound to None so that later cells can check for it.
try:
    import networkx as nx
except ImportError:
    print("NetworkX not available, graph visualization will be limited")
    nx = None

# Project pipeline entry points used by the cells below.
from app.pipeline.preprocess import preprocess_text
from app.models.embeddings import embed_sentences
from app.pipeline.graph_builder import build_graph
from app.pipeline.scorer import score_text
# NOTE(review): get_model_instance is not referenced in any cell visible here.
from app.models.model import get_model_instance


## Step 1: Load Sample Text


In [None]:
# Five-sentence sample passage used as the input for the rest of the notebook.
# Written as adjacent string literals so that the leading/trailing newlines and
# the trailing space after each of the first four sentences are explicit.
sample_text = (
    "\n"
    "The field of natural language processing has advanced significantly in recent years. \n"
    "Machine learning models can now understand and generate human-like text. \n"
    "However, evaluating text quality remains a challenging problem. \n"
    "Coherence is one important aspect of text quality. \n"
    "It measures how well sentences flow together and form a cohesive narrative.\n"
)

# One call, two lines of output: the heading, then the passage itself.
print("Sample text:", sample_text, sep="\n")


## Step 2: Preprocessing


In [None]:
# preprocess_text returns two parallel sequences: the segmented sentences and,
# for each sentence, a collection of discourse-marker strings (possibly empty).
sentences, discourse_markers = preprocess_text(sample_text)

print(f"Segmented into {len(sentences)} sentences:\n")
for number, (sentence, found_markers) in enumerate(
    zip(sentences, discourse_markers), start=1
):
    print(f"{number}. {sentence}")
    # Only mention markers for sentences that actually have some.
    if found_markers:
        marker_list = ', '.join(found_markers)
        print(f"   Discourse markers: {marker_list}")
    # Blank line between sentences for readability.
    print()


## Step 3: Generate Embeddings


In [None]:
# Embed every sentence; the result exposes a `.shape` whose second entry is
# reported below as the embedding dimension.
embeddings = embed_sentences(sentences)

embedding_shape = embeddings.shape
print(f"Embedding shape: {embedding_shape}")
print(f"Embedding dimension: {embedding_shape[1]}")


## Step 4: Build Graph


In [None]:
# Build the sentence graph. build_graph also hands back the similarity matrix
# and an entropy array, both of which are summarised here and reused later.
graph, similarity_matrix, entropy_array = build_graph(sentences, embeddings, discourse_markers)

node_count = graph.x.shape[0]
# Halved on the assumption that each connection is stored in both directions
# in edge_index — TODO confirm against build_graph.
edge_count = graph.edge_index.shape[1] // 2
mean_similarity = np.mean(similarity_matrix)
mean_entropy = np.mean(entropy_array)

print(f"Graph nodes: {node_count}")
print(f"Graph edges: {edge_count}")
print(f"\nSimilarity matrix shape: {similarity_matrix.shape}")
print(f"\nAverage similarity: {mean_similarity:.4f}")
print(f"Average entropy: {mean_entropy:.4f}")


## Step 5: Compute Coherence Score


In [None]:
# Score the text. score_text returns the overall coherence score and a
# (possibly empty) report of disruptions between sentences.
coherence_score, disruption_report = score_text(graph, similarity_matrix, entropy_array)

print(f"Coherence Score: {coherence_score:.4f}")
# Round rather than truncate: int(0.29 * 100) is 28 because 0.29 * 100 is
# 28.999999999999996 in binary floating point, which would contradict the
# four-decimal score printed above.
coherence_percent = round(float(coherence_score) * 100)
print(f"Coherence Percent: {coherence_percent}%")

if disruption_report:
    print(f"\nDisruption Report ({len(disruption_report)} issues):")
    for number, disruption in enumerate(disruption_report, start=1):
        # Indices are shifted by one for display (presumably zero-based in the
        # report — confirm against score_text).
        from_sentence = disruption['from_idx'] + 1
        to_sentence = disruption['to_idx'] + 1
        # \u2192 is a right arrow; written as an escape so the character
        # survives re-encoding of the file (it had previously been corrupted
        # into mojibake).
        print(f"\n  {number}. Sentence {from_sentence} \u2192 Sentence {to_sentence}")
        print(f"     Reason: {disruption['reason']}")
        print(f"     Score: {disruption['score']:.4f}")


## Step 6: Visualize Similarity Matrix


In [None]:
# Heatmap of the pairwise sentence similarity matrix, drawn on an explicit
# figure/axes pair instead of pyplot's implicit "current" figure.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(similarity_matrix, cmap='viridis', aspect='auto')
fig.colorbar(heatmap, ax=ax, label='Similarity')
ax.set_title('Sentence Similarity Matrix')
ax.set_xlabel('Sentence Index')
ax.set_ylabel('Sentence Index')
plt.show()


## Step 7: Visualize Graph (if NetworkX available)


In [None]:
# Draw the sentence graph with NetworkX: one node per sentence and one
# undirected edge per connected sentence pair, with line width scaled by the
# edge weight. Skipped with a hint when NetworkX could not be imported.
if nx is not None:
    # Convert to NetworkX for visualization
    G = nx.Graph()

    # Add nodes, labelled S1..Sn, each carrying a short preview of its sentence.
    labels = {idx: f"S{idx + 1}" for idx in range(len(sentences))}
    for idx, sent in enumerate(sentences):
        # Only append an ellipsis when the sentence was actually cut.
        preview = sent if len(sent) <= 30 else sent[:30] + "..."
        G.add_node(idx, label=labels[idx], text=preview)

    # Pull the edge data into NumPy. detach()/cpu() come first because
    # Tensor.numpy() refuses tensors that live on an accelerator or track
    # gradients (assumes edge_index/edge_attr are torch tensors, as the
    # original .numpy() calls imply).
    edge_index = graph.edge_index.detach().cpu().numpy()
    if graph.edge_attr is not None:
        edge_attr = graph.edge_attr.detach().cpu().numpy().flatten()
    else:
        edge_attr = None

    # Add edges. Every column is inspected and only the source < target
    # orientation is kept, so each undirected pair is added once regardless of
    # how the forward/reverse edges are ordered in edge_index. (Stepping
    # through the columns two at a time would miss edges unless reverse edges
    # were interleaved with the forward edge first.)
    for col, (source, target) in enumerate(edge_index.T.tolist()):
        if source < target:
            weight = float(edge_attr[col]) if edge_attr is not None else 1.0
            G.add_edge(source, target, weight=weight)

    # Draw graph
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=2, iterations=50)

    # Draw nodes
    nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=1000, alpha=0.8)

    # Draw edges, with line width proportional to the edge weight
    edge_widths = [G[u][v]['weight'] * 3 for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.5)

    # Draw labels
    nx.draw_networkx_labels(G, pos, labels, font_size=10)

    plt.title('Coherence Graph')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    print("\nGraph structure:")
    print(f"Nodes: {G.number_of_nodes()}")
    print(f"Edges: {G.number_of_edges()}")
else:
    print("NetworkX not available. Install with: pip install networkx matplotlib")
