# Setup Environment and Dependencies
Import required libraries (torch, transformers, sklearn) and set up initial configurations including rich console setup.

In [1]:
# Cell 1 - Setup and Imports
from model import *  # Import all functions from model.py
import torch
import numpy as np
import matplotlib.pyplot as plt
from rich.console import Console
from rich.traceback import install

# Initialize console and traceback
install(show_locals=False)
console = Console()

In [2]:
# Define Evidence dataclass
@dataclass
class Evidence:
    """Represents a piece of evidence associated with a claim."""
    evidence_id: str
    content: str
    embedding: Optional[np.ndarray] = None

# Define Claim dataclass
@dataclass
class Claim:
    """Represents a claim with its associated metadata and evidences."""
    claim_id: str
    content: str
    label: int
    explanation: str
    evidences: List[Evidence] = field(default_factory=list)
    embedding: Optional[np.ndarray] = None
    clustered_evidences: Optional[Dict[int, List[Evidence]]] = None

# Cell 2 - Load Example Data
# Load example data from LIAR dataset
example_data = json.load(open("../dataset/LIAR-RAW/test.json"))[403]
example_claim = Claim(
    claim_id="403",
    content=example_data["claim"],
    label=example_data["label"],
    explanation=example_data["explain"],
)

# Load evidences
for evidence in example_data["reports"]:
    example_claim.evidences.append(
        Evidence(evidence_id=evidence["report_id"], content=evidence["content"])
    )

print(f"Claim: {example_claim.content}")
print(f"Number of evidences: {len(example_claim.evidences)}")

Claim: Im not a conspiracy theorist and I never allow conspiracy theorists on my program.
Number of evidences: 28


In [3]:
evidences = example_claim.evidences

# Get evidence texts
evidence_texts = [ev.content for ev in evidences]

# Adjust parameters based on evidence count
batch_size = min(32, len(evidence_texts))

In [4]:
evidence_texts, len(evidence_texts), batch_size

(['In the current research , we investigate whether belief in conspiracy theory satisfy people ’ s need for uniqueness .\n We find that the tendency to believe in conspiracy theory be associate with the feeling of possess scarce information about the situation explain by the conspiracy theory ( Study 1 ) and high need for uniqueness ( Study 2 ) .\n Further two study use two different manipulation of need for uniqueness ( Studies 3 and 4 ) show that people in a high need for uniqueness condition display high conspiracy belief than people in a low need for uniqueness condition .\n These study suggest that conspiracy theory may serve people ’ s desire to be unique , highlight a motivational underpinning of conspiracy belief .\n , the 2001 tsunami in Japan ) , many people start to disbelieve official explanation in favor of what be know a conspiracy theory .\n For example , one conspiracy theory attribute the Charlie Hebdo attack to Mossad a an effort to make Muslims look bad .\n In recent

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextUMC().to(device)

for i in range(0, len(evidence_texts), batch_size):
    batch_texts = evidence_texts[i : i + batch_size]
    embeddings = model(batch_texts)

print(embeddings.detach().cpu().numpy(), embeddings.shape)  

[[-0.1245   0.06445 -0.148   ... -0.2355   0.02013  0.03513]
 [-0.2386  -0.02573 -0.11237 ... -0.124   -0.04126 -0.05435]
 [-0.10266  0.082   -0.09595 ... -0.1411   0.01507 -0.08386]
 ...
 [-0.1009   0.02028 -0.1278  ... -0.1964   0.05225  0.01616]
 [-0.16     0.02711 -0.1466  ... -0.1792  -0.02843 -0.05884]
 [-0.0932   0.11755 -0.1512  ... -0.178   -0.0353  -0.04843]] torch.Size([28, 128])


In [6]:
from sklearn.cluster import KMeans

num_clusters = min(5, len(evidence_texts))
current_centroids = None

kmeans = KMeans(
    n_clusters=num_clusters,
    init=current_centroids if current_centroids is not None else "k-means++",
    n_init=5,
    max_iter=200,
)
kmeans.fit(embeddings.detach().cpu().numpy())

In [7]:
# Cell 3 - Model Training
# Initialize model and train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextUMC().to(device)

trained_evidences, metrics = umc_train(
    model=model,
    evidences=example_claim.evidences,
    num_clusters=min(len(example_claim.evidences), 5),
    batch_size=32,
    num_epochs=10,
    learning_rate=2e-5,
)

Output()

In [15]:
# Get evidence embeddings
evidence_texts = [ev.content for ev in example_claim.evidences]
with torch.no_grad():
    evidence_embeddings = model(evidence_texts)
    evidence_embeddings_np = evidence_embeddings.cpu().numpy()

# Save embeddings into evidence objects
for ev, emb in zip(example_claim.evidences, evidence_embeddings_np):
    ev.embedding = emb

example_claim.evidences

[Evidence(evidence_id=7668687, content='In the current research , we investigate whether belief in conspiracy theory satisfy people ’ s need for uniqueness .\n We find that the tendency to believe in conspiracy theory be associate with the feeling of possess scarce information about the situation explain by the conspiracy theory ( Study 1 ) and high need for uniqueness ( Study 2 ) .\n Further two study use two different manipulation of need for uniqueness ( Studies 3 and 4 ) show that people in a high need for uniqueness condition display high conspiracy belief than people in a low need for uniqueness condition .\n These study suggest that conspiracy theory may serve people ’ s desire to be unique , highlight a motivational underpinning of conspiracy belief .\n , the 2001 tsunami in Japan ) , many people start to disbelieve official explanation in favor of what be know a conspiracy theory .\n For example , one conspiracy theory attribute the Charlie Hebdo attack to Mossad a an effort t

# Training Pipeline
Implement the training loop with clustering-based pseudo-labels and contrastive learning objectives.

In [12]:
len(evidence_texts)

28

In [13]:
model

TextUMC(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [11]:
# Cell 4 - Clustering and Visualization
# Get evidence embeddings
evidence_texts = [ev.content for ev in example_claim.evidences]
with torch.no_grad():
    evidence_embeddings = model(evidence_texts)
    evidence_embeddings_np = evidence_embeddings.cpu().numpy()

# Cluster evidences
n_clusters = min(len(evidence_texts), 5)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(evidence_embeddings_np)

# Print clusters
print_evidence_clusters(example_claim, cluster_labels)

# Group evidences
example_claim.clustered_evidences = {
    i: [ev for ev, label in zip(example_claim.evidences, cluster_labels) if label == i]
    for i in range(n_clusters)
}

# Visualize clusters
visualize_clusters(
    evidence_embeddings_np,
    cluster_labels,
    f"Evidence Clusters for Claim {example_claim.claim_id}",
    "evidence_clusters.png"
)

# Clustering and Visualization
Implement functions for clustering embeddings, visualizing clusters with PCA, and displaying results using rich tables.

In [9]:
# Clustering and Visualization

# Function to cluster claims
def cluster_claims(claims, num_clusters, random_state=42):
    """Clusters the claims using the trained model.

    Args:
        claims (List[Claim]): List of Claim objects.
        num_clusters (int): Number of clusters.

    Returns:
        List[Claim]: Claims with assigned cluster labels.
    """
    all_embeddings = np.array(
        [claim.embedding for claim in claims]
    )  # Extract all of the embeddings
    kmeans = KMeans(
        n_clusters=num_clusters, random_state=random_state, n_init=10
    )  # Perform K-Means using embeddings.
    kmeans.fit(all_embeddings)
    cluster_labels = kmeans.labels_  # Extract the cluster labels.

    for i, claim in enumerate(claims):
        claim_cluster = cluster_labels[i]
        if claim.clustered_evidences is None:
            claim.clustered_evidences = {}
        if claim_cluster not in claim.clustered_evidences:
            claim.clustered_evidences[claim_cluster] = []
        claim.clustered_evidences[claim_cluster].extend(
            claim.evidences
        )  # assign cluster to the evidences

    return claims

# Function to visualize clusters
def visualize_clusters(embeddings: np.ndarray, labels: np.ndarray, title: str, save_path: str):
    """Plot clusters using PCA for dimensionality reduction"""
    # Reduce to 2D for visualization
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Create scatter plot
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap="viridis"
    )
    plt.colorbar(scatter)
    plt.title(title)
    plt.xlabel("First Principal Component")
    plt.ylabel("Second Principal Component")
    plt.savefig(save_path)
    plt.close()

# Function to print evidence clusters
def print_evidence_clusters(claim: Claim, cluster_labels: np.ndarray):
    """Display evidence clusters in a readable format"""
    table = Table(title=f"Evidence Clusters for Claim: {claim.claim_id}")
    table.add_column("Cluster", justify="center", style="cyan")
    table.add_column("Evidence Content", justify="left", style="green")

    for cluster_id in range(max(cluster_labels) + 1):
        # Get evidences for this cluster
        cluster_evidences = [
            ev
            for ev, label in zip(claim.evidences, cluster_labels)
            if label == cluster_id
        ]

        # Add row for each evidence in cluster
        for idx, evidence in enumerate(cluster_evidences):
            # Truncate content for readability
            content = (
                evidence.content[:100] + "..."
                if len(evidence.content) > 100
                else evidence.content
            )
            table.add_row(f"Cluster {cluster_id}" if idx == 0 else "", content)
        # Add separator between clusters
        table.add_row("", "")

    console.print(table)

# Get evidence embeddings
evidence_texts = [ev.content for ev in example_claim.evidences]
with torch.no_grad():
    evidence_embeddings = model(evidence_texts)
    evidence_embeddings_np = evidence_embeddings.cpu().numpy()

# Cluster evidences
n_clusters = min(len(evidence_texts), 5)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(evidence_embeddings_np)

# Print clusters
print_evidence_clusters(example_claim, cluster_labels)

# Group evidences
example_claim.clustered_evidences = {
    i: [
        ev
        for ev, label in zip(example_claim.evidences, cluster_labels)
        if label == i
    ]
    for i in range(n_clusters)
}

# Visualize clusters
plot_path = "evidence_clusters.png"
visualize_clusters(
    evidence_embeddings_np,
    cluster_labels,
    f"Evidence Clusters for Claim {example_claim.claim_id}",
    plot_path,
)