## Setup

This notebook uses Python. Run cells in order from top to bottom.

In [32]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [33]:
from tqdm.notebook import tqdm
tqdm.pandas()

## Configuration

Edit the values in the cell below to match your data **before running the notebook**.

In [34]:
# =============================================================
# CONFIGURATION — Edit these values before running
# =============================================================

# Path to directory containing .txt files (one document per file)
DATA_DIR = "data/"

# --- Required: at least one text column ---
# NOTE: the Export cell renames TEXT_COL_PRIMARY to "title" and
#       TEXT_COL_SECONDARY to "text" in the visualization CSV.
TEXT_COL_PRIMARY   = "text"   # Content of each .txt file
TEXT_COL_SECONDARY = None     # No secondary column for plain text files

# --- Optional metadata columns ---
# Set any of these to None if your data doesn't include them
DATE_COL   = None  # No date column
DOMAIN_COL = None  # No domain column
URL_COL    = None  # No URL column

# --- Deduplication ---
# Columns to deduplicate on — set to [] to skip deduplication entirely
DEDUPE_COLS = []

# --- Clustering parameters (cosine distance, range 0–2) ---
# eps:         max cosine distance between two points in the same cluster
#              lower = tighter clusters (e.g. 0.2); higher = looser (e.g. 0.5)
#              the DBSCAN cell prints your data's distance percentiles to help pick a value
# min_samples: minimum number of documents required to form a cluster
DBSCAN_EPS         = 0.65
DBSCAN_MIN_SAMPLES = 4

# --- Output ---
OUTPUT_DIR = "output"   # Folder to save results (created automatically if it doesn't exist)

## Load Data & Remove Duplicates 🧹

In [35]:
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load data from .txt files — one document per file.
# Names are sorted so row order is the same on every run.
txt_names = sorted(name for name in os.listdir(DATA_DIR) if name.endswith(".txt"))
if not txt_names:
    # Stop here with a clear message: an empty, column-less DataFrame would
    # only fail later (and less readably) in the embedding cells.
    raise FileNotFoundError(
        f"No .txt files found in '{DATA_DIR}'. Check DATA_DIR in the Configuration cell."
    )

records = []
for fname in txt_names:
    fpath = os.path.join(DATA_DIR, fname)
    with open(fpath, "r", encoding="utf-8") as f:
        content = f.read().strip()
    # The filename without its extension identifies the document in later outputs.
    records.append({"filename": os.path.splitext(fname)[0], "text": content})

df = pd.DataFrame(records)
print(f"Loaded {len(df):,} documents from '{DATA_DIR}'")
df.head()

Loaded 67 documents from 'data/'


Unnamed: 0,filename,text
0,chunk_00,"Well, thank you very much, everybody. It's rea..."
1,chunk_01,And you've seen nothing yet. We're going to do...
2,chunk_02,[Laughter] The Biden administration and its al...
3,chunk_03,"In four long years, the last administration go..."
4,chunk_04,American oil production is up by more than 600...


Rows that belong to a duplicate group (every copy is listed; all but the last copy in each group will be removed):

In [36]:
# Preview every row that belongs to a duplicate group: keep=False marks all
# copies, and sorting on the dedupe columns places the copies next to each other.
if DEDUPE_COLS:
    dupes = df[df.duplicated(subset=DEDUPE_COLS, keep=False)].sort_values(by=DEDUPE_COLS)
    print(f"{len(dupes):,} duplicate rows found (columns used: {DEDUPE_COLS})")
    display(dupes.head())
else:
    print("DEDUPE_COLS is empty — skipping deduplication preview.")

DEDUPE_COLS is empty â€” skipping deduplication preview.


In [37]:
# Drop duplicates only when dedupe columns are configured; within each
# duplicate group the last occurrence is the one that is kept.
if DEDUPE_COLS:
    n_before = len(df)
    deduped = df.drop_duplicates(subset=DEDUPE_COLS, keep="last")
    df = deduped.reset_index(drop=True)
    n_removed = n_before - len(df)
    print(f"Removed {n_removed:,} duplicates. {len(df):,} rows remaining.")
df.head()

Unnamed: 0,filename,text
0,chunk_00,"Well, thank you very much, everybody. It's rea..."
1,chunk_01,And you've seen nothing yet. We're going to do...
2,chunk_02,[Laughter] The Biden administration and its al...
3,chunk_03,"In four long years, the last administration go..."
4,chunk_04,American oil production is up by more than 600...


## Embeddings

Generate a vector embedding for each row. **Run Option A *or* Option B — not both.**

| | Option A | Option B |
|---|---|---|
| **Model** | `all-MiniLM-L6-v2` (local) | `text-embedding-3-small` (OpenAI) |
| **Cost** | Free | ~$0.02 / million tokens |
| **Requires** | `sentence-transformers` package | API key in `.env` |
| **Download** | ~80 MB (first run only) | None |
| **Quality** | Good enough for clustering | Higher |

In [38]:
import tiktoken

# Name of the tiktoken encoding used to count tokens in each document.
ENCODING   = "cl100k_base"
# Token ceiling: a later cell drops rows whose combined text exceeds this count.
MAX_TOKENS = 8000
# Encoder looked up once here and reused for every row.
enc        = tiktoken.get_encoding(ENCODING)

def make_combined_text(row):
    """Build the string that will be embedded for one row.

    Returns the primary text on its own, or
    "Title: <primary>; Content: <secondary>" when a secondary column is
    configured, exists on the row, and holds a non-null value. A null primary
    value is treated as an empty string.
    """
    primary_value = row[TEXT_COL_PRIMARY]
    primary = "" if pd.isna(primary_value) else str(primary_value)

    # Without a configured secondary column that is present on this row,
    # the primary text is all there is to embed.
    has_secondary = bool(TEXT_COL_SECONDARY) and TEXT_COL_SECONDARY in row.index
    if not has_secondary or pd.isna(row[TEXT_COL_SECONDARY]):
        return primary

    secondary = str(row[TEXT_COL_SECONDARY])
    return f"Title: {primary}; Content: {secondary}"

# Build the text to embed, then count its tokens so over-long rows can be filtered out.
df["combined"] = df.apply(make_combined_text, axis=1)
# disallowed_special=() makes tiktoken treat strings such as "<|endoftext|>" as
# ordinary text; by default encode() raises ValueError when a document contains one.
df["n_tokens"] = df["combined"].apply(lambda x: len(enc.encode(x, disallowed_special=())))
# Longest documents first, so the preview shows the rows closest to the token limit.
df = df.sort_values("n_tokens", ascending=False)
print(f"Longest article: {df['n_tokens'].max():,} tokens | Average: {df['n_tokens'].mean():.0f} tokens")
df[["combined", "n_tokens"]].head()

Longest article: 552 tokens | Average: 193 tokens


Unnamed: 0,combined,n_tokens
63,How did that work out? Not too good. [Laughter...,552
56,They've already developed missiles that can th...,521
65,250 years is a long time in the life of a nati...,489
5,We ended DEI in America. We cut a record numbe...,440
9,"And this year, and I must say, I got them both...",430


In [39]:
# Report how many rows exceed the token limit, then keep only those within it.
over_limit = df["n_tokens"].gt(MAX_TOKENS)
too_long = df[over_limit]
print(f"Removing {len(too_long):,} articles exceeding {MAX_TOKENS:,} tokens.")
within_limit = df["n_tokens"].le(MAX_TOKENS)
df = df.loc[within_limit].reset_index(drop=True)
print(f"{len(df):,} rows remaining.")

Removing 0 articles exceeding 8,000 tokens.
67 rows remaining.


### Option A — Local model

No API key needed. Model downloads ~80 MB on first run, then works offline.

> Skip to Option B if you prefer to use the OpenAI API.

In [40]:
# # ── Option A: local model ──────────────────────────────────────────
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("all-MiniLM-L6-v2")  # downloads ~80 MB on first run

# df["embedding"] = model.encode(
#     df["combined"].tolist(),
#     show_progress_bar=True,
#     batch_size=64,
# ).tolist()

# print("Done.")

### Option B — OpenAI API

Higher quality embeddings. Requires an API key in `.env` and the `openai` + `python-dotenv` packages.

```bash
pip install openai python-dotenv
```

> Skip this cell if you already ran Option A above.

In [41]:
# ── Option B only ──────────────────────────────────────────────────
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file before the client is created.
load_dotenv()
# No key is passed explicitly; presumably the client picks up the API key
# from the environment — confirm against the openai package documentation.
client = OpenAI()

# Embedding model requested from the API by get_embeddings().
EMBEDDING_MODEL = "text-embedding-3-small"

In [42]:
# ── Option B: OpenAI embeddings ────────────────────────────────────
def get_embeddings(texts):
    """Request embeddings for one batch of texts in a single API call.

    Sends `texts` to the embeddings endpoint using EMBEDDING_MODEL and returns
    a list with one embedding per item of the response, in the order the
    response lists them.
    """
    response = client.embeddings.create(input=texts, model=EMBEDDING_MODEL)
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors

def process_in_batches(df, column, batch_size=30):
    """Embed every value of `df[column]`, `batch_size` texts per request.

    Values are sent to get_embeddings() in consecutive slices, with a tqdm
    progress bar over the batches. Returns a flat list of embeddings in the
    same order as the rows of `df`.
    """
    values = df[column].tolist()
    batch_starts = range(0, len(values), batch_size)
    embeddings = []
    for start in tqdm(batch_starts):
        chunk = values[start:start + batch_size]
        embeddings += get_embeddings(chunk)
    return embeddings

# Embed the combined text of every row and store the vectors on the frame.
row_count = len(df)
print(f"Generating embeddings for {row_count:,} rows...")
embeddings = process_in_batches(df, "combined")
df["embedding"] = embeddings
print("Done.")

Generating embeddings for 67 rows...


  0%|          | 0/3 [00:00<?, ?it/s]

Done.


In [43]:
# The helper columns were only needed to build and size the embedding input.
# errors="ignore" keeps this cell safe to re-run after they have already been dropped.
df = df.drop(columns=["combined", "n_tokens"], errors="ignore")
df.head()

Unnamed: 0,filename,text,embedding
0,chunk_63,How did that work out? Not too good. [Laughter...,"[0.042420320212841034, 0.014713354408740997, -..."
1,chunk_56,They've already developed missiles that can th...,"[0.016662893816828728, 0.030341805890202522, 0..."
2,chunk_65,250 years is a long time in the life of a nati...,"[0.04109662026166916, -0.0008324729278683662, ..."
3,chunk_05,We ended DEI in America. We cut a record numbe...,"[0.07365907728672028, -0.02919101156294346, -0..."
4,chunk_09,"And this year, and I must say, I got them both...","[0.02930293418467045, 0.01574314571917057, 0.0..."


## Dimensionality Reduction (t-SNE)

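Embeddings are high-dimensional vectors (1,536 dimensions for `text-embedding-3-small`, 384 for `all-MiniLM-L6-v2`). t-SNE projects them to 2D (x, y) for plotting.
This step is cached — re-running the notebook will load saved coordinates instead of recomputing. Delete `tsne-cache.csv` from the output folder if you change the input data or the embedding model.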

In [44]:
from sklearn.manifold import TSNE

tsne_cache = os.path.join(OUTPUT_DIR, "tsne-cache.csv")

# Cached coordinates are matched to rows by position, so they are reused only
# when they line up with the current data: same row count and, when the cache
# recorded them, the same filenames in the same order. A cache written in the
# older x/y-only format is accepted on row count alone.
# NOTE: the cache does not record which embedding model produced it — delete
# the file to force a recompute after switching models.
row_keys = df["filename"].astype(str).tolist() if "filename" in df.columns else None

coords = None
if os.path.exists(tsne_cache):
    # Read everything as text so filenames such as "001" or "NA" survive the round trip.
    cached = pd.read_csv(tsne_cache, dtype=str, keep_default_na=False)
    usable = {"x", "y"}.issubset(cached.columns) and len(cached) == len(df)
    if usable and row_keys is not None and "filename" in cached.columns:
        usable = cached["filename"].tolist() == row_keys
    if usable:
        coords = cached
    else:
        print(f"Cache '{tsne_cache}' does not match the current data; recomputing.")

if coords is not None:
    print(f"Loading cached t-SNE coordinates from '{tsne_cache}'")
    df["x"] = coords["x"].astype(float).values
    df["y"] = coords["y"].astype(float).values
else:
    print("Running t-SNE (this may take a few minutes on large datasets)...")
    matrix = np.array(df["embedding"].tolist())
    tsne = TSNE(
        n_components=2,
        # Scales with dataset size; floored at 1 because len(df) // 5 is 0
        # for fewer than five rows, which is not a usable perplexity.
        perplexity=max(1, min(30, len(df) // 5)),
        random_state=42,
        init="pca",           # stable, deterministic
        learning_rate="auto") # adapts to dataset size
    vis_dims = tsne.fit_transform(matrix)
    df["x"] = vis_dims[:, 0]
    df["y"] = vis_dims[:, 1]
    # Store the filenames next to the coordinates so a later run can verify the match.
    cache_cols = (["filename"] if row_keys is not None else []) + ["x", "y"]
    df[cache_cols].to_csv(tsne_cache, index=False)
    print(f"Done. Coordinates cached to '{tsne_cache}'")

Loading cached t-SNE coordinates from 'output/tsne-cache.csv'


In [45]:
# Preview the frame now that the x / y t-SNE coordinates have been added.
df.head()

Unnamed: 0,filename,text,embedding,x,y
0,chunk_63,How did that work out? Not too good. [Laughter...,"[0.042420320212841034, 0.014713354408740997, -...",13.182227,0.281404
1,chunk_56,They've already developed missiles that can th...,"[0.016662893816828728, 0.030341805890202522, 0...",2.473453,-0.294863
2,chunk_65,250 years is a long time in the life of a nati...,"[0.04109662026166916, -0.0008324729278683662, ...",-8.28146,-1.582805
3,chunk_05,We ended DEI in America. We cut a record numbe...,"[0.07365907728672028, -0.02919101156294346, -0...",8.946043,-4.424563
4,chunk_09,"And this year, and I must say, I got them both...","[0.02930293418467045, 0.01574314571917057, 0.0...",10.667722,2.498163


## Topic Modeling (DBSCAN)

[DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) clusters points in vector space — articles in the same cluster tend to be about the same topic.

- **Topic 0** = noise (articles that don't fit into any cluster; labeled "Uncategorized" in the outputs)
- Adjust `DBSCAN_EPS` and `DBSCAN_MIN_SAMPLES` in the Configuration cell if you get too few or too many clusters.

In [46]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances

# Use a clean 0..n-1 index, then stack the embeddings into one 2-D array.
df = df.reset_index(drop=True)
matrix = np.array(df["embedding"].tolist())

# Diagnostic: pairwise cosine-distance percentiles, to help tune eps.
# Self-distances on the diagonal are blanked out before summarizing.
dists = cosine_distances(matrix)
np.fill_diagonal(dists, np.nan)
flat = dists[~np.isnan(dists)]
print("Cosine distance percentiles (use these to set DBSCAN_EPS):")
percentile_levels = [10, 25, 50, 75, 90]
for p, value in zip(percentile_levels, np.percentile(flat, percentile_levels)):
    print(f"  {p}th percentile: {value:.3f}")

# Cluster on cosine distance using the parameters from the Configuration cell.
clusterer = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES, metric="cosine")
labels = clusterer.fit_predict(matrix)
df["topic"] = labels + 1   # shift so DBSCAN noise (-1) becomes topic 0

# Rows per topic, largest first.
topic_sizes = df.groupby("topic").size()
topic_counts = topic_sizes.reset_index(name="count").sort_values("count", ascending=False)

n_clusters = topic_counts["topic"].gt(0).sum()
n_noise    = topic_counts.loc[topic_counts["topic"].eq(0), "count"].sum()
print(f"\nFound {n_clusters} clusters + {n_noise:,} uncategorized (noise) articles")
topic_counts

Cosine distance percentiles (use these to set DBSCAN_EPS):
  10th percentile: 0.555
  25th percentile: 0.633
  50th percentile: 0.715
  75th percentile: 0.799
  90th percentile: 0.857

Found 1 clusters + 0 uncategorized (noise) articles


Unnamed: 0,topic,count
0,1,67


In [47]:
def _topic_name(topic_id):
    """Return the display label for a topic id; id 0 is the noise bucket."""
    if topic_id == 0:
        return "Uncategorized"
    return f"Topic {topic_id}"

df["topic_label"] = df["topic"].map(_topic_name)

# Rows per (topic, label) pair, largest first.
label_sizes = df.groupby(["topic", "topic_label"]).size()
summary = label_sizes.reset_index(name="count").sort_values("count", ascending=False)
summary

Unnamed: 0,topic,topic_label,count
0,1,Topic 1,67


## Export

In [48]:
# --- Full dataset (all columns, including embedding vectors) ---
full_path = os.path.join(OUTPUT_DIR, "data-with-embeddings.csv")
df.to_csv(full_path, index=False)
print(f"Full dataset    → '{full_path}'")

# --- Visualization dataset ---
# Renames columns to match the visualization tool's expected schema:
#   title, text, date, org, x, y, url
# Only columns that are configured (not None / empty) are renamed.
# NOTE(review): with TEXT_COL_SECONDARY unset, the document body is exported
# as "title" and no "text" column is written — confirm this is what the
# visualization tool expects.
column_roles = [
    (TEXT_COL_PRIMARY, "title"),
    (TEXT_COL_SECONDARY, "text"),
    (DATE_COL, "date"),
    (DOMAIN_COL, "org"),
    (URL_COL, "url"),
]
rename_map = {source: target for source, target in column_roles if source}

vis_df = df.rename(columns=rename_map)

# Keep only the schema columns that actually exist after renaming.
schema_cols = ["title", "text", "date", "org", "x", "y", "url", "topic_label"]
vis_cols = [c for c in schema_cols if c in vis_df.columns]

vis_path = os.path.join(OUTPUT_DIR, "data-for-visualization.csv")
vis_df[vis_cols].to_csv(vis_path, index=False)
print(f"Visualization   → '{vis_path}'")
print(f"\nColumns exported: {vis_cols}")

Full dataset    â†’ 'output/data-with-embeddings.csv'
Visualization   â†’ 'output/data-for-visualization.csv'

Columns exported: ['title', 'x', 'y', 'topic_label']


## Interactive Chart

In [49]:
import plotly.express as px
import textwrap

def wrap_hover(text, width=60):
    """Wrap `text` for a hover tooltip, joining the wrapped lines with "<br>".

    Null values (as judged by pd.isna) give an empty string; anything else is
    converted with str() and wrapped at `width` characters.
    """
    if pd.isna(text):
        return ""
    wrapper = textwrap.TextWrapper(width=width)
    return "<br>".join(wrapper.wrap(str(text)))

# Work on a copy so the hover-only helper columns never reach df.
plot_df = df.copy()
plot_df["_primary"] = plot_df[TEXT_COL_PRIMARY].apply(wrap_hover, width=60)

# Point header: the filename when available, otherwise the primary text column.
if "filename" in df.columns:
    hover_name_col = "filename"
else:
    hover_name_col = TEXT_COL_PRIMARY

# Show the wrapped text in the tooltip and hide the raw coordinates.
hover_data = {"_primary": True, "x": False, "y": False}
has_secondary = bool(TEXT_COL_SECONDARY) and TEXT_COL_SECONDARY in df.columns
if has_secondary:
    plot_df["_secondary"] = plot_df[TEXT_COL_SECONDARY].apply(wrap_hover, width=60)
    hover_data["_secondary"] = True

scatter_options = dict(
    x="x",
    y="y",
    color="topic_label",
    hover_name=hover_name_col,
    hover_data=hover_data,
    title="Semantic Map",
    width=900,
    height=700,
)
fig = px.scatter(plot_df, **scatter_options)

marker_style = dict(size=9, opacity=0.7)
fig.update_traces(marker=marker_style)
fig.update_layout(
    legend_title_text="Topic",
    hoverlabel=dict(namelength=-1),
)
fig.show()