In [268]:
# Article 1: fact sentences and the noun phrases listed for them
# (Boeing 737 Max crashes and the proposed non-prosecution deal).
article_1 = {
    "facts": [
        "Lion Air Flight 610 and Ethiopian Airlines Flight 302 both crashed while flying the Boeing 737 Max.",
        "The two crashes resulted in 346 deaths.",
        "Boeing avoided prosecution in 2021 through a deferred prosecution agreement with the U.S. Justice Department.",
        "In 2024, prosecutors said Boeing violated that agreement.",
        "A proposed plea deal in 2024 was rejected by a federal judge.",
        "The Justice Department is now considering a new deal that does not require Boeing to plead guilty.",
        "Families of crash victims were informed on 16 May 2025 and expressed strong opposition.",
        "Lawyer Sanjiv Singh criticized the deal for lacking accountability.",
        "Boeing’s share price fell slightly after the announcement.",
    ],
    "nouns": [
        "Lion Air Flight 610",
        "Ethiopian Airlines Flight 302",
        "Boeing 737 Max",
        "crashes",
        "346 deaths",
        "prosecution",
        "agreement",
        "Justice Department",
        "prosecutors",
        "plea deal",
        "judge",
        "families",
        "16 May 2025",
        "opposition",
        "lawyer",
        "Sanjiv Singh",
        "accountability",
        "share price",
        "announcement",
    ],
}


# Article 2: the same storyline as article 1 in different wording,
# with its own list of noun phrases.
article_2 = {
    "facts": [
        "Lion Air Flight 610 and Ethiopian Airlines Flight 302 crashed while operating the Boeing 737 Max model.",
        "Combined, the crashes led to the deaths of 346 individuals.",
        "In 2021, Boeing entered a deferred prosecution deal with the U.S. Justice Department to avoid criminal charges.",
        "Prosecutors alleged in 2024 that Boeing breached the terms of that agreement.",
        "A federal judge rejected a plea agreement with Boeing in 2024.",
        "Currently, the Justice Department is weighing a new agreement that avoids a guilty plea.",
        "Victims’ families were notified on 16 May 2025 and voiced serious concerns.",
        "Attorney Sanjiv Singh accused the new deal of failing to hold Boeing accountable.",
        "Following the news, Boeing’s stock experienced a slight drop.",
    ],
    "nouns": [
        "Lion Air Flight 610",
        "Ethiopian Airlines Flight 302",
        "Boeing 737 Max",
        "crashes",
        "346 deaths",
        "prosecution deal",
        "Justice Department",
        "criminal charges",
        "prosecutors",
        "agreement",
        "judge",
        "plea agreement",
        "families",
        "16 May 2025",
        "concerns",
        "Attorney",
        "Sanjiv Singh",
        "accountability",
        "Boeing’s stock",
        "news",
    ],
}

# Article 3: a third wording of the same storyline (shorter flight names,
# "DOJ" instead of "Justice Department") with its noun phrases.
article_3 = {
    "facts": [
        "Both Lion Air 610 and Ethiopian Airlines 302 crashed while using the Boeing 737 Max aircraft.",
        "The total death toll from the two accidents was 346.",
        "In 2021, Boeing secured a deferred prosecution agreement with U.S. authorities.",
        "Boeing was accused of violating the deal in 2024.",
        "A judge rejected a plea deal with Boeing due to procedural objections.",
        "The DOJ is now proposing a revised deal without requiring a guilty plea from Boeing.",
        "On 16 May 2025, victims' families were told of the potential new agreement.",
        "Sanjiv Singh, representing the families, said the deal lacked consequences for Boeing.",
        "Boeing’s shares declined marginally after the news broke.",
    ],
    "nouns": [
        "Lion Air 610",
        "Ethiopian Airlines 302",
        "Boeing 737 Max",
        "aircraft",
        "346 death toll",
        "accidents",
        "prosecution agreement",
        "U.S. authorities",
        "deal",
        "judge",
        "plea deal",
        "DOJ",
        "agreement",
        "16 May 2025",
        "families",
        "Sanjiv Singh",
        "consequences",
        "Boeing’s shares",
        "news",
    ],
}


# "Fake" article: follows the same storyline but with claims that differ from
# articles 1-3 (e.g. "over 400 people" instead of 346, "insider trading").
article_fake = {
    "facts": [
        "Both Lion Air 610 and Ethiopian Airlines 302 mysteriously crashed using the controversial Boeing 737 Max.",
        "Sources suggest over 400 people may have perished in the two disasters — though exact numbers remain disputed.",
        "In 2021, Boeing allegedly cut a secret deal with U.S. officials to dodge prosecution.",
        "By 2024, whistleblowers revealed Boeing broke the agreement behind closed doors.",
        "A federal judge threw out the plea deal, reportedly calling it a 'mockery of justice'.",
        "Now, the DOJ is preparing to quietly approve a backroom deal that clears Boeing without a guilty plea.",
        "Families were notified at the last minute on 16 May 2025, sparking outrage and protests.",
        "Attorney Sanjiv Singh blasted the agreement as a corporate cover-up to protect Boeing’s leadership.",
        "After public backlash, Boeing’s stock showed suspicious activity, dropping slightly amidst heavy insider trading.",
    ],
    "nouns": [
        "Lion Air 610",
        "Ethiopian Airlines 302",
        "Boeing 737 Max",
        "sources",
        "400 people",
        "disasters",
        "deal",
        "U.S. officials",
        "prosecution",
        "whistleblowers",
        "agreement",
        "federal judge",
        "plea deal",
        "DOJ",
        "backroom deal",
        "families",
        "16 May 2025",
        "outrage",
        "protests",
        "Attorney Sanjiv Singh",
        "corporate cover-up",
        "leadership",
        "Boeing’s stock",
        "insider trading",
        "public backlash",
    ],
}


In [269]:
from openai import OpenAI
from getpass import getpass
import json
import pandas as pd


In [270]:
# Read the API key interactively (getpass does not echo the input), so the key
# is not written into the notebook source.
# NOTE(review): `client` is not used by any cell visible in this section —
# confirm a later cell needs it, otherwise this prompt can be dropped.
openai_key = getpass("Enter your API Key:")
client = OpenAI(api_key=openai_key)

In [286]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import numpy as np

# Embedding model passed to embed_and_build_dataframes for both facts and nouns.
# Currently the mpnet checkpoint; change `model_name` to try another model.
encoder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


In [287]:
import numpy as np
import pandas as pd

def _embed_unit_rows(article_id, texts, encoder):
    """Embed ``texts`` and return one row dict per text with a unit-length vector."""
    texts = list(texts)
    if not texts:
        # Nothing to embed; also avoids sending an empty batch to the encoder.
        return []

    matrix = np.asarray(encoder.embed_documents(texts), dtype=float)
    if matrix.ndim != 2 or len(matrix) != len(texts):
        # zip() would otherwise silently drop or misalign texts.
        raise ValueError(
            f"encoder returned embeddings of shape {matrix.shape} for {len(texts)} texts"
        )

    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # A zero vector has norm 0; divide by 1 instead so it stays a zero vector
    # rather than turning into NaNs.
    norms[norms == 0] = 1.0
    unit_vectors = matrix / norms

    return [
        {"article": article_id, "text": text, "embedding": vec}
        for text, vec in zip(texts, unit_vectors)
    ]


def embed_and_build_dataframes(articles, encoder):
    """Embed every fact and noun of every article into two long-format tables.

    Args:
        articles (dict): Maps an article id to a dict with the keys ``"facts"``
            and ``"nouns"``, each a list of strings.
        encoder: Object exposing ``embed_documents(list_of_str)`` that returns
            one vector per input string (the only method used here).

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(df_facts, df_nouns)``. Each frame
        has the columns ``article``, ``text`` and ``embedding``; embeddings are
        L2-normalised NumPy arrays (all-zero vectors are kept as zeros). The
        columns are present even when there are no rows.

    Raises:
        ValueError: If the encoder does not return exactly one vector per text.
    """
    columns = ["article", "text", "embedding"]
    fact_rows = []
    noun_rows = []

    for article_id, article in articles.items():
        # Facts first, then nouns, so the encoder is called in the same order
        # for every article.
        fact_rows.extend(_embed_unit_rows(article_id, article["facts"], encoder))
        noun_rows.extend(_embed_unit_rows(article_id, article["nouns"], encoder))

    # Passing `columns` keeps the schema stable for empty inputs.
    df_facts = pd.DataFrame(fact_rows, columns=columns)
    df_nouns = pd.DataFrame(noun_rows, columns=columns)

    return df_facts, df_nouns



In [288]:
# Requires `encoder` to be defined first. Any object exposing
# `embed_documents(list_of_str)` works — that is the only method called on it.

articles = {
    "article_1": article_1,
    "article_2": article_2,
    "article_3": article_3,
    "article_fake": article_fake
}

# One row per fact / noun, tagged with the article id used as the key above.
df_facts, df_nouns = embed_and_build_dataframes(articles, encoder)

In [289]:
# Preview the fact table: article id, fact text and its normalised embedding.
df_facts.head()

Unnamed: 0,article,text,embedding
0,article_1,Lion Air Flight 610 and Ethiopian Airlines Fli...,"[0.017989880037550303, 0.0034294848938893198, ..."
1,article_1,The two crashes resulted in 346 deaths.,"[-0.02515258186733664, -0.0055169144164861, 0...."
2,article_1,Boeing avoided prosecution in 2021 through a d...,"[0.005384550969078203, 0.08753159761760608, -0..."
3,article_1,"In 2024, prosecutors said Boeing violated that...","[-0.0016094101063909385, 0.12598705788964218, ..."
4,article_1,A proposed plea deal in 2024 was rejected by a...,"[0.04251273521636891, 0.07212895560988376, -0...."


In [291]:
# Preview the noun table: article id, noun phrase and its normalised embedding.
df_nouns.head()

Unnamed: 0,article,text,embedding
0,article_1,Lion Air Flight 610,"[0.022198906047796774, 0.06501072609885147, 0...."
1,article_1,Ethiopian Airlines Flight 302,"[0.03356577536199707, 0.040569984392137534, 0...."
2,article_1,Boeing 737 Max,"[0.011621095251986896, -0.0727650002388372, 0...."
3,article_1,crashes,"[-0.0322143780828792, -0.02975331740889792, -0..."
4,article_1,346 deaths,"[0.0024855463393623173, 0.0684266403816035, 0...."


In [292]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

def run_dbscan(df, eps=0.5, min_samples=2):
    """
    Run DBSCAN on a DataFrame with 'embedding' column containing vectors.

    The cosine metric is used, so ``eps`` is a cosine-distance threshold
    (1 - cosine similarity), not a Euclidean radius.

    Args:
        df (pd.DataFrame): DataFrame with 'embedding' column (vectors of equal length).
        eps (float): DBSCAN epsilon parameter (maximum cosine distance between neighbours).
        min_samples (int): DBSCAN min_samples parameter.

    Returns:
        pd.DataFrame: Copy of the input with an added integer 'cluster' column
        (DBSCAN marks noise points with -1). The input frame is not modified.
        An empty input yields an empty copy that still has the 'cluster' column.
    """
    df_with_clusters = df.copy()

    if len(df) == 0:
        # np.vstack raises ValueError on an empty sequence; there is nothing to
        # cluster, so return the empty frame with the expected schema.
        df_with_clusters["cluster"] = np.empty(0, dtype=np.intp)
        return df_with_clusters

    # Stack the per-row vectors into an (n_rows, dim) matrix.
    X = np.vstack(df["embedding"].to_numpy())

    # Run DBSCAN
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    clusters = db.fit_predict(X)

    # Labels are positional, matching the row order of the input.
    df_with_clusters["cluster"] = clusters

    return df_with_clusters


In [305]:
# DBSCAN clustering: facts and nouns are clustered separately, each with its
# own eps (cosine distance, see run_dbscan).
# NOTE(review): with min_samples=1 DBSCAN should never assign the noise label
# -1, so every row lands in some cluster — confirm that is intended.
df_facts_clustered = run_dbscan(df_facts, eps=0.5, min_samples=1)
df_nouns_clustered = run_dbscan(df_nouns, eps=0.4, min_samples=1)

In [306]:
# List the distinct cluster labels assigned to the facts, then preview the rows.
print(f"Unique clusters found for facts: {df_facts_clustered['cluster'].unique()}")
df_facts_clustered.head()

Unqiue clusters found for facts: [0 1 2 3]


Unnamed: 0,article,text,embedding,cluster
0,article_1,Lion Air Flight 610 and Ethiopian Airlines Fli...,"[0.017989880037550303, 0.0034294848938893198, ...",0
1,article_1,The two crashes resulted in 346 deaths.,"[-0.02515258186733664, -0.0055169144164861, 0....",1
2,article_1,Boeing avoided prosecution in 2021 through a d...,"[0.005384550969078203, 0.08753159761760608, -0...",2
3,article_1,"In 2024, prosecutors said Boeing violated that...","[-0.0016094101063909385, 0.12598705788964218, ...",2
4,article_1,A proposed plea deal in 2024 was rejected by a...,"[0.04251273521636891, 0.07212895560988376, -0....",2


In [307]:
# List the distinct cluster labels assigned to the nouns, then preview the rows.
print(f"Unique clusters found for nouns: {df_nouns_clustered['cluster'].unique()}")
df_nouns_clustered.head()

Unqiue clusters found for nouns: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]


Unnamed: 0,article,text,embedding,cluster
0,article_1,Lion Air Flight 610,"[0.022198906047796774, 0.06501072609885147, 0....",0
1,article_1,Ethiopian Airlines Flight 302,"[0.03356577536199707, 0.040569984392137534, 0....",0
2,article_1,Boeing 737 Max,"[0.011621095251986896, -0.0727650002388372, 0....",1
3,article_1,crashes,"[-0.0322143780828792, -0.02975331740889792, -0...",2
4,article_1,346 deaths,"[0.0024855463393623173, 0.0684266403816035, 0....",3


In [308]:
# NOTE(review): repeats the df_nouns_clustered preview shown by the previous
# cell — candidate for removal.
df_nouns_clustered.head()

Unnamed: 0,article,text,embedding,cluster
0,article_1,Lion Air Flight 610,"[0.022198906047796774, 0.06501072609885147, 0....",0
1,article_1,Ethiopian Airlines Flight 302,"[0.03356577536199707, 0.040569984392137534, 0....",0
2,article_1,Boeing 737 Max,"[0.011621095251986896, -0.0727650002388372, 0....",1
3,article_1,crashes,"[-0.0322143780828792, -0.02975331740889792, -0...",2
4,article_1,346 deaths,"[0.0024855463393623173, 0.0684266403816035, 0....",3


In [309]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def _summarise_clusters(df):
    """Return parallel lists (labels, centroids, texts), one entry per cluster in ``df``."""
    labels, centroids, texts = [], [], []
    for label, group in df.groupby("cluster"):
        labels.append(label)
        # Centroid = plain mean of the member embeddings.
        centroids.append(np.mean(np.vstack(group["embedding"].tolist()), axis=0))
        texts.append(group["text"].tolist())
    return labels, centroids, texts


def _unit_rows(matrix):
    """L2-normalise each row of ``matrix``; all-zero rows are left as zeros."""
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return matrix / norms


def associate_noun_to_fact_clusters_df(df_nouns, df_facts):
    """Link each noun cluster to the fact cluster with the most similar centroid.

    Both inputs need the columns ``cluster``, ``embedding`` and ``text``. Rows
    labelled ``-1`` (noise) are ignored. Similarity is the cosine similarity
    between cluster centroids (mean embedding of the cluster members).

    Args:
        df_nouns (pd.DataFrame): Clustered noun table.
        df_facts (pd.DataFrame): Clustered fact table.

    Returns:
        pd.DataFrame: One row per noun cluster with the columns
        ``noun_cluster``, ``fact_cluster``, ``nouns`` (texts in the noun
        cluster), ``facts`` (texts in the chosen fact cluster) and
        ``similarity_score``. Ties go to the lowest fact-cluster label. When
        there are no fact clusters at all, ``fact_cluster`` is ``None``,
        ``facts`` is empty and ``similarity_score`` is ``-1``. The columns are
        present even when there are no noun clusters.
    """
    columns = ["noun_cluster", "fact_cluster", "nouns", "facts", "similarity_score"]

    # Remove noise points
    df_nouns = df_nouns[df_nouns["cluster"] != -1]
    df_facts = df_facts[df_facts["cluster"] != -1]

    noun_labels, noun_centroids, noun_texts = _summarise_clusters(df_nouns)
    fact_labels, fact_centroids, fact_texts = _summarise_clusters(df_facts)

    if not noun_labels:
        return pd.DataFrame(columns=columns)

    if fact_labels:
        # All pairwise cosine similarities in one matrix product:
        # shape (n_noun_clusters, n_fact_clusters).
        similarity = _unit_rows(np.vstack(noun_centroids)) @ _unit_rows(np.vstack(fact_centroids)).T
        # argmax always selects a fact cluster, even when the best similarity
        # is exactly -1, and returns the first maximum on ties.
        best_fact_positions = similarity.argmax(axis=1)

    association_rows = []
    for row_idx, noun_cluster in enumerate(noun_labels):
        if fact_labels:
            best_pos = int(best_fact_positions[row_idx])
            best_fact_cluster = fact_labels[best_pos]
            matched_facts = fact_texts[best_pos]
            best_score = float(similarity[row_idx, best_pos])
        else:
            # No fact cluster to compare against.
            best_fact_cluster = None
            matched_facts = []
            best_score = -1.0

        association_rows.append({
            "noun_cluster": noun_cluster,
            "fact_cluster": best_fact_cluster,
            "nouns": noun_texts[row_idx],
            "facts": matched_facts,
            "similarity_score": best_score
        })

    return pd.DataFrame(association_rows, columns=columns)



In [310]:
# Link every noun cluster to its most similar fact cluster and show the full table.
df_cluster_associations = associate_noun_to_fact_clusters_df(df_nouns_clustered, df_facts_clustered)
df_cluster_associations

Unnamed: 0,noun_cluster,fact_cluster,nouns,facts,similarity_score
0,0,0,"[Lion Air Flight 610, Ethiopian Airlines Fligh...",[Lion Air Flight 610 and Ethiopian Airlines Fl...,0.650021
1,1,0,"[Boeing 737 Max, Boeing 737 Max, Boeing 737 Ma...",[Lion Air Flight 610 and Ethiopian Airlines Fl...,0.596703
2,2,0,"[crashes, crashes]",[Lion Air Flight 610 and Ethiopian Airlines Fl...,0.341143
3,3,1,"[346 deaths, 346 deaths, 346 death toll]","[The two crashes resulted in 346 deaths., Comb...",0.770013
4,4,2,"[prosecution, Justice Department, prosecutors,...",[Boeing avoided prosecution in 2021 through a ...,0.412658
5,5,2,"[agreement, agreement, agreement, agreement]",[Boeing avoided prosecution in 2021 through a ...,0.201381
6,6,2,"[judge, judge, judge, federal judge]",[Boeing avoided prosecution in 2021 through a ...,0.225317
7,7,3,"[families, families, families, families]",[Families of crash victims were informed on 16...,0.336359
8,8,3,"[16 May 2025, 16 May 2025, 16 May 2025, 16 May...",[Families of crash victims were informed on 16...,0.305906
9,9,3,[opposition],[Families of crash victims were informed on 16...,0.241152
